xref: /llvm-project/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=-flat-for-global -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GFX9 %s
3; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX803 %s
4; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7 %s
5
6
; s_pack_v2i16: both halves come from volatile i32 loads through addrspace(4)
; pointers. Each value is truncated to i16 and inserted into a <2 x i16>
; (%lo -> element 0, %hi -> element 1); the vector is bitcast to i32 and
; consumed by inline asm through an "s" constraint.
; The GFX9 checks expect a single s_pack_ll_b32_b16; the GFX803 and GFX7
; checks expect the s_and_b32 0xffff / s_lshl_b32 16 / s_or_b32 sequence.
; NOTE(review): LangRef advises poison instead of undef where possible; the
; insertelement base below could presumably be migrated - regenerate the
; checks to confirm nothing changes.
7define amdgpu_kernel void @s_pack_v2i16(ptr addrspace(4) %in0, ptr addrspace(4) %in1) #0 {
8; GFX9-LABEL: s_pack_v2i16:
9; GFX9:       ; %bb.0:
10; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
11; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
12; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0
13; GFX9-NEXT:    s_load_dword s5, s[2:3], 0x0
14; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
15; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s4, s5
16; GFX9-NEXT:    ;;#ASMSTART
17; GFX9-NEXT:    ; use s0
18; GFX9-NEXT:    ;;#ASMEND
19; GFX9-NEXT:    s_endpgm
20;
21; GFX803-LABEL: s_pack_v2i16:
22; GFX803:       ; %bb.0:
23; GFX803-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
24; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
25; GFX803-NEXT:    s_load_dword s0, s[0:1], 0x0
26; GFX803-NEXT:    s_load_dword s1, s[2:3], 0x0
27; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
28; GFX803-NEXT:    s_and_b32 s0, s0, 0xffff
29; GFX803-NEXT:    s_lshl_b32 s1, s1, 16
30; GFX803-NEXT:    s_or_b32 s0, s0, s1
31; GFX803-NEXT:    ;;#ASMSTART
32; GFX803-NEXT:    ; use s0
33; GFX803-NEXT:    ;;#ASMEND
34; GFX803-NEXT:    s_endpgm
35;
36; GFX7-LABEL: s_pack_v2i16:
37; GFX7:       ; %bb.0:
38; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
39; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
40; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
41; GFX7-NEXT:    s_load_dword s1, s[2:3], 0x0
42; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
43; GFX7-NEXT:    s_and_b32 s0, s0, 0xffff
44; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
45; GFX7-NEXT:    s_or_b32 s0, s0, s1
46; GFX7-NEXT:    ;;#ASMSTART
47; GFX7-NEXT:    ; use s0
48; GFX7-NEXT:    ;;#ASMEND
49; GFX7-NEXT:    s_endpgm
50  %val0 = load volatile i32, ptr addrspace(4) %in0
51  %val1 = load volatile i32, ptr addrspace(4) %in1
52  %lo = trunc i32 %val0 to i16
53  %hi = trunc i32 %val1 to i16
54  %vec.0 = insertelement <2 x i16> undef, i16 %lo, i32 0
55  %vec.1 = insertelement <2 x i16> %vec.0, i16 %hi, i32 1
56  %vec.i32 = bitcast <2 x i16> %vec.1 to i32
57
; The side-effecting asm is the only user of the packed value.
58  call void asm sideeffect "; use $0", "s"(i32 %vec.i32) #0
59  ret void
60}
61
; s_pack_v2i16_imm_lo: element 0 of the <2 x i16> is the constant 456 (printed
; as 0x1c8 in the checks); element 1 is the truncated result of a non-volatile
; i32 load through an addrspace(4) pointer. The packed i32 is consumed by
; inline asm through an "s" constraint.
; The GFX9 checks expect s_pack_ll_b32_b16 with the literal as the first source
; operand; the GFX803 and GFX7 checks expect s_lshl_b32 by 16 followed by
; s_or_b32 with 0x1c8.
62define amdgpu_kernel void @s_pack_v2i16_imm_lo(ptr addrspace(4) %in1) #0 {
63; GFX9-LABEL: s_pack_v2i16_imm_lo:
64; GFX9:       ; %bb.0:
65; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
66; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
67; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x0
68; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
69; GFX9-NEXT:    s_pack_ll_b32_b16 s0, 0x1c8, s0
70; GFX9-NEXT:    ;;#ASMSTART
71; GFX9-NEXT:    ; use s0
72; GFX9-NEXT:    ;;#ASMEND
73; GFX9-NEXT:    s_endpgm
74;
75; GFX803-LABEL: s_pack_v2i16_imm_lo:
76; GFX803:       ; %bb.0:
77; GFX803-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
78; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
79; GFX803-NEXT:    s_load_dword s0, s[0:1], 0x0
80; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
81; GFX803-NEXT:    s_lshl_b32 s0, s0, 16
82; GFX803-NEXT:    s_or_b32 s0, s0, 0x1c8
83; GFX803-NEXT:    ;;#ASMSTART
84; GFX803-NEXT:    ; use s0
85; GFX803-NEXT:    ;;#ASMEND
86; GFX803-NEXT:    s_endpgm
87;
88; GFX7-LABEL: s_pack_v2i16_imm_lo:
89; GFX7:       ; %bb.0:
90; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
91; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
92; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
93; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
94; GFX7-NEXT:    s_lshl_b32 s0, s0, 16
95; GFX7-NEXT:    s_or_b32 s0, s0, 0x1c8
96; GFX7-NEXT:    ;;#ASMSTART
97; GFX7-NEXT:    ; use s0
98; GFX7-NEXT:    ;;#ASMEND
99; GFX7-NEXT:    s_endpgm
100  %val1 = load i32, ptr addrspace(4) %in1
101  %hi = trunc i32 %val1 to i16
; Constant goes in the low element, loaded value in the high element.
102  %vec.0 = insertelement <2 x i16> undef, i16 456, i32 0
103  %vec.1 = insertelement <2 x i16> %vec.0, i16 %hi, i32 1
104  %vec.i32 = bitcast <2 x i16> %vec.1 to i32
105
106  call void asm sideeffect "; use $0", "s"(i32 %vec.i32) #0
107  ret void
108}
109
; s_pack_v2i16_imm_hi: element 0 of the <2 x i16> is the truncated result of a
; non-volatile i32 load through an addrspace(4) pointer; element 1 is the
; constant 456. The packed i32 is consumed by inline asm through an "s"
; constraint.
; The GFX9 checks expect s_pack_ll_b32_b16 with the literal 0x1c8 as the second
; source operand; the GFX803 and GFX7 checks expect s_and_b32 with 0xffff
; followed by s_or_b32 with 0x1c80000 (the constant already placed in the
; upper 16 bits).
110define amdgpu_kernel void @s_pack_v2i16_imm_hi(ptr addrspace(4) %in0) #0 {
111; GFX9-LABEL: s_pack_v2i16_imm_hi:
112; GFX9:       ; %bb.0:
113; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
114; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
115; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x0
116; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
117; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, 0x1c8
118; GFX9-NEXT:    ;;#ASMSTART
119; GFX9-NEXT:    ; use s0
120; GFX9-NEXT:    ;;#ASMEND
121; GFX9-NEXT:    s_endpgm
122;
123; GFX803-LABEL: s_pack_v2i16_imm_hi:
124; GFX803:       ; %bb.0:
125; GFX803-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
126; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
127; GFX803-NEXT:    s_load_dword s0, s[0:1], 0x0
128; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
129; GFX803-NEXT:    s_and_b32 s0, s0, 0xffff
130; GFX803-NEXT:    s_or_b32 s0, s0, 0x1c80000
131; GFX803-NEXT:    ;;#ASMSTART
132; GFX803-NEXT:    ; use s0
133; GFX803-NEXT:    ;;#ASMEND
134; GFX803-NEXT:    s_endpgm
135;
136; GFX7-LABEL: s_pack_v2i16_imm_hi:
137; GFX7:       ; %bb.0:
138; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
139; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
140; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
141; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
142; GFX7-NEXT:    s_and_b32 s0, s0, 0xffff
143; GFX7-NEXT:    s_or_b32 s0, s0, 0x1c80000
144; GFX7-NEXT:    ;;#ASMSTART
145; GFX7-NEXT:    ; use s0
146; GFX7-NEXT:    ;;#ASMEND
147; GFX7-NEXT:    s_endpgm
148  %val0 = load i32, ptr addrspace(4) %in0
149  %lo = trunc i32 %val0 to i16
; Loaded value goes in the low element, constant in the high element.
150  %vec.0 = insertelement <2 x i16> undef, i16 %lo, i32 0
151  %vec.1 = insertelement <2 x i16> %vec.0, i16 456, i32 1
152  %vec.i32 = bitcast <2 x i16> %vec.1 to i32
153
154  call void asm sideeffect "; use $0", "s"(i32 %vec.i32) #0
155  ret void
156}
157
; v_pack_v2i16: each workitem indexes both addrspace(1) arrays with the
; sign-extended llvm.amdgcn.workitem.id.x value, performs two volatile i32
; loads, truncates each to i16 and packs them into a <2 x i16> (%lo -> element
; 0, %hi -> element 1). The bitcast i32 is consumed by inline asm through a
; "v" constraint.
; The GFX9 and GFX803 checks expect one v_perm_b32 whose selector is
; materialized in s0 (0x5040100 on GFX9, 0x1000504 on GFX803, with the data
; operands in a different order); the GFX7 checks expect the v_and_b32 0xffff /
; v_lshlrev_b32 16 / v_or_b32 sequence.
158define amdgpu_kernel void @v_pack_v2i16(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
159; GFX9-LABEL: v_pack_v2i16:
160; GFX9:       ; %bb.0:
161; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
162; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
163; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
164; GFX9-NEXT:    global_load_dword v1, v0, s[0:1] glc
165; GFX9-NEXT:    s_waitcnt vmcnt(0)
166; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] glc
167; GFX9-NEXT:    s_waitcnt vmcnt(0)
168; GFX9-NEXT:    s_mov_b32 s0, 0x5040100
169; GFX9-NEXT:    v_perm_b32 v0, v2, v1, s0
170; GFX9-NEXT:    ;;#ASMSTART
171; GFX9-NEXT:    ; use v0
172; GFX9-NEXT:    ;;#ASMEND
173; GFX9-NEXT:    s_endpgm
174;
175; GFX803-LABEL: v_pack_v2i16:
176; GFX803:       ; %bb.0:
177; GFX803-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
178; GFX803-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
179; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
180; GFX803-NEXT:    v_mov_b32_e32 v1, s1
181; GFX803-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
182; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
183; GFX803-NEXT:    v_mov_b32_e32 v3, s3
184; GFX803-NEXT:    v_add_u32_e32 v2, vcc, s2, v2
185; GFX803-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
186; GFX803-NEXT:    flat_load_dword v0, v[0:1] glc
187; GFX803-NEXT:    s_waitcnt vmcnt(0)
188; GFX803-NEXT:    flat_load_dword v1, v[2:3] glc
189; GFX803-NEXT:    s_waitcnt vmcnt(0)
190; GFX803-NEXT:    s_mov_b32 s0, 0x1000504
191; GFX803-NEXT:    v_perm_b32 v0, v0, v1, s0
192; GFX803-NEXT:    ;;#ASMSTART
193; GFX803-NEXT:    ; use v0
194; GFX803-NEXT:    ;;#ASMEND
195; GFX803-NEXT:    s_endpgm
196;
197; GFX7-LABEL: v_pack_v2i16:
198; GFX7:       ; %bb.0:
199; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
200; GFX7-NEXT:    s_mov_b32 s7, 0x100f000
201; GFX7-NEXT:    s_mov_b32 s6, 0
202; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
203; GFX7-NEXT:    v_mov_b32_e32 v1, 0
204; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
205; GFX7-NEXT:    s_mov_b64 s[4:5], s[0:1]
206; GFX7-NEXT:    s_mov_b64 s[0:1], s[2:3]
207; GFX7-NEXT:    s_mov_b64 s[2:3], s[6:7]
208; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
209; GFX7-NEXT:    s_waitcnt vmcnt(0)
210; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
211; GFX7-NEXT:    s_waitcnt vmcnt(0)
212; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v2
213; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
214; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
215; GFX7-NEXT:    ;;#ASMSTART
216; GFX7-NEXT:    ; use v0
217; GFX7-NEXT:    ;;#ASMEND
218; GFX7-NEXT:    s_endpgm
; Per-workitem element addresses in both input arrays.
219  %tid = call i32 @llvm.amdgcn.workitem.id.x()
220  %tid.ext = sext i32 %tid to i64
221  %in0.gep = getelementptr inbounds i32, ptr addrspace(1) %in0, i64 %tid.ext
222  %in1.gep = getelementptr inbounds i32, ptr addrspace(1) %in1, i64 %tid.ext
223  %val0 = load volatile i32, ptr addrspace(1) %in0.gep
224  %val1 = load volatile i32, ptr addrspace(1) %in1.gep
225  %lo = trunc i32 %val0 to i16
226  %hi = trunc i32 %val1 to i16
227  %vec.0 = insertelement <2 x i16> undef, i16 %lo, i32 0
228  %vec.1 = insertelement <2 x i16> %vec.0, i16 %hi, i32 1
229  %vec.i32 = bitcast <2 x i16> %vec.1 to i32
230  call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
231  ret void
232}
233
; v_pack_v2i16_user: same per-workitem volatile loads and <2 x i16> pack as the
; inline-asm variants, but here the packed i32 has an ordinary IR user: 9 is
; added to it and the sum is stored with a volatile store.
; The checks expect the pack (v_perm_b32 on GFX9 and GFX803, v_and_b32 /
; v_lshlrev_b32 / v_or_b32 on GFX7) to be followed by an add of 9 and a
; buffer_store_dword.
; NOTE(review): the store address is an undef addrspace(1) pointer, so the
; checked store has no meaningful base; LangRef advises poison instead of undef
; where possible - regenerate the checks if this is migrated.
234define amdgpu_kernel void @v_pack_v2i16_user(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
235; GFX9-LABEL: v_pack_v2i16_user:
236; GFX9:       ; %bb.0:
237; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
238; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
239; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
240; GFX9-NEXT:    global_load_dword v1, v0, s[0:1] glc
241; GFX9-NEXT:    s_waitcnt vmcnt(0)
242; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] glc
243; GFX9-NEXT:    s_waitcnt vmcnt(0)
244; GFX9-NEXT:    s_mov_b32 s0, 0x5040100
245; GFX9-NEXT:    s_mov_b32 s3, 0xf000
246; GFX9-NEXT:    s_mov_b32 s2, -1
247; GFX9-NEXT:    v_perm_b32 v0, v2, v1, s0
248; GFX9-NEXT:    v_add_u32_e32 v0, 9, v0
249; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
250; GFX9-NEXT:    s_waitcnt vmcnt(0)
251; GFX9-NEXT:    s_endpgm
252;
253; GFX803-LABEL: v_pack_v2i16_user:
254; GFX803:       ; %bb.0:
255; GFX803-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
256; GFX803-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
257; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
258; GFX803-NEXT:    v_mov_b32_e32 v1, s1
259; GFX803-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
260; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
261; GFX803-NEXT:    v_mov_b32_e32 v3, s3
262; GFX803-NEXT:    v_add_u32_e32 v2, vcc, s2, v2
263; GFX803-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
264; GFX803-NEXT:    flat_load_dword v0, v[0:1] glc
265; GFX803-NEXT:    s_waitcnt vmcnt(0)
266; GFX803-NEXT:    flat_load_dword v1, v[2:3] glc
267; GFX803-NEXT:    s_waitcnt vmcnt(0)
268; GFX803-NEXT:    s_mov_b32 s0, 0x1000504
269; GFX803-NEXT:    s_mov_b32 s3, 0x1100f000
270; GFX803-NEXT:    s_mov_b32 s2, -1
271; GFX803-NEXT:    v_perm_b32 v0, v0, v1, s0
272; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 9, v0
273; GFX803-NEXT:    buffer_store_dword v0, off, s[0:3], 0
274; GFX803-NEXT:    s_waitcnt vmcnt(0)
275; GFX803-NEXT:    s_endpgm
276;
277; GFX7-LABEL: v_pack_v2i16_user:
278; GFX7:       ; %bb.0:
279; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
280; GFX7-NEXT:    s_mov_b32 s6, 0
281; GFX7-NEXT:    s_mov_b32 s7, 0x100f000
282; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
283; GFX7-NEXT:    v_mov_b32_e32 v1, 0
284; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
285; GFX7-NEXT:    s_mov_b64 s[4:5], s[0:1]
286; GFX7-NEXT:    s_mov_b64 s[0:1], s[2:3]
287; GFX7-NEXT:    s_mov_b64 s[2:3], s[6:7]
288; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
289; GFX7-NEXT:    s_waitcnt vmcnt(0)
290; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
291; GFX7-NEXT:    s_waitcnt vmcnt(0)
292; GFX7-NEXT:    s_mov_b32 s6, -1
293; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v2
294; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
295; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
296; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 9, v0
297; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
298; GFX7-NEXT:    s_waitcnt vmcnt(0)
299; GFX7-NEXT:    s_endpgm
300  %tid = call i32 @llvm.amdgcn.workitem.id.x()
301  %tid.ext = sext i32 %tid to i64
302  %in0.gep = getelementptr inbounds i32, ptr addrspace(1) %in0, i64 %tid.ext
303  %in1.gep = getelementptr inbounds i32, ptr addrspace(1) %in1, i64 %tid.ext
304  %val0 = load volatile i32, ptr addrspace(1) %in0.gep
305  %val1 = load volatile i32, ptr addrspace(1) %in1.gep
306  %lo = trunc i32 %val0 to i16
307  %hi = trunc i32 %val1 to i16
308  %vec.0 = insertelement <2 x i16> undef, i16 %lo, i32 0
309  %vec.1 = insertelement <2 x i16> %vec.0, i16 %hi, i32 1
310  %vec.i32 = bitcast <2 x i16> %vec.1 to i32
; Arithmetic user of the packed value, kept alive by the volatile store.
311  %foo = add i32 %vec.i32, 9
312  store volatile i32 %foo, ptr addrspace(1) undef
313  ret void
314}
315
; v_pack_v2i16_imm_lo: element 0 of the <2 x i16> is the constant 123 (printed
; as 0x7b in the checks); element 1 is the truncated result of a per-workitem
; volatile i32 load from the addrspace(1) array. The packed i32 is consumed by
; inline asm through a "v" constraint.
; The GFX9 checks expect the constant to be materialized with s_movk_i32 and
; combined by v_perm_b32 (selector 0x5040100 held in v1); the GFX803 and GFX7
; checks expect v_lshlrev_b32 by 16 followed by v_or_b32 with 0x7b.
316define amdgpu_kernel void @v_pack_v2i16_imm_lo(ptr addrspace(1) %in1) #0 {
317; GFX9-LABEL: v_pack_v2i16_imm_lo:
318; GFX9:       ; %bb.0:
319; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
320; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
321; GFX9-NEXT:    v_mov_b32_e32 v1, 0x5040100
322; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
323; GFX9-NEXT:    global_load_dword v0, v0, s[0:1] glc
324; GFX9-NEXT:    s_waitcnt vmcnt(0)
325; GFX9-NEXT:    s_movk_i32 s0, 0x7b
326; GFX9-NEXT:    v_perm_b32 v0, v0, s0, v1
327; GFX9-NEXT:    ;;#ASMSTART
328; GFX9-NEXT:    ; use v0
329; GFX9-NEXT:    ;;#ASMEND
330; GFX9-NEXT:    s_endpgm
331;
332; GFX803-LABEL: v_pack_v2i16_imm_lo:
333; GFX803:       ; %bb.0:
334; GFX803-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
335; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
336; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
337; GFX803-NEXT:    v_mov_b32_e32 v1, s1
338; GFX803-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
339; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
340; GFX803-NEXT:    flat_load_dword v0, v[0:1] glc
341; GFX803-NEXT:    s_waitcnt vmcnt(0)
342; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
343; GFX803-NEXT:    v_or_b32_e32 v0, 0x7b, v0
344; GFX803-NEXT:    ;;#ASMSTART
345; GFX803-NEXT:    ; use v0
346; GFX803-NEXT:    ;;#ASMEND
347; GFX803-NEXT:    s_endpgm
348;
349; GFX7-LABEL: v_pack_v2i16_imm_lo:
350; GFX7:       ; %bb.0:
351; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
352; GFX7-NEXT:    s_mov_b32 s3, 0x100f000
353; GFX7-NEXT:    s_mov_b32 s2, 0
354; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
355; GFX7-NEXT:    v_mov_b32_e32 v1, 0
356; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
357; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
358; GFX7-NEXT:    s_waitcnt vmcnt(0)
359; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
360; GFX7-NEXT:    v_or_b32_e32 v0, 0x7b, v0
361; GFX7-NEXT:    ;;#ASMSTART
362; GFX7-NEXT:    ; use v0
363; GFX7-NEXT:    ;;#ASMEND
364; GFX7-NEXT:    s_endpgm
365  %tid = call i32 @llvm.amdgcn.workitem.id.x()
366  %tid.ext = sext i32 %tid to i64
367  %in1.gep = getelementptr inbounds i32, ptr addrspace(1) %in1, i64 %tid.ext
368  %val1 = load volatile i32, ptr addrspace(1) %in1.gep
369  %hi = trunc i32 %val1 to i16
; Constant goes in the low element, loaded value in the high element.
370  %vec.0 = insertelement <2 x i16> undef, i16 123, i32 0
371  %vec.1 = insertelement <2 x i16> %vec.0, i16 %hi, i32 1
372  %vec.i32 = bitcast <2 x i16> %vec.1 to i32
373  call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
374  ret void
375}
376
; v_pack_v2i16_inline_imm_lo: identical in shape to v_pack_v2i16_imm_lo, but
; the low-element constant is 64. In every checked target the value 64 appears
; directly as an instruction operand (v_perm_b32 on GFX9, v_or_b32 on GFX803
; and GFX7) with no separate instruction materializing it first.
; NOTE(review): presumably 64 was chosen because it is encodable as an inline
; constant on these targets - confirm against the ISA documentation.
377define amdgpu_kernel void @v_pack_v2i16_inline_imm_lo(ptr addrspace(1) %in1) #0 {
378; GFX9-LABEL: v_pack_v2i16_inline_imm_lo:
379; GFX9:       ; %bb.0:
380; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
381; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
382; GFX9-NEXT:    v_mov_b32_e32 v1, 0x5040100
383; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
384; GFX9-NEXT:    global_load_dword v0, v0, s[0:1] glc
385; GFX9-NEXT:    s_waitcnt vmcnt(0)
386; GFX9-NEXT:    v_perm_b32 v0, v0, 64, v1
387; GFX9-NEXT:    ;;#ASMSTART
388; GFX9-NEXT:    ; use v0
389; GFX9-NEXT:    ;;#ASMEND
390; GFX9-NEXT:    s_endpgm
391;
392; GFX803-LABEL: v_pack_v2i16_inline_imm_lo:
393; GFX803:       ; %bb.0:
394; GFX803-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
395; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
396; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
397; GFX803-NEXT:    v_mov_b32_e32 v1, s1
398; GFX803-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
399; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
400; GFX803-NEXT:    flat_load_dword v0, v[0:1] glc
401; GFX803-NEXT:    s_waitcnt vmcnt(0)
402; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
403; GFX803-NEXT:    v_or_b32_e32 v0, 64, v0
404; GFX803-NEXT:    ;;#ASMSTART
405; GFX803-NEXT:    ; use v0
406; GFX803-NEXT:    ;;#ASMEND
407; GFX803-NEXT:    s_endpgm
408;
409; GFX7-LABEL: v_pack_v2i16_inline_imm_lo:
410; GFX7:       ; %bb.0:
411; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
412; GFX7-NEXT:    s_mov_b32 s3, 0x100f000
413; GFX7-NEXT:    s_mov_b32 s2, 0
414; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
415; GFX7-NEXT:    v_mov_b32_e32 v1, 0
416; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
417; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
418; GFX7-NEXT:    s_waitcnt vmcnt(0)
419; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
420; GFX7-NEXT:    v_or_b32_e32 v0, 64, v0
421; GFX7-NEXT:    ;;#ASMSTART
422; GFX7-NEXT:    ; use v0
423; GFX7-NEXT:    ;;#ASMEND
424; GFX7-NEXT:    s_endpgm
425  %tid = call i32 @llvm.amdgcn.workitem.id.x()
426  %tid.ext = sext i32 %tid to i64
427  %in1.gep = getelementptr inbounds i32, ptr addrspace(1) %in1, i64 %tid.ext
428  %val1 = load volatile i32, ptr addrspace(1) %in1.gep
429  %hi = trunc i32 %val1 to i16
; Constant 64 goes in the low element, loaded value in the high element.
430  %vec.0 = insertelement <2 x i16> undef, i16 64, i32 0
431  %vec.1 = insertelement <2 x i16> %vec.0, i16 %hi, i32 1
432  %vec.i32 = bitcast <2 x i16> %vec.1 to i32
433  call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
434  ret void
435}
436
; v_pack_v2i16_imm_hi: element 0 of the <2 x i16> is the truncated result of a
; per-workitem volatile i32 load from the addrspace(1) array; element 1 is the
; constant 123. The packed i32 is consumed by inline asm through a "v"
; constraint.
; The GFX9 checks expect s_movk_i32 0x7b feeding v_perm_b32 (selector 0x5040100
; held in v1); the GFX803 checks expect the pre-shifted constant 0x7b0000 in v1
; combined by v_or_b32_sdwa with src0_sel:WORD_0; the GFX7 checks expect
; v_and_b32 with 0xffff followed by v_or_b32 with 0x7b0000.
437define amdgpu_kernel void @v_pack_v2i16_imm_hi(ptr addrspace(1) %in0) #0 {
438; GFX9-LABEL: v_pack_v2i16_imm_hi:
439; GFX9:       ; %bb.0:
440; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
441; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
442; GFX9-NEXT:    v_mov_b32_e32 v1, 0x5040100
443; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
444; GFX9-NEXT:    global_load_dword v0, v0, s[0:1] glc
445; GFX9-NEXT:    s_waitcnt vmcnt(0)
446; GFX9-NEXT:    s_movk_i32 s0, 0x7b
447; GFX9-NEXT:    v_perm_b32 v0, s0, v0, v1
448; GFX9-NEXT:    ;;#ASMSTART
449; GFX9-NEXT:    ; use v0
450; GFX9-NEXT:    ;;#ASMEND
451; GFX9-NEXT:    s_endpgm
452;
453; GFX803-LABEL: v_pack_v2i16_imm_hi:
454; GFX803:       ; %bb.0:
455; GFX803-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
456; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
457; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
458; GFX803-NEXT:    v_mov_b32_e32 v1, s1
459; GFX803-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
460; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
461; GFX803-NEXT:    flat_load_dword v0, v[0:1] glc
462; GFX803-NEXT:    s_waitcnt vmcnt(0)
463; GFX803-NEXT:    v_mov_b32_e32 v1, 0x7b0000
464; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
465; GFX803-NEXT:    ;;#ASMSTART
466; GFX803-NEXT:    ; use v0
467; GFX803-NEXT:    ;;#ASMEND
468; GFX803-NEXT:    s_endpgm
469;
470; GFX7-LABEL: v_pack_v2i16_imm_hi:
471; GFX7:       ; %bb.0:
472; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
473; GFX7-NEXT:    s_mov_b32 s3, 0x100f000
474; GFX7-NEXT:    s_mov_b32 s2, 0
475; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
476; GFX7-NEXT:    v_mov_b32_e32 v1, 0
477; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
478; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
479; GFX7-NEXT:    s_waitcnt vmcnt(0)
480; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
481; GFX7-NEXT:    v_or_b32_e32 v0, 0x7b0000, v0
482; GFX7-NEXT:    ;;#ASMSTART
483; GFX7-NEXT:    ; use v0
484; GFX7-NEXT:    ;;#ASMEND
485; GFX7-NEXT:    s_endpgm
486  %tid = call i32 @llvm.amdgcn.workitem.id.x()
487  %tid.ext = sext i32 %tid to i64
488  %in0.gep = getelementptr inbounds i32, ptr addrspace(1) %in0, i64 %tid.ext
489  %val0 = load volatile i32, ptr addrspace(1) %in0.gep
490  %lo = trunc i32 %val0 to i16
; Loaded value goes in the low element, constant in the high element.
491  %vec.0 = insertelement <2 x i16> undef, i16 %lo, i32 0
492  %vec.1 = insertelement <2 x i16> %vec.0, i16 123, i32 1
493  %vec.i32 = bitcast <2 x i16> %vec.1 to i32
494  call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
495  ret void
496}
497
; v_pack_v2i16_inline_imm_hi: identical in shape to v_pack_v2i16_imm_hi, but
; the high-element constant is 7. The GFX9 checks expect 7 to appear directly
; as the first data operand of v_perm_b32; the GFX803 checks still expect the
; pre-shifted value 0x70000 to be moved into v1 before v_or_b32_sdwa; the GFX7
; checks expect v_and_b32 with 0xffff followed by v_or_b32 with 0x70000.
; NOTE(review): presumably 7 was chosen because it is encodable as an inline
; constant on these targets - confirm against the ISA documentation.
498define amdgpu_kernel void @v_pack_v2i16_inline_imm_hi(ptr addrspace(1) %in0) #0 {
499; GFX9-LABEL: v_pack_v2i16_inline_imm_hi:
500; GFX9:       ; %bb.0:
501; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
502; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
503; GFX9-NEXT:    v_mov_b32_e32 v1, 0x5040100
504; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
505; GFX9-NEXT:    global_load_dword v0, v0, s[0:1] glc
506; GFX9-NEXT:    s_waitcnt vmcnt(0)
507; GFX9-NEXT:    v_perm_b32 v0, 7, v0, v1
508; GFX9-NEXT:    ;;#ASMSTART
509; GFX9-NEXT:    ; use v0
510; GFX9-NEXT:    ;;#ASMEND
511; GFX9-NEXT:    s_endpgm
512;
513; GFX803-LABEL: v_pack_v2i16_inline_imm_hi:
514; GFX803:       ; %bb.0:
515; GFX803-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
516; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
517; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
518; GFX803-NEXT:    v_mov_b32_e32 v1, s1
519; GFX803-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
520; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
521; GFX803-NEXT:    flat_load_dword v0, v[0:1] glc
522; GFX803-NEXT:    s_waitcnt vmcnt(0)
523; GFX803-NEXT:    v_mov_b32_e32 v1, 0x70000
524; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
525; GFX803-NEXT:    ;;#ASMSTART
526; GFX803-NEXT:    ; use v0
527; GFX803-NEXT:    ;;#ASMEND
528; GFX803-NEXT:    s_endpgm
529;
530; GFX7-LABEL: v_pack_v2i16_inline_imm_hi:
531; GFX7:       ; %bb.0:
532; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
533; GFX7-NEXT:    s_mov_b32 s3, 0x100f000
534; GFX7-NEXT:    s_mov_b32 s2, 0
535; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
536; GFX7-NEXT:    v_mov_b32_e32 v1, 0
537; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
538; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
539; GFX7-NEXT:    s_waitcnt vmcnt(0)
540; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
541; GFX7-NEXT:    v_or_b32_e32 v0, 0x70000, v0
542; GFX7-NEXT:    ;;#ASMSTART
543; GFX7-NEXT:    ; use v0
544; GFX7-NEXT:    ;;#ASMEND
545; GFX7-NEXT:    s_endpgm
546  %tid = call i32 @llvm.amdgcn.workitem.id.x()
547  %tid.ext = sext i32 %tid to i64
548  %in0.gep = getelementptr inbounds i32, ptr addrspace(1) %in0, i64 %tid.ext
549  %val0 = load volatile i32, ptr addrspace(1) %in0.gep
550  %lo = trunc i32 %val0 to i16
; Loaded value goes in the low element, constant 7 in the high element.
551  %vec.0 = insertelement <2 x i16> undef, i16 %lo, i32 0
552  %vec.1 = insertelement <2 x i16> %vec.0, i16 7, i32 1
553  %vec.i32 = bitcast <2 x i16> %vec.1 to i32
554  call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
555  ret void
556}
557
; Declaration of the llvm.amdgcn.workitem.id.x intrinsic. The v_pack_* kernels
; sign-extend its i32 result and use it as the getelementptr index into their
; addrspace(1) inputs.
558declare i32 @llvm.amdgcn.workitem.id.x() #1
559
; Attribute group #0 is referenced by the kernel definitions and inline-asm
; calls visible here; group #1 is referenced by the intrinsic declaration.
560attributes #0 = { nounwind }
561attributes #1 = { nounwind readnone }
562
563