xref: /llvm-project/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll (revision 6206f5444fc0732e6495703c75a67f1f90f5b418)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
3; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=CIVI,VI %s
4; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=CIVI,CI %s
5; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
6
; Scalar case, constant into element 0: loads a <2 x i16> through the
; addrspace(4) pointer %vec.ptr, replaces element 0 with the constant 999
; (printed as 0x3e7 in the checked output) and stores the vector to %out.
; CI and VI share the CIVI prefix for this function.
7define amdgpu_kernel void @s_insertelement_v2i16_0(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 {
8; GFX9-LABEL: s_insertelement_v2i16_0:
9; GFX9:       ; %bb.0:
10; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
11; GFX9-NEXT:    v_mov_b32_e32 v0, 0
12; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
13; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
14; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
15; GFX9-NEXT:    s_pack_lh_b32_b16 s2, 0x3e7, s2
16; GFX9-NEXT:    v_mov_b32_e32 v1, s2
17; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
18; GFX9-NEXT:    s_endpgm
19;
20; CIVI-LABEL: s_insertelement_v2i16_0:
21; CIVI:       ; %bb.0:
22; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
23; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
24; CIVI-NEXT:    s_load_dword s2, s[2:3], 0x0
25; CIVI-NEXT:    v_mov_b32_e32 v0, s0
26; CIVI-NEXT:    v_mov_b32_e32 v1, s1
27; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
28; CIVI-NEXT:    s_and_b32 s0, s2, 0xffff0000
29; CIVI-NEXT:    s_or_b32 s0, s0, 0x3e7
30; CIVI-NEXT:    v_mov_b32_e32 v2, s0
31; CIVI-NEXT:    flat_store_dword v[0:1], v2
32; CIVI-NEXT:    s_endpgm
33;
34; GFX11-LABEL: s_insertelement_v2i16_0:
35; GFX11:       ; %bb.0:
36; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
37; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
38; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
39; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
40; GFX11-NEXT:    s_pack_lh_b32_b16 s2, 0x3e7, s2
41; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
42; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
43; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
44; GFX11-NEXT:    s_endpgm
45  %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
46  %vecins = insertelement <2 x i16> %vec, i16 999, i32 0
47  store <2 x i16> %vecins, ptr addrspace(1) %out
48  ret void
49}
50
51
; Scalar case, kernel argument into element 0: loads a <2 x i16> through
; %vec.ptr, replaces element 0 with the i16 kernel argument %elt and stores
; the vector to %out. An unnamed [8 x i32] argument sits between %vec.ptr and
; %elt in the signature; the checked code loads a second scalar from offset
; 0x30 (0xc in the CI output), presumably %elt.
52define amdgpu_kernel void @s_insertelement_v2i16_0_reg(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, [8 x i32], i16 %elt) #0 {
53; GFX9-LABEL: s_insertelement_v2i16_0_reg:
54; GFX9:       ; %bb.0:
55; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
56; GFX9-NEXT:    s_load_dword s4, s[8:9], 0x30
57; GFX9-NEXT:    v_mov_b32_e32 v0, 0
58; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
59; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
60; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
61; GFX9-NEXT:    s_pack_lh_b32_b16 s2, s4, s2
62; GFX9-NEXT:    v_mov_b32_e32 v1, s2
63; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
64; GFX9-NEXT:    s_endpgm
65;
66; VI-LABEL: s_insertelement_v2i16_0_reg:
67; VI:       ; %bb.0:
68; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
69; VI-NEXT:    s_load_dword s4, s[8:9], 0x30
70; VI-NEXT:    s_waitcnt lgkmcnt(0)
71; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
72; VI-NEXT:    v_mov_b32_e32 v0, s0
73; VI-NEXT:    v_mov_b32_e32 v1, s1
74; VI-NEXT:    s_and_b32 s0, s4, 0xffff
75; VI-NEXT:    s_waitcnt lgkmcnt(0)
76; VI-NEXT:    s_and_b32 s1, s2, 0xffff0000
77; VI-NEXT:    s_or_b32 s0, s0, s1
78; VI-NEXT:    v_mov_b32_e32 v2, s0
79; VI-NEXT:    flat_store_dword v[0:1], v2
80; VI-NEXT:    s_endpgm
81;
82; CI-LABEL: s_insertelement_v2i16_0_reg:
83; CI:       ; %bb.0:
84; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
85; CI-NEXT:    s_load_dword s4, s[8:9], 0xc
86; CI-NEXT:    s_waitcnt lgkmcnt(0)
87; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
88; CI-NEXT:    v_mov_b32_e32 v0, s0
89; CI-NEXT:    v_mov_b32_e32 v1, s1
90; CI-NEXT:    s_and_b32 s1, s4, 0xffff
91; CI-NEXT:    s_waitcnt lgkmcnt(0)
92; CI-NEXT:    s_and_b32 s0, s2, 0xffff0000
93; CI-NEXT:    s_or_b32 s0, s1, s0
94; CI-NEXT:    v_mov_b32_e32 v2, s0
95; CI-NEXT:    flat_store_dword v[0:1], v2
96; CI-NEXT:    s_endpgm
97;
98; GFX11-LABEL: s_insertelement_v2i16_0_reg:
99; GFX11:       ; %bb.0:
100; GFX11-NEXT:    s_clause 0x1
101; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
102; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x30
103; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
104; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
105; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
106; GFX11-NEXT:    s_pack_lh_b32_b16 s2, s4, s2
107; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
108; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
109; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
110; GFX11-NEXT:    s_endpgm
111  %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
112  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
113  store <2 x i16> %vecins, ptr addrspace(1) %out
114  ret void
115}
116
; Like the plain "kernel argument into element 0" case, but the original
; element 1 of the loaded vector has a second use: it is extracted,
; zero-extended to i32 and passed to an inline asm ("; use $0") through an
; "s" constraint, in addition to surviving in the stored vector.
117define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, [8 x i32], i16 %elt) #0 {
118; GFX9-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg:
119; GFX9:       ; %bb.0:
120; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
121; GFX9-NEXT:    s_load_dword s4, s[8:9], 0x30
122; GFX9-NEXT:    v_mov_b32_e32 v0, 0
123; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
124; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
125; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
126; GFX9-NEXT:    s_lshr_b32 s2, s2, 16
127; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s4, s2
128; GFX9-NEXT:    v_mov_b32_e32 v1, s3
129; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
130; GFX9-NEXT:    ;;#ASMSTART
131; GFX9-NEXT:    ; use s2
132; GFX9-NEXT:    ;;#ASMEND
133; GFX9-NEXT:    s_endpgm
134;
135; VI-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg:
136; VI:       ; %bb.0:
137; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
138; VI-NEXT:    s_load_dword s4, s[8:9], 0x30
139; VI-NEXT:    s_waitcnt lgkmcnt(0)
140; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
141; VI-NEXT:    v_mov_b32_e32 v0, s0
142; VI-NEXT:    v_mov_b32_e32 v1, s1
143; VI-NEXT:    s_and_b32 s0, s4, 0xffff
144; VI-NEXT:    s_waitcnt lgkmcnt(0)
145; VI-NEXT:    s_lshr_b32 s1, s2, 16
146; VI-NEXT:    s_and_b32 s2, s2, 0xffff0000
147; VI-NEXT:    s_or_b32 s0, s0, s2
148; VI-NEXT:    v_mov_b32_e32 v2, s0
149; VI-NEXT:    flat_store_dword v[0:1], v2
150; VI-NEXT:    ;;#ASMSTART
151; VI-NEXT:    ; use s1
152; VI-NEXT:    ;;#ASMEND
153; VI-NEXT:    s_endpgm
154;
155; CI-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg:
156; CI:       ; %bb.0:
157; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
158; CI-NEXT:    s_load_dword s4, s[8:9], 0xc
159; CI-NEXT:    s_waitcnt lgkmcnt(0)
160; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
161; CI-NEXT:    v_mov_b32_e32 v0, s0
162; CI-NEXT:    v_mov_b32_e32 v1, s1
163; CI-NEXT:    s_and_b32 s0, s4, 0xffff
164; CI-NEXT:    s_waitcnt lgkmcnt(0)
165; CI-NEXT:    s_and_b32 s1, s2, 0xffff0000
166; CI-NEXT:    s_or_b32 s0, s0, s1
167; CI-NEXT:    v_mov_b32_e32 v2, s0
168; CI-NEXT:    s_lshr_b32 s2, s2, 16
169; CI-NEXT:    flat_store_dword v[0:1], v2
170; CI-NEXT:    ;;#ASMSTART
171; CI-NEXT:    ; use s2
172; CI-NEXT:    ;;#ASMEND
173; CI-NEXT:    s_endpgm
174;
175; GFX11-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg:
176; GFX11:       ; %bb.0:
177; GFX11-NEXT:    s_clause 0x1
178; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
179; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x30
180; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
181; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
182; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
183; GFX11-NEXT:    s_lshr_b32 s2, s2, 16
184; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
185; GFX11-NEXT:    s_pack_ll_b32_b16 s3, s4, s2
186; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s3
187; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
188; GFX11-NEXT:    ;;#ASMSTART
189; GFX11-NEXT:    ; use s2
190; GFX11-NEXT:    ;;#ASMEND
191; GFX11-NEXT:    s_endpgm
192  %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
193  %elt1 = extractelement <2 x i16> %vec, i32 1
194  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
195  store <2 x i16> %vecins, ptr addrspace(1) %out
196  %use1 = zext i16 %elt1 to i32
197  call void asm sideeffect "; use $0", "s"(i32 %use1) #0
198  ret void
199}
200
; Scalar case where the inserted value is the upper half of a 32-bit kernel
; argument: %elt.arg is shifted right by 16 and truncated to i16, and that
; value replaces element 0 of the <2 x i16> loaded through %vec.ptr before
; the vector is stored to %out.
201define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, [8 x i32], i32 %elt.arg) #0 {
202; GFX9-LABEL: s_insertelement_v2i16_0_reghi:
203; GFX9:       ; %bb.0:
204; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
205; GFX9-NEXT:    s_load_dword s4, s[8:9], 0x30
206; GFX9-NEXT:    v_mov_b32_e32 v0, 0
207; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
208; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
209; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
210; GFX9-NEXT:    s_pack_hh_b32_b16 s2, s4, s2
211; GFX9-NEXT:    v_mov_b32_e32 v1, s2
212; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
213; GFX9-NEXT:    s_endpgm
214;
215; VI-LABEL: s_insertelement_v2i16_0_reghi:
216; VI:       ; %bb.0:
217; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
218; VI-NEXT:    s_load_dword s4, s[8:9], 0x30
219; VI-NEXT:    s_waitcnt lgkmcnt(0)
220; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
221; VI-NEXT:    v_mov_b32_e32 v0, s0
222; VI-NEXT:    v_mov_b32_e32 v2, s4
223; VI-NEXT:    v_mov_b32_e32 v1, s1
224; VI-NEXT:    s_waitcnt lgkmcnt(0)
225; VI-NEXT:    s_lshr_b32 s0, s2, 16
226; VI-NEXT:    v_alignbit_b32 v2, s0, v2, 16
227; VI-NEXT:    flat_store_dword v[0:1], v2
228; VI-NEXT:    s_endpgm
229;
230; CI-LABEL: s_insertelement_v2i16_0_reghi:
231; CI:       ; %bb.0:
232; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
233; CI-NEXT:    s_load_dword s4, s[8:9], 0xc
234; CI-NEXT:    s_waitcnt lgkmcnt(0)
235; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
236; CI-NEXT:    v_mov_b32_e32 v0, s0
237; CI-NEXT:    v_mov_b32_e32 v2, s4
238; CI-NEXT:    v_mov_b32_e32 v1, s1
239; CI-NEXT:    s_waitcnt lgkmcnt(0)
240; CI-NEXT:    s_lshr_b32 s0, s2, 16
241; CI-NEXT:    v_alignbit_b32 v2, s0, v2, 16
242; CI-NEXT:    flat_store_dword v[0:1], v2
243; CI-NEXT:    s_endpgm
244;
245; GFX11-LABEL: s_insertelement_v2i16_0_reghi:
246; GFX11:       ; %bb.0:
247; GFX11-NEXT:    s_clause 0x1
248; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
249; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x30
250; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
251; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
252; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
253; GFX11-NEXT:    s_pack_hh_b32_b16 s2, s4, s2
254; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
255; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
256; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
257; GFX11-NEXT:    s_endpgm
258  %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
259  %elt.hi = lshr i32 %elt.arg, 16
260  %elt = trunc i32 %elt.hi to i16
261  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
262  store <2 x i16> %vecins, ptr addrspace(1) %out
263  ret void
264}
265
; Upper half of %elt.arg (lshr 16, trunc to i16) is inserted into element 0,
; and the same truncated value has a second use: it is zero-extended to i32
; and passed to an inline asm ("; use $0") through an "s" constraint.
; Unlike the single-use variant there is no [8 x i32] argument in the
; signature; the checked second scalar load uses offset 0x10 (0x4 on CI).
266define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, i32 %elt.arg) #0 {
267; GFX9-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1:
268; GFX9:       ; %bb.0:
269; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
270; GFX9-NEXT:    s_load_dword s4, s[8:9], 0x10
271; GFX9-NEXT:    v_mov_b32_e32 v0, 0
272; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
273; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
274; GFX9-NEXT:    s_lshr_b32 s3, s4, 16
275; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
276; GFX9-NEXT:    s_pack_lh_b32_b16 s2, s3, s2
277; GFX9-NEXT:    v_mov_b32_e32 v1, s2
278; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
279; GFX9-NEXT:    ;;#ASMSTART
280; GFX9-NEXT:    ; use s3
281; GFX9-NEXT:    ;;#ASMEND
282; GFX9-NEXT:    s_endpgm
283;
284; VI-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1:
285; VI:       ; %bb.0:
286; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
287; VI-NEXT:    s_load_dword s4, s[8:9], 0x10
288; VI-NEXT:    s_waitcnt lgkmcnt(0)
289; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
290; VI-NEXT:    v_mov_b32_e32 v1, s1
291; VI-NEXT:    v_mov_b32_e32 v2, s4
292; VI-NEXT:    v_mov_b32_e32 v0, s0
293; VI-NEXT:    s_lshr_b32 s0, s4, 16
294; VI-NEXT:    s_waitcnt lgkmcnt(0)
295; VI-NEXT:    s_lshr_b32 s1, s2, 16
296; VI-NEXT:    v_alignbit_b32 v2, s1, v2, 16
297; VI-NEXT:    flat_store_dword v[0:1], v2
298; VI-NEXT:    ;;#ASMSTART
299; VI-NEXT:    ; use s0
300; VI-NEXT:    ;;#ASMEND
301; VI-NEXT:    s_endpgm
302;
303; CI-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1:
304; CI:       ; %bb.0:
305; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
306; CI-NEXT:    s_load_dword s4, s[8:9], 0x4
307; CI-NEXT:    s_waitcnt lgkmcnt(0)
308; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
309; CI-NEXT:    v_mov_b32_e32 v1, s1
310; CI-NEXT:    v_mov_b32_e32 v2, s4
311; CI-NEXT:    v_mov_b32_e32 v0, s0
312; CI-NEXT:    s_lshr_b32 s0, s4, 16
313; CI-NEXT:    s_waitcnt lgkmcnt(0)
314; CI-NEXT:    s_lshr_b32 s1, s2, 16
315; CI-NEXT:    v_alignbit_b32 v2, s1, v2, 16
316; CI-NEXT:    flat_store_dword v[0:1], v2
317; CI-NEXT:    ;;#ASMSTART
318; CI-NEXT:    ; use s0
319; CI-NEXT:    ;;#ASMEND
320; CI-NEXT:    s_endpgm
321;
322; GFX11-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1:
323; GFX11:       ; %bb.0:
324; GFX11-NEXT:    s_clause 0x1
325; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
326; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x10
327; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
328; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
329; GFX11-NEXT:    s_lshr_b32 s3, s4, 16
330; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
331; GFX11-NEXT:    s_pack_lh_b32_b16 s2, s3, s2
332; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
333; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
334; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
335; GFX11-NEXT:    ;;#ASMSTART
336; GFX11-NEXT:    ; use s3
337; GFX11-NEXT:    ;;#ASMEND
338; GFX11-NEXT:    s_endpgm
339  %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
340  %elt.hi = lshr i32 %elt.arg, 16
341  %elt = trunc i32 %elt.hi to i16
342  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
343  store <2 x i16> %vecins, ptr addrspace(1) %out
344  %use1 = zext i16 %elt to i32
345  call void asm sideeffect "; use $0", "s"(i32 %use1) #0
346  ret void
347}
348
; Upper half of %elt.arg (lshr 16, trunc to i16) is inserted into element 0,
; and both halves of the stored vector have an extra use: the inserted value
; and the original element 1 of the loaded vector are each zero-extended to
; i32 and passed to their own inline asm ("; use $0") through an "s"
; constraint, in that order.
349define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, i32 %elt.arg) #0 {
350; GFX9-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1:
351; GFX9:       ; %bb.0:
352; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
353; GFX9-NEXT:    s_load_dword s4, s[8:9], 0x10
354; GFX9-NEXT:    v_mov_b32_e32 v0, 0
355; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
356; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
357; GFX9-NEXT:    s_lshr_b32 s3, s4, 16
358; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
359; GFX9-NEXT:    s_lshr_b32 s2, s2, 16
360; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s3, s2
361; GFX9-NEXT:    v_mov_b32_e32 v1, s4
362; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
363; GFX9-NEXT:    ;;#ASMSTART
364; GFX9-NEXT:    ; use s3
365; GFX9-NEXT:    ;;#ASMEND
366; GFX9-NEXT:    ;;#ASMSTART
367; GFX9-NEXT:    ; use s2
368; GFX9-NEXT:    ;;#ASMEND
369; GFX9-NEXT:    s_endpgm
370;
371; VI-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1:
372; VI:       ; %bb.0:
373; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
374; VI-NEXT:    s_load_dword s4, s[8:9], 0x10
375; VI-NEXT:    s_waitcnt lgkmcnt(0)
376; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
377; VI-NEXT:    v_mov_b32_e32 v1, s1
378; VI-NEXT:    v_mov_b32_e32 v2, s4
379; VI-NEXT:    v_mov_b32_e32 v0, s0
380; VI-NEXT:    s_lshr_b32 s0, s4, 16
381; VI-NEXT:    s_waitcnt lgkmcnt(0)
382; VI-NEXT:    s_lshr_b32 s1, s2, 16
383; VI-NEXT:    v_alignbit_b32 v2, s1, v2, 16
384; VI-NEXT:    flat_store_dword v[0:1], v2
385; VI-NEXT:    ;;#ASMSTART
386; VI-NEXT:    ; use s0
387; VI-NEXT:    ;;#ASMEND
388; VI-NEXT:    ;;#ASMSTART
389; VI-NEXT:    ; use s1
390; VI-NEXT:    ;;#ASMEND
391; VI-NEXT:    s_endpgm
392;
393; CI-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1:
394; CI:       ; %bb.0:
395; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
396; CI-NEXT:    s_load_dword s4, s[8:9], 0x4
397; CI-NEXT:    s_waitcnt lgkmcnt(0)
398; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
399; CI-NEXT:    v_mov_b32_e32 v1, s1
400; CI-NEXT:    v_mov_b32_e32 v2, s4
401; CI-NEXT:    v_mov_b32_e32 v0, s0
402; CI-NEXT:    s_lshr_b32 s0, s4, 16
403; CI-NEXT:    s_waitcnt lgkmcnt(0)
404; CI-NEXT:    s_lshr_b32 s1, s2, 16
405; CI-NEXT:    v_alignbit_b32 v2, s1, v2, 16
406; CI-NEXT:    flat_store_dword v[0:1], v2
407; CI-NEXT:    ;;#ASMSTART
408; CI-NEXT:    ; use s0
409; CI-NEXT:    ;;#ASMEND
410; CI-NEXT:    ;;#ASMSTART
411; CI-NEXT:    ; use s1
412; CI-NEXT:    ;;#ASMEND
413; CI-NEXT:    s_endpgm
414;
415; GFX11-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1:
416; GFX11:       ; %bb.0:
417; GFX11-NEXT:    s_clause 0x1
418; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
419; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x10
420; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
421; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
422; GFX11-NEXT:    s_lshr_b32 s3, s4, 16
423; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
424; GFX11-NEXT:    s_lshr_b32 s2, s2, 16
425; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
426; GFX11-NEXT:    s_pack_ll_b32_b16 s4, s3, s2
427; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
428; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
429; GFX11-NEXT:    ;;#ASMSTART
430; GFX11-NEXT:    ; use s3
431; GFX11-NEXT:    ;;#ASMEND
432; GFX11-NEXT:    ;;#ASMSTART
433; GFX11-NEXT:    ; use s2
434; GFX11-NEXT:    ;;#ASMEND
435; GFX11-NEXT:    s_endpgm
436  %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
437  %elt.hi = lshr i32 %elt.arg, 16
438  %elt = trunc i32 %elt.hi to i16
439  %vec.hi = extractelement <2 x i16> %vec, i32 1
440  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
441  store <2 x i16> %vecins, ptr addrspace(1) %out
442  %use1 = zext i16 %elt to i32
443  %vec.hi.use1 = zext i16 %vec.hi to i32
444
445  call void asm sideeffect "; use $0", "s"(i32 %use1) #0
446  call void asm sideeffect "; use $0", "s"(i32 %vec.hi.use1) #0
447  ret void
448}
449
; Scalar case, constant into element 1: loads a <2 x i16> through the
; addrspace(4) pointer %vec.ptr, replaces element 1 with the constant 999
; and stores the vector to %out. The CIVI output ORs in 0x3e70000, i.e. 999
; placed in the upper 16 bits of the dword.
450define amdgpu_kernel void @s_insertelement_v2i16_1(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 {
451; GFX9-LABEL: s_insertelement_v2i16_1:
452; GFX9:       ; %bb.0:
453; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
454; GFX9-NEXT:    v_mov_b32_e32 v0, 0
455; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
456; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
457; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
458; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, 0x3e7
459; GFX9-NEXT:    v_mov_b32_e32 v1, s2
460; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
461; GFX9-NEXT:    s_endpgm
462;
463; CIVI-LABEL: s_insertelement_v2i16_1:
464; CIVI:       ; %bb.0:
465; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
466; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
467; CIVI-NEXT:    s_load_dword s2, s[2:3], 0x0
468; CIVI-NEXT:    v_mov_b32_e32 v0, s0
469; CIVI-NEXT:    v_mov_b32_e32 v1, s1
470; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
471; CIVI-NEXT:    s_and_b32 s0, s2, 0xffff
472; CIVI-NEXT:    s_or_b32 s0, s0, 0x3e70000
473; CIVI-NEXT:    v_mov_b32_e32 v2, s0
474; CIVI-NEXT:    flat_store_dword v[0:1], v2
475; CIVI-NEXT:    s_endpgm
476;
477; GFX11-LABEL: s_insertelement_v2i16_1:
478; GFX11:       ; %bb.0:
479; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
480; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
481; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
482; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
483; GFX11-NEXT:    s_pack_ll_b32_b16 s2, s2, 0x3e7
484; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
485; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
486; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
487; GFX11-NEXT:    s_endpgm
488  %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
489  %vecins = insertelement <2 x i16> %vec, i16 999, i32 1
490  store <2 x i16> %vecins, ptr addrspace(1) %out
491  ret void
492}
493
; Scalar case, kernel argument into element 1: loads a <2 x i16> through
; %vec.ptr, replaces element 1 with the i16 kernel argument %elt and stores
; the vector to %out. An unnamed [8 x i32] argument sits between %vec.ptr and
; %elt in the signature.
494define amdgpu_kernel void @s_insertelement_v2i16_1_reg(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, [8 x i32], i16 %elt) #0 {
495; GFX9-LABEL: s_insertelement_v2i16_1_reg:
496; GFX9:       ; %bb.0:
497; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
498; GFX9-NEXT:    s_load_dword s4, s[8:9], 0x30
499; GFX9-NEXT:    v_mov_b32_e32 v0, 0
500; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
501; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
502; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
503; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s4
504; GFX9-NEXT:    v_mov_b32_e32 v1, s2
505; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
506; GFX9-NEXT:    s_endpgm
507;
508; VI-LABEL: s_insertelement_v2i16_1_reg:
509; VI:       ; %bb.0:
510; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
511; VI-NEXT:    s_load_dword s4, s[8:9], 0x30
512; VI-NEXT:    s_waitcnt lgkmcnt(0)
513; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
514; VI-NEXT:    v_mov_b32_e32 v0, s0
515; VI-NEXT:    v_mov_b32_e32 v1, s1
516; VI-NEXT:    s_lshl_b32 s0, s4, 16
517; VI-NEXT:    s_waitcnt lgkmcnt(0)
518; VI-NEXT:    s_and_b32 s1, s2, 0xffff
519; VI-NEXT:    s_or_b32 s0, s1, s0
520; VI-NEXT:    v_mov_b32_e32 v2, s0
521; VI-NEXT:    flat_store_dword v[0:1], v2
522; VI-NEXT:    s_endpgm
523;
524; CI-LABEL: s_insertelement_v2i16_1_reg:
525; CI:       ; %bb.0:
526; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
527; CI-NEXT:    s_load_dword s4, s[8:9], 0xc
528; CI-NEXT:    s_waitcnt lgkmcnt(0)
529; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
530; CI-NEXT:    v_mov_b32_e32 v0, s0
531; CI-NEXT:    v_mov_b32_e32 v1, s1
532; CI-NEXT:    s_lshl_b32 s1, s4, 16
533; CI-NEXT:    s_waitcnt lgkmcnt(0)
534; CI-NEXT:    s_and_b32 s0, s2, 0xffff
535; CI-NEXT:    s_or_b32 s0, s0, s1
536; CI-NEXT:    v_mov_b32_e32 v2, s0
537; CI-NEXT:    flat_store_dword v[0:1], v2
538; CI-NEXT:    s_endpgm
539;
540; GFX11-LABEL: s_insertelement_v2i16_1_reg:
541; GFX11:       ; %bb.0:
542; GFX11-NEXT:    s_clause 0x1
543; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
544; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x30
545; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
546; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
547; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
548; GFX11-NEXT:    s_pack_ll_b32_b16 s2, s2, s4
549; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
550; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
551; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
552; GFX11-NEXT:    s_endpgm
553  %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
554  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 1
555  store <2 x i16> %vecins, ptr addrspace(1) %out
556  ret void
557}
558
; Half-precision counterpart of the constant-into-element-0 case: loads a
; <2 x half> through %vec.ptr, replaces element 0 with half 5.0 (which shows
; up as the immediate 0x4500 in the checked output) and stores the vector to
; %out. CI and VI share the CIVI prefix for this function.
559define amdgpu_kernel void @s_insertelement_v2f16_0(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 {
560; GFX9-LABEL: s_insertelement_v2f16_0:
561; GFX9:       ; %bb.0:
562; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
563; GFX9-NEXT:    v_mov_b32_e32 v0, 0
564; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
565; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
566; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
567; GFX9-NEXT:    s_lshr_b32 s2, s2, 16
568; GFX9-NEXT:    s_pack_ll_b32_b16 s2, 0x4500, s2
569; GFX9-NEXT:    v_mov_b32_e32 v1, s2
570; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
571; GFX9-NEXT:    s_endpgm
572;
573; CIVI-LABEL: s_insertelement_v2f16_0:
574; CIVI:       ; %bb.0:
575; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
576; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
577; CIVI-NEXT:    s_load_dword s2, s[2:3], 0x0
578; CIVI-NEXT:    v_mov_b32_e32 v0, s0
579; CIVI-NEXT:    v_mov_b32_e32 v1, s1
580; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
581; CIVI-NEXT:    s_and_b32 s0, s2, 0xffff0000
582; CIVI-NEXT:    s_or_b32 s0, s0, 0x4500
583; CIVI-NEXT:    v_mov_b32_e32 v2, s0
584; CIVI-NEXT:    flat_store_dword v[0:1], v2
585; CIVI-NEXT:    s_endpgm
586;
587; GFX11-LABEL: s_insertelement_v2f16_0:
588; GFX11:       ; %bb.0:
589; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
590; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
591; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
592; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
593; GFX11-NEXT:    s_lshr_b32 s2, s2, 16
594; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
595; GFX11-NEXT:    s_pack_ll_b32_b16 s2, 0x4500, s2
596; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
597; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
598; GFX11-NEXT:    s_endpgm
599  %vec = load <2 x half>, ptr addrspace(4) %vec.ptr
600  %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 0
601  store <2 x half> %vecins, ptr addrspace(1) %out
602  ret void
603}
604
; Half-precision counterpart of the constant-into-element-1 case: loads a
; <2 x half> through %vec.ptr, replaces element 1 with half 5.0 and stores
; the vector to %out. The constant appears as 0x4500 in the GFX9/GFX11 pack
; instruction and as 0x45000000 (upper 16 bits) in the CIVI OR.
605define amdgpu_kernel void @s_insertelement_v2f16_1(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 {
606; GFX9-LABEL: s_insertelement_v2f16_1:
607; GFX9:       ; %bb.0:
608; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
609; GFX9-NEXT:    v_mov_b32_e32 v0, 0
610; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
611; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
612; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
613; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, 0x4500
614; GFX9-NEXT:    v_mov_b32_e32 v1, s2
615; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
616; GFX9-NEXT:    s_endpgm
617;
618; CIVI-LABEL: s_insertelement_v2f16_1:
619; CIVI:       ; %bb.0:
620; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
621; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
622; CIVI-NEXT:    s_load_dword s2, s[2:3], 0x0
623; CIVI-NEXT:    v_mov_b32_e32 v0, s0
624; CIVI-NEXT:    v_mov_b32_e32 v1, s1
625; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
626; CIVI-NEXT:    s_and_b32 s0, s2, 0xffff
627; CIVI-NEXT:    s_or_b32 s0, s0, 0x45000000
628; CIVI-NEXT:    v_mov_b32_e32 v2, s0
629; CIVI-NEXT:    flat_store_dword v[0:1], v2
630; CIVI-NEXT:    s_endpgm
631;
632; GFX11-LABEL: s_insertelement_v2f16_1:
633; GFX11:       ; %bb.0:
634; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
635; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
636; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
637; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
638; GFX11-NEXT:    s_pack_ll_b32_b16 s2, s2, 0x4500
639; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
640; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
641; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
642; GFX11-NEXT:    s_endpgm
643  %vec = load <2 x half>, ptr addrspace(4) %vec.ptr
644  %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 1
645  store <2 x half> %vecins, ptr addrspace(1) %out
646  ret void
647}
648
; Per-workitem variant of the constant-into-element-0 case: the result of
; llvm.amdgcn.workitem.id.x is sign-extended and used as the index into both
; %in and %out (getelementptr inbounds over <2 x i16>). Each workitem loads
; its <2 x i16> from addrspace(1), replaces element 0 with the constant 999
; and stores the vector to the matching slot of %out.
649define amdgpu_kernel void @v_insertelement_v2i16_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
650; GFX9-LABEL: v_insertelement_v2i16_0:
651; GFX9:       ; %bb.0:
652; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
653; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
654; GFX9-NEXT:    v_mov_b32_e32 v2, 0x3e7
655; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
656; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
657; GFX9-NEXT:    s_mov_b32 s2, 0xffff
658; GFX9-NEXT:    s_waitcnt vmcnt(0)
659; GFX9-NEXT:    v_bfi_b32 v1, s2, v2, v1
660; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
661; GFX9-NEXT:    s_endpgm
662;
663; VI-LABEL: v_insertelement_v2i16_0:
664; VI:       ; %bb.0:
665; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
666; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
667; VI-NEXT:    s_waitcnt lgkmcnt(0)
668; VI-NEXT:    v_mov_b32_e32 v1, s3
669; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
670; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
671; VI-NEXT:    flat_load_dword v3, v[0:1]
672; VI-NEXT:    v_mov_b32_e32 v1, s1
673; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
674; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
675; VI-NEXT:    s_waitcnt vmcnt(0)
676; VI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
677; VI-NEXT:    v_or_b32_e32 v2, 0x3e7, v2
678; VI-NEXT:    flat_store_dword v[0:1], v2
679; VI-NEXT:    s_endpgm
680;
681; CI-LABEL: v_insertelement_v2i16_0:
682; CI:       ; %bb.0:
683; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
684; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
685; CI-NEXT:    s_waitcnt lgkmcnt(0)
686; CI-NEXT:    v_mov_b32_e32 v1, s3
687; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
688; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
689; CI-NEXT:    flat_load_dword v3, v[0:1]
690; CI-NEXT:    v_mov_b32_e32 v1, s1
691; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
692; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
693; CI-NEXT:    s_waitcnt vmcnt(0)
694; CI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
695; CI-NEXT:    v_or_b32_e32 v2, 0x3e7, v2
696; CI-NEXT:    flat_store_dword v[0:1], v2
697; CI-NEXT:    s_endpgm
698;
699; GFX11-LABEL: v_insertelement_v2i16_0:
700; GFX11:       ; %bb.0:
701; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
702; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
703; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
704; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
705; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
706; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
707; GFX11-NEXT:    s_movk_i32 s2, 0x3e7
708; GFX11-NEXT:    s_waitcnt vmcnt(0)
709; GFX11-NEXT:    v_bfi_b32 v1, 0xffff, s2, v1
710; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
711; GFX11-NEXT:    s_endpgm
712  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
713  %tid.ext = sext i32 %tid to i64
714  %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
715  %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
716  %vec = load <2 x i16>, ptr addrspace(1) %in.gep
717  %vecins = insertelement <2 x i16> %vec, i16 999, i32 0
718  store <2 x i16> %vecins, ptr addrspace(1) %out.gep
719  ret void
720}
721
; Per-workitem variant of the "upper half of a 32-bit argument into element 0"
; case: the workitem id (sign-extended) indexes both %in and %out. Each
; workitem loads its <2 x i16> from addrspace(1), replaces element 0 with
; the upper 16 bits of the kernel argument %elt.arg (lshr 16, trunc to i16)
; and stores the vector to the matching slot of %out.
722define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %elt.arg) #0 {
723; GFX9-LABEL: v_insertelement_v2i16_0_reghi:
724; GFX9:       ; %bb.0:
725; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
726; GFX9-NEXT:    s_load_dword s4, s[8:9], 0x10
727; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
728; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7060302
729; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
730; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
731; GFX9-NEXT:    s_waitcnt vmcnt(0)
732; GFX9-NEXT:    v_perm_b32 v1, v1, s4, v2
733; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
734; GFX9-NEXT:    s_endpgm
735;
736; VI-LABEL: v_insertelement_v2i16_0_reghi:
737; VI:       ; %bb.0:
738; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
739; VI-NEXT:    s_load_dword s4, s[8:9], 0x10
740; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
741; VI-NEXT:    s_waitcnt lgkmcnt(0)
742; VI-NEXT:    v_mov_b32_e32 v1, s3
743; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
744; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
745; VI-NEXT:    flat_load_dword v3, v[0:1]
746; VI-NEXT:    v_mov_b32_e32 v1, s1
747; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
748; VI-NEXT:    v_mov_b32_e32 v2, 0x3020706
749; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
750; VI-NEXT:    s_waitcnt vmcnt(0)
751; VI-NEXT:    v_perm_b32 v2, s4, v3, v2
752; VI-NEXT:    flat_store_dword v[0:1], v2
753; VI-NEXT:    s_endpgm
754;
755; CI-LABEL: v_insertelement_v2i16_0_reghi:
756; CI:       ; %bb.0:
757; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
758; CI-NEXT:    s_load_dword s4, s[8:9], 0x4
759; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
760; CI-NEXT:    s_waitcnt lgkmcnt(0)
761; CI-NEXT:    v_mov_b32_e32 v1, s3
762; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
763; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
764; CI-NEXT:    flat_load_dword v3, v[0:1]
765; CI-NEXT:    v_mov_b32_e32 v1, s1
766; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
767; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
768; CI-NEXT:    s_waitcnt vmcnt(0)
769; CI-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
770; CI-NEXT:    v_alignbit_b32 v2, v2, s4, 16
771; CI-NEXT:    flat_store_dword v[0:1], v2
772; CI-NEXT:    s_endpgm
773;
774; GFX11-LABEL: v_insertelement_v2i16_0_reghi:
775; GFX11:       ; %bb.0:
776; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
777; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
778; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x10
779; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
780; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
781; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
782; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
783; GFX11-NEXT:    s_waitcnt vmcnt(0)
784; GFX11-NEXT:    v_perm_b32 v1, v1, s4, 0x7060302
785; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
786; GFX11-NEXT:    s_endpgm
787  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
788  %tid.ext = sext i32 %tid to i64
789  %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
790  %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
791  %vec = load <2 x i16>, ptr addrspace(1) %in.gep
792  %elt.hi = lshr i32 %elt.arg, 16
793  %elt = trunc i32 %elt.hi to i16
794  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
795  store <2 x i16> %vecins, ptr addrspace(1) %out.gep
796  ret void
797}
798
; Per-work-item test: load a <2 x i16> from %in, replace element 0 with the
; constant 53, and store the result to %out (both indexed by workitem.id.x).
; In the checked output, gfx900 and gfx1100 perform the insert with a single
; v_bfi_b32 using a 0xffff mask, while fiji and hawaii mask the loaded dword
; with 0xffff0000 and OR in 53. The value 53 is used directly as an
; instruction operand on every target.
799define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
800; GFX9-LABEL: v_insertelement_v2i16_0_inlineimm:
801; GFX9:       ; %bb.0:
802; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
803; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
804; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
805; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
806; GFX9-NEXT:    s_mov_b32 s2, 0xffff
807; GFX9-NEXT:    s_waitcnt vmcnt(0)
808; GFX9-NEXT:    v_bfi_b32 v1, s2, 53, v1
809; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
810; GFX9-NEXT:    s_endpgm
811;
812; VI-LABEL: v_insertelement_v2i16_0_inlineimm:
813; VI:       ; %bb.0:
814; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
815; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
816; VI-NEXT:    s_waitcnt lgkmcnt(0)
817; VI-NEXT:    v_mov_b32_e32 v1, s3
818; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
819; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
820; VI-NEXT:    flat_load_dword v3, v[0:1]
821; VI-NEXT:    v_mov_b32_e32 v1, s1
822; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
823; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
824; VI-NEXT:    s_waitcnt vmcnt(0)
825; VI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
826; VI-NEXT:    v_or_b32_e32 v2, 53, v2
827; VI-NEXT:    flat_store_dword v[0:1], v2
828; VI-NEXT:    s_endpgm
829;
830; CI-LABEL: v_insertelement_v2i16_0_inlineimm:
831; CI:       ; %bb.0:
832; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
833; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
834; CI-NEXT:    s_waitcnt lgkmcnt(0)
835; CI-NEXT:    v_mov_b32_e32 v1, s3
836; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
837; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
838; CI-NEXT:    flat_load_dword v3, v[0:1]
839; CI-NEXT:    v_mov_b32_e32 v1, s1
840; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
841; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
842; CI-NEXT:    s_waitcnt vmcnt(0)
843; CI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
844; CI-NEXT:    v_or_b32_e32 v2, 53, v2
845; CI-NEXT:    flat_store_dword v[0:1], v2
846; CI-NEXT:    s_endpgm
847;
848; GFX11-LABEL: v_insertelement_v2i16_0_inlineimm:
849; GFX11:       ; %bb.0:
850; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
851; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
852; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
853; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
854; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
855; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
856; GFX11-NEXT:    s_waitcnt vmcnt(0)
857; GFX11-NEXT:    v_bfi_b32 v1, 0xffff, 53, v1
858; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
859; GFX11-NEXT:    s_endpgm
; IR under test: the work-item id is sign-extended to i64 and used as the
; element index for both the input and the output pointer.
860  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
861  %tid.ext = sext i32 %tid to i64
862  %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
863  %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
864  %vec = load <2 x i16>, ptr addrspace(1) %in.gep
865  %vecins = insertelement <2 x i16> %vec, i16 53, i32 0
866  store <2 x i16> %vecins, ptr addrspace(1) %out.gep
867  ret void
868}
869
870; FIXME: fold lshl_or c0, c1, v0 -> or (c0 << c1), v0
; Per-work-item test: load a <2 x i16> from %in, replace element 1 with the
; constant 999 (0x3e7), and store the result to %out.
; In the checked output, gfx900 and gfx1100 materialize 0x3e7 in an SGPR and
; combine it with the loaded dword via v_perm_b32 with selector 0x5040100.
; fiji ORs the low word of the load (SDWA WORD_0) with 0x3e70000, and hawaii
; masks with 0xffff before ORing in 0x3e70000.
871define amdgpu_kernel void @v_insertelement_v2i16_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
872; GFX9-LABEL: v_insertelement_v2i16_1:
873; GFX9:       ; %bb.0:
874; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
875; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
876; GFX9-NEXT:    v_mov_b32_e32 v2, 0x5040100
877; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
878; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
879; GFX9-NEXT:    s_movk_i32 s2, 0x3e7
880; GFX9-NEXT:    s_waitcnt vmcnt(0)
881; GFX9-NEXT:    v_perm_b32 v1, s2, v1, v2
882; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
883; GFX9-NEXT:    s_endpgm
884;
885; VI-LABEL: v_insertelement_v2i16_1:
886; VI:       ; %bb.0:
887; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
888; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
889; VI-NEXT:    s_waitcnt lgkmcnt(0)
890; VI-NEXT:    v_mov_b32_e32 v1, s3
891; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
892; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
893; VI-NEXT:    flat_load_dword v3, v[0:1]
894; VI-NEXT:    v_mov_b32_e32 v1, s1
895; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
896; VI-NEXT:    v_mov_b32_e32 v2, 0x3e70000
897; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
898; VI-NEXT:    s_waitcnt vmcnt(0)
899; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
900; VI-NEXT:    flat_store_dword v[0:1], v2
901; VI-NEXT:    s_endpgm
902;
903; CI-LABEL: v_insertelement_v2i16_1:
904; CI:       ; %bb.0:
905; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
906; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
907; CI-NEXT:    s_waitcnt lgkmcnt(0)
908; CI-NEXT:    v_mov_b32_e32 v1, s3
909; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
910; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
911; CI-NEXT:    flat_load_dword v3, v[0:1]
912; CI-NEXT:    v_mov_b32_e32 v1, s1
913; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
914; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
915; CI-NEXT:    s_waitcnt vmcnt(0)
916; CI-NEXT:    v_and_b32_e32 v2, 0xffff, v3
917; CI-NEXT:    v_or_b32_e32 v2, 0x3e70000, v2
918; CI-NEXT:    flat_store_dword v[0:1], v2
919; CI-NEXT:    s_endpgm
920;
921; GFX11-LABEL: v_insertelement_v2i16_1:
922; GFX11:       ; %bb.0:
923; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
924; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
925; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
926; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
927; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
928; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
929; GFX11-NEXT:    s_movk_i32 s2, 0x3e7
930; GFX11-NEXT:    s_waitcnt vmcnt(0)
931; GFX11-NEXT:    v_perm_b32 v1, s2, v1, 0x5040100
932; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
933; GFX11-NEXT:    s_endpgm
; IR under test: the work-item id is sign-extended to i64 and used as the
; element index for both the input and the output pointer.
934  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
935  %tid.ext = sext i32 %tid to i64
936  %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
937  %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
938  %vec = load <2 x i16>, ptr addrspace(1) %in.gep
939  %vecins = insertelement <2 x i16> %vec, i16 999, i32 1
940  store <2 x i16> %vecins, ptr addrspace(1) %out.gep
941  ret void
942}
943
; Per-work-item test: load a <2 x i16> from %in, replace element 1 with the
; constant -15, and store the result to %out.
; In the checked output, gfx900 and gfx1100 pass -15 directly as the first
; source of v_perm_b32 (selector 0x5040100), so no separate move of the
; constant is emitted. fiji and hawaii instead OR the low word of the load
; with the pre-shifted constant 0xfff10000.
944define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
945; GFX9-LABEL: v_insertelement_v2i16_1_inlineimm:
946; GFX9:       ; %bb.0:
947; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
948; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
949; GFX9-NEXT:    v_mov_b32_e32 v2, 0x5040100
950; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
951; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
952; GFX9-NEXT:    s_waitcnt vmcnt(0)
953; GFX9-NEXT:    v_perm_b32 v1, -15, v1, v2
954; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
955; GFX9-NEXT:    s_endpgm
956;
957; VI-LABEL: v_insertelement_v2i16_1_inlineimm:
958; VI:       ; %bb.0:
959; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
960; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
961; VI-NEXT:    s_waitcnt lgkmcnt(0)
962; VI-NEXT:    v_mov_b32_e32 v1, s3
963; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
964; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
965; VI-NEXT:    flat_load_dword v3, v[0:1]
966; VI-NEXT:    v_mov_b32_e32 v1, s1
967; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
968; VI-NEXT:    v_mov_b32_e32 v2, 0xfff10000
969; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
970; VI-NEXT:    s_waitcnt vmcnt(0)
971; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
972; VI-NEXT:    flat_store_dword v[0:1], v2
973; VI-NEXT:    s_endpgm
974;
975; CI-LABEL: v_insertelement_v2i16_1_inlineimm:
976; CI:       ; %bb.0:
977; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
978; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
979; CI-NEXT:    s_waitcnt lgkmcnt(0)
980; CI-NEXT:    v_mov_b32_e32 v1, s3
981; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
982; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
983; CI-NEXT:    flat_load_dword v3, v[0:1]
984; CI-NEXT:    v_mov_b32_e32 v1, s1
985; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
986; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
987; CI-NEXT:    s_waitcnt vmcnt(0)
988; CI-NEXT:    v_and_b32_e32 v2, 0xffff, v3
989; CI-NEXT:    v_or_b32_e32 v2, 0xfff10000, v2
990; CI-NEXT:    flat_store_dword v[0:1], v2
991; CI-NEXT:    s_endpgm
992;
993; GFX11-LABEL: v_insertelement_v2i16_1_inlineimm:
994; GFX11:       ; %bb.0:
995; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
996; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
997; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
998; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
999; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1000; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
1001; GFX11-NEXT:    s_waitcnt vmcnt(0)
1002; GFX11-NEXT:    v_perm_b32 v1, -15, v1, 0x5040100
1003; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1004; GFX11-NEXT:    s_endpgm
; IR under test: the work-item id is sign-extended to i64 and used as the
; element index for both the input and the output pointer.
1005  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1006  %tid.ext = sext i32 %tid to i64
1007  %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
1008  %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
1009  %vec = load <2 x i16>, ptr addrspace(1) %in.gep
1010  %vecins = insertelement <2 x i16> %vec, i16 -15, i32 1
1011  store <2 x i16> %vecins, ptr addrspace(1) %out.gep
1012  ret void
1013}
1014
; Per-work-item test: load a <2 x half> from %in, replace element 0 with
; half 5.0, and store the result to %out. The constant shows up as the bit
; pattern 0x4500 in every checked output.
; gfx900 and gfx1100 insert it with v_bfi_b32 under a 0xffff mask; fiji and
; hawaii mask the loaded dword with 0xffff0000 and OR in 0x4500.
1015define amdgpu_kernel void @v_insertelement_v2f16_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1016; GFX9-LABEL: v_insertelement_v2f16_0:
1017; GFX9:       ; %bb.0:
1018; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1019; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1020; GFX9-NEXT:    v_mov_b32_e32 v2, 0x4500
1021; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1022; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
1023; GFX9-NEXT:    s_mov_b32 s2, 0xffff
1024; GFX9-NEXT:    s_waitcnt vmcnt(0)
1025; GFX9-NEXT:    v_bfi_b32 v1, s2, v2, v1
1026; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1027; GFX9-NEXT:    s_endpgm
1028;
1029; VI-LABEL: v_insertelement_v2f16_0:
1030; VI:       ; %bb.0:
1031; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1032; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1033; VI-NEXT:    s_waitcnt lgkmcnt(0)
1034; VI-NEXT:    v_mov_b32_e32 v1, s3
1035; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1036; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1037; VI-NEXT:    flat_load_dword v3, v[0:1]
1038; VI-NEXT:    v_mov_b32_e32 v1, s1
1039; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1040; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1041; VI-NEXT:    s_waitcnt vmcnt(0)
1042; VI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
1043; VI-NEXT:    v_or_b32_e32 v2, 0x4500, v2
1044; VI-NEXT:    flat_store_dword v[0:1], v2
1045; VI-NEXT:    s_endpgm
1046;
1047; CI-LABEL: v_insertelement_v2f16_0:
1048; CI:       ; %bb.0:
1049; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1050; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1051; CI-NEXT:    s_waitcnt lgkmcnt(0)
1052; CI-NEXT:    v_mov_b32_e32 v1, s3
1053; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
1054; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1055; CI-NEXT:    flat_load_dword v3, v[0:1]
1056; CI-NEXT:    v_mov_b32_e32 v1, s1
1057; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
1058; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1059; CI-NEXT:    s_waitcnt vmcnt(0)
1060; CI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
1061; CI-NEXT:    v_or_b32_e32 v2, 0x4500, v2
1062; CI-NEXT:    flat_store_dword v[0:1], v2
1063; CI-NEXT:    s_endpgm
1064;
1065; GFX11-LABEL: v_insertelement_v2f16_0:
1066; GFX11:       ; %bb.0:
1067; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
1068; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1069; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1070; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1071; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1072; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
1073; GFX11-NEXT:    s_movk_i32 s2, 0x4500
1074; GFX11-NEXT:    s_waitcnt vmcnt(0)
1075; GFX11-NEXT:    v_bfi_b32 v1, 0xffff, s2, v1
1076; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1077; GFX11-NEXT:    s_endpgm
; IR under test: the work-item id is sign-extended to i64 and used as the
; element index for both the input and the output pointer.
1078  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1079  %tid.ext = sext i32 %tid to i64
1080  %in.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i64 %tid.ext
1081  %out.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %out, i64 %tid.ext
1082  %vec = load <2 x half>, ptr addrspace(1) %in.gep
1083  %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 0
1084  store <2 x half> %vecins, ptr addrspace(1) %out.gep
1085  ret void
1086}
1087
; Per-work-item test: load a <2 x half> from %in, replace element 0 with the
; half whose bit pattern is 0xH0035, and store the result to %out.
; The constant appears as the integer operand 53 (0x35) in every checked
; output. gfx900 and gfx1100 use v_bfi_b32 with a 0xffff mask; fiji and hawaii
; mask with 0xffff0000 and OR in 53.
1088define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1089; GFX9-LABEL: v_insertelement_v2f16_0_inlineimm:
1090; GFX9:       ; %bb.0:
1091; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1092; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1093; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1094; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
1095; GFX9-NEXT:    s_mov_b32 s2, 0xffff
1096; GFX9-NEXT:    s_waitcnt vmcnt(0)
1097; GFX9-NEXT:    v_bfi_b32 v1, s2, 53, v1
1098; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1099; GFX9-NEXT:    s_endpgm
1100;
1101; VI-LABEL: v_insertelement_v2f16_0_inlineimm:
1102; VI:       ; %bb.0:
1103; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1104; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1105; VI-NEXT:    s_waitcnt lgkmcnt(0)
1106; VI-NEXT:    v_mov_b32_e32 v1, s3
1107; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1108; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1109; VI-NEXT:    flat_load_dword v3, v[0:1]
1110; VI-NEXT:    v_mov_b32_e32 v1, s1
1111; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1112; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1113; VI-NEXT:    s_waitcnt vmcnt(0)
1114; VI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
1115; VI-NEXT:    v_or_b32_e32 v2, 53, v2
1116; VI-NEXT:    flat_store_dword v[0:1], v2
1117; VI-NEXT:    s_endpgm
1118;
1119; CI-LABEL: v_insertelement_v2f16_0_inlineimm:
1120; CI:       ; %bb.0:
1121; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1122; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1123; CI-NEXT:    s_waitcnt lgkmcnt(0)
1124; CI-NEXT:    v_mov_b32_e32 v1, s3
1125; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
1126; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1127; CI-NEXT:    flat_load_dword v3, v[0:1]
1128; CI-NEXT:    v_mov_b32_e32 v1, s1
1129; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
1130; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1131; CI-NEXT:    s_waitcnt vmcnt(0)
1132; CI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
1133; CI-NEXT:    v_or_b32_e32 v2, 53, v2
1134; CI-NEXT:    flat_store_dword v[0:1], v2
1135; CI-NEXT:    s_endpgm
1136;
1137; GFX11-LABEL: v_insertelement_v2f16_0_inlineimm:
1138; GFX11:       ; %bb.0:
1139; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
1140; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1141; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1142; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1143; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1144; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
1145; GFX11-NEXT:    s_waitcnt vmcnt(0)
1146; GFX11-NEXT:    v_bfi_b32 v1, 0xffff, 53, v1
1147; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1148; GFX11-NEXT:    s_endpgm
; IR under test: the work-item id is sign-extended to i64 and used as the
; element index for both the input and the output pointer.
1149  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1150  %tid.ext = sext i32 %tid to i64
1151  %in.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i64 %tid.ext
1152  %out.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %out, i64 %tid.ext
1153  %vec = load <2 x half>, ptr addrspace(1) %in.gep
1154  %vecins = insertelement <2 x half> %vec, half 0xH0035, i32 0
1155  store <2 x half> %vecins, ptr addrspace(1) %out.gep
1156  ret void
1157}
1158
; Per-work-item test: load a <2 x half> from %in, replace element 1 with
; half 5.0, and store the result to %out.
; In the checked output, gfx900 and gfx1100 materialize 0x4500 in an SGPR and
; merge it via v_perm_b32 with selector 0x5040100. fiji ORs the low word of
; the load (SDWA WORD_0) with 0x45000000, and hawaii masks with 0xffff before
; ORing in 0x45000000.
1159define amdgpu_kernel void @v_insertelement_v2f16_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1160; GFX9-LABEL: v_insertelement_v2f16_1:
1161; GFX9:       ; %bb.0:
1162; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1163; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1164; GFX9-NEXT:    v_mov_b32_e32 v2, 0x5040100
1165; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1166; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
1167; GFX9-NEXT:    s_movk_i32 s2, 0x4500
1168; GFX9-NEXT:    s_waitcnt vmcnt(0)
1169; GFX9-NEXT:    v_perm_b32 v1, s2, v1, v2
1170; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1171; GFX9-NEXT:    s_endpgm
1172;
1173; VI-LABEL: v_insertelement_v2f16_1:
1174; VI:       ; %bb.0:
1175; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1176; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1177; VI-NEXT:    s_waitcnt lgkmcnt(0)
1178; VI-NEXT:    v_mov_b32_e32 v1, s3
1179; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1180; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1181; VI-NEXT:    flat_load_dword v3, v[0:1]
1182; VI-NEXT:    v_mov_b32_e32 v1, s1
1183; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1184; VI-NEXT:    v_mov_b32_e32 v2, 0x45000000
1185; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1186; VI-NEXT:    s_waitcnt vmcnt(0)
1187; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1188; VI-NEXT:    flat_store_dword v[0:1], v2
1189; VI-NEXT:    s_endpgm
1190;
1191; CI-LABEL: v_insertelement_v2f16_1:
1192; CI:       ; %bb.0:
1193; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1194; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1195; CI-NEXT:    s_waitcnt lgkmcnt(0)
1196; CI-NEXT:    v_mov_b32_e32 v1, s3
1197; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
1198; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1199; CI-NEXT:    flat_load_dword v3, v[0:1]
1200; CI-NEXT:    v_mov_b32_e32 v1, s1
1201; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
1202; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1203; CI-NEXT:    s_waitcnt vmcnt(0)
1204; CI-NEXT:    v_and_b32_e32 v2, 0xffff, v3
1205; CI-NEXT:    v_or_b32_e32 v2, 0x45000000, v2
1206; CI-NEXT:    flat_store_dword v[0:1], v2
1207; CI-NEXT:    s_endpgm
1208;
1209; GFX11-LABEL: v_insertelement_v2f16_1:
1210; GFX11:       ; %bb.0:
1211; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
1212; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1213; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1214; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1215; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1216; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
1217; GFX11-NEXT:    s_movk_i32 s2, 0x4500
1218; GFX11-NEXT:    s_waitcnt vmcnt(0)
1219; GFX11-NEXT:    v_perm_b32 v1, s2, v1, 0x5040100
1220; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1221; GFX11-NEXT:    s_endpgm
; IR under test: the work-item id is sign-extended to i64 and used as the
; element index for both the input and the output pointer.
1222  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1223  %tid.ext = sext i32 %tid to i64
1224  %in.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i64 %tid.ext
1225  %out.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %out, i64 %tid.ext
1226  %vec = load <2 x half>, ptr addrspace(1) %in.gep
1227  %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 1
1228  store <2 x half> %vecins, ptr addrspace(1) %out.gep
1229  ret void
1230}
1231
; Per-work-item test: load a <2 x half> from %in, replace element 1 with the
; half whose bit pattern is 0xH0023, and store the result to %out.
; In the checked output, gfx900 and gfx1100 pass the constant directly as the
; operand 35 (0x23) to v_perm_b32 with selector 0x5040100. fiji and hawaii OR
; the low word of the load with the pre-shifted constant 0x230000.
1232define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1233; GFX9-LABEL: v_insertelement_v2f16_1_inlineimm:
1234; GFX9:       ; %bb.0:
1235; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1236; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1237; GFX9-NEXT:    v_mov_b32_e32 v2, 0x5040100
1238; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1239; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
1240; GFX9-NEXT:    s_waitcnt vmcnt(0)
1241; GFX9-NEXT:    v_perm_b32 v1, 35, v1, v2
1242; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1243; GFX9-NEXT:    s_endpgm
1244;
1245; VI-LABEL: v_insertelement_v2f16_1_inlineimm:
1246; VI:       ; %bb.0:
1247; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1248; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1249; VI-NEXT:    s_waitcnt lgkmcnt(0)
1250; VI-NEXT:    v_mov_b32_e32 v1, s3
1251; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1252; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1253; VI-NEXT:    flat_load_dword v3, v[0:1]
1254; VI-NEXT:    v_mov_b32_e32 v1, s1
1255; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1256; VI-NEXT:    v_mov_b32_e32 v2, 0x230000
1257; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1258; VI-NEXT:    s_waitcnt vmcnt(0)
1259; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1260; VI-NEXT:    flat_store_dword v[0:1], v2
1261; VI-NEXT:    s_endpgm
1262;
1263; CI-LABEL: v_insertelement_v2f16_1_inlineimm:
1264; CI:       ; %bb.0:
1265; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1266; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1267; CI-NEXT:    s_waitcnt lgkmcnt(0)
1268; CI-NEXT:    v_mov_b32_e32 v1, s3
1269; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
1270; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1271; CI-NEXT:    flat_load_dword v3, v[0:1]
1272; CI-NEXT:    v_mov_b32_e32 v1, s1
1273; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
1274; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1275; CI-NEXT:    s_waitcnt vmcnt(0)
1276; CI-NEXT:    v_and_b32_e32 v2, 0xffff, v3
1277; CI-NEXT:    v_or_b32_e32 v2, 0x230000, v2
1278; CI-NEXT:    flat_store_dword v[0:1], v2
1279; CI-NEXT:    s_endpgm
1280;
1281; GFX11-LABEL: v_insertelement_v2f16_1_inlineimm:
1282; GFX11:       ; %bb.0:
1283; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
1284; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1285; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1286; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1287; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1288; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
1289; GFX11-NEXT:    s_waitcnt vmcnt(0)
1290; GFX11-NEXT:    v_perm_b32 v1, 35, v1, 0x5040100
1291; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1292; GFX11-NEXT:    s_endpgm
; IR under test: the work-item id is sign-extended to i64 and used as the
; element index for both the input and the output pointer.
1293  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1294  %tid.ext = sext i32 %tid to i64
1295  %in.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i64 %tid.ext
1296  %out.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %out, i64 %tid.ext
1297  %vec = load <2 x half>, ptr addrspace(1) %in.gep
1298  %vecins = insertelement <2 x half> %vec, half 0xH0023, i32 1
1299  store <2 x half> %vecins, ptr addrspace(1) %out.gep
1300  ret void
1301}
1302
1303; FIXME: Enable for others when argument load not split
; Dynamic-index test with no per-work-item addressing: the index is read
; through a volatile load from %idx.ptr, the <2 x i16> vector is loaded from
; %vec.ptr (both addrspace(4)), the constant 999 is inserted at that index,
; and the result is stored to %out.
; Every checked target lowers the insert to scalar bit operations: the index
; is shifted left by 4, 0xffff is shifted left by that amount to form a lane
; mask, the mask clears the selected lane of the loaded dword (andn2 /
; and_not1), and the mask ANDed with 0x3e703e7 is ORed back in.
1304define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, ptr addrspace(4) %idx.ptr) #0 {
1305; GFX9-LABEL: s_insertelement_v2i16_dynamic:
1306; GFX9:       ; %bb.0:
1307; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
1308; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1309; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1310; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1311; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x0
1312; GFX9-NEXT:    s_load_dword s7, s[2:3], 0x0
1313; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1314; GFX9-NEXT:    s_lshl_b32 s2, s6, 4
1315; GFX9-NEXT:    s_lshl_b32 s2, 0xffff, s2
1316; GFX9-NEXT:    s_andn2_b32 s3, s7, s2
1317; GFX9-NEXT:    s_and_b32 s2, s2, 0x3e703e7
1318; GFX9-NEXT:    s_or_b32 s2, s2, s3
1319; GFX9-NEXT:    v_mov_b32_e32 v1, s2
1320; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1321; GFX9-NEXT:    s_endpgm
1322;
1323; VI-LABEL: s_insertelement_v2i16_dynamic:
1324; VI:       ; %bb.0:
1325; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
1326; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1327; VI-NEXT:    s_waitcnt lgkmcnt(0)
1328; VI-NEXT:    s_load_dword s4, s[4:5], 0x0
1329; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
1330; VI-NEXT:    v_mov_b32_e32 v0, s0
1331; VI-NEXT:    v_mov_b32_e32 v1, s1
1332; VI-NEXT:    s_waitcnt lgkmcnt(0)
1333; VI-NEXT:    s_lshl_b32 s0, s4, 4
1334; VI-NEXT:    s_lshl_b32 s0, 0xffff, s0
1335; VI-NEXT:    s_andn2_b32 s1, s2, s0
1336; VI-NEXT:    s_and_b32 s0, s0, 0x3e703e7
1337; VI-NEXT:    s_or_b32 s0, s0, s1
1338; VI-NEXT:    v_mov_b32_e32 v2, s0
1339; VI-NEXT:    flat_store_dword v[0:1], v2
1340; VI-NEXT:    s_endpgm
1341;
1342; CI-LABEL: s_insertelement_v2i16_dynamic:
1343; CI:       ; %bb.0:
1344; CI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x4
1345; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1346; CI-NEXT:    s_waitcnt lgkmcnt(0)
1347; CI-NEXT:    s_load_dword s4, s[4:5], 0x0
1348; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
1349; CI-NEXT:    v_mov_b32_e32 v0, s0
1350; CI-NEXT:    v_mov_b32_e32 v1, s1
1351; CI-NEXT:    s_waitcnt lgkmcnt(0)
1352; CI-NEXT:    s_lshl_b32 s0, s4, 4
1353; CI-NEXT:    s_lshl_b32 s0, 0xffff, s0
1354; CI-NEXT:    s_andn2_b32 s1, s2, s0
1355; CI-NEXT:    s_and_b32 s0, s0, 0x3e703e7
1356; CI-NEXT:    s_or_b32 s0, s0, s1
1357; CI-NEXT:    v_mov_b32_e32 v2, s0
1358; CI-NEXT:    flat_store_dword v[0:1], v2
1359; CI-NEXT:    s_endpgm
1360;
1361; GFX11-LABEL: s_insertelement_v2i16_dynamic:
1362; GFX11:       ; %bb.0:
1363; GFX11-NEXT:    s_clause 0x1
1364; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x10
1365; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
1366; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1367; GFX11-NEXT:    s_load_b32 s4, s[6:7], 0x0
1368; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
1369; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1370; GFX11-NEXT:    s_lshl_b32 s3, s4, 4
1371; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
1372; GFX11-NEXT:    s_lshl_b32 s3, 0xffff, s3
1373; GFX11-NEXT:    s_and_not1_b32 s2, s2, s3
1374; GFX11-NEXT:    s_and_b32 s3, s3, 0x3e703e7
1375; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
1376; GFX11-NEXT:    s_or_b32 s2, s3, s2
1377; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
1378; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1379; GFX11-NEXT:    s_endpgm
; IR under test: the index load is marked volatile and is issued before the
; vector load.
1380  %idx = load volatile i32, ptr addrspace(4) %idx.ptr
1381  %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
1382  %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx
1383  store <2 x i16> %vecins, ptr addrspace(1) %out
1384  ret void
1385}
1386
; Dynamic-index test where the index %idx is an i32 kernel argument and the
; vector is loaded per work-item from %in (indexed by workitem.id.x). The
; constant 999 is inserted at %idx and the result is stored to %out.
; Every checked target builds the lane mask on the scalar unit (index shifted
; left by 4, then 0xffff shifted left by that amount) and applies it with one
; v_bfi_b32 that selects between 0x3e703e7 and the loaded dword.
1387define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %idx) #0 {
1388; GFX9-LABEL: v_insertelement_v2i16_dynamic_sgpr:
1389; GFX9:       ; %bb.0:
1390; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1391; GFX9-NEXT:    s_load_dword s4, s[8:9], 0x10
1392; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1393; GFX9-NEXT:    v_mov_b32_e32 v2, 0x3e703e7
1394; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1395; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
1396; GFX9-NEXT:    s_lshl_b32 s2, s4, 4
1397; GFX9-NEXT:    s_lshl_b32 s2, 0xffff, s2
1398; GFX9-NEXT:    s_waitcnt vmcnt(0)
1399; GFX9-NEXT:    v_bfi_b32 v1, s2, v2, v1
1400; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1401; GFX9-NEXT:    s_endpgm
1402;
1403; VI-LABEL: v_insertelement_v2i16_dynamic_sgpr:
1404; VI:       ; %bb.0:
1405; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1406; VI-NEXT:    s_load_dword s4, s[8:9], 0x10
1407; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1408; VI-NEXT:    s_waitcnt lgkmcnt(0)
1409; VI-NEXT:    v_mov_b32_e32 v1, s3
1410; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1411; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1412; VI-NEXT:    flat_load_dword v3, v[0:1]
1413; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1414; VI-NEXT:    s_lshl_b32 s0, s4, 4
1415; VI-NEXT:    v_mov_b32_e32 v1, s1
1416; VI-NEXT:    s_lshl_b32 s0, 0xffff, s0
1417; VI-NEXT:    v_mov_b32_e32 v2, 0x3e703e7
1418; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1419; VI-NEXT:    s_waitcnt vmcnt(0)
1420; VI-NEXT:    v_bfi_b32 v2, s0, v2, v3
1421; VI-NEXT:    flat_store_dword v[0:1], v2
1422; VI-NEXT:    s_endpgm
1423;
1424; CI-LABEL: v_insertelement_v2i16_dynamic_sgpr:
1425; CI:       ; %bb.0:
1426; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1427; CI-NEXT:    s_load_dword s4, s[8:9], 0x4
1428; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1429; CI-NEXT:    s_waitcnt lgkmcnt(0)
1430; CI-NEXT:    v_mov_b32_e32 v1, s3
1431; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
1432; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1433; CI-NEXT:    flat_load_dword v3, v[0:1]
1434; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
1435; CI-NEXT:    s_lshl_b32 s0, s4, 4
1436; CI-NEXT:    v_mov_b32_e32 v1, s1
1437; CI-NEXT:    s_lshl_b32 s0, 0xffff, s0
1438; CI-NEXT:    v_mov_b32_e32 v2, 0x3e703e7
1439; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1440; CI-NEXT:    s_waitcnt vmcnt(0)
1441; CI-NEXT:    v_bfi_b32 v2, s0, v2, v3
1442; CI-NEXT:    flat_store_dword v[0:1], v2
1443; CI-NEXT:    s_endpgm
1444;
1445; GFX11-LABEL: v_insertelement_v2i16_dynamic_sgpr:
1446; GFX11:       ; %bb.0:
1447; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
1448; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1449; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x10
1450; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
1451; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1452; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1453; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
1454; GFX11-NEXT:    s_lshl_b32 s2, s4, 4
1455; GFX11-NEXT:    s_lshl_b32 s2, 0xffff, s2
1456; GFX11-NEXT:    s_waitcnt vmcnt(0)
1457; GFX11-NEXT:    v_bfi_b32 v1, s2, 0x3e703e7, v1
1458; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1459; GFX11-NEXT:    s_endpgm
; IR under test: the work-item id is sign-extended to i64 and used as the
; element index for both pointers; the insert position comes from %idx.
1460  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1461  %tid.ext = sext i32 %tid to i64
1462  %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
1463  %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
1464  %vec = load <2 x i16>, ptr addrspace(1) %in.gep
1465  %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx
1466  store <2 x i16> %vecins, ptr addrspace(1) %out.gep
1467  ret void
1468}
1469
; Dynamic-index insert of the constant half 0xH1234 into a <2 x half>.
; The index is loaded from %idx.ptr at an address derived from the workitem id,
; so each work-item reads its own index. In the checked code the loaded index is
; shifted left by 4 (index * 16 bits), a 0xffff mask is shifted by that amount,
; and v_bfi_b32 selects between the splat constant 0x12341234 and the loaded
; vector dword.
1470define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(1) %idx.ptr) #0 {
1471; GFX9-LABEL: v_insertelement_v2f16_dynamic_vgpr:
1472; GFX9:       ; %bb.0:
1473; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
1474; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1475; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1476; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1477; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
1478; GFX9-NEXT:    global_load_dword v2, v0, s[2:3]
1479; GFX9-NEXT:    s_mov_b32 s2, 0xffff
1480; GFX9-NEXT:    s_waitcnt vmcnt(1)
1481; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
1482; GFX9-NEXT:    v_lshlrev_b32_e64 v1, v1, s2
1483; GFX9-NEXT:    s_mov_b32 s2, 0x12341234
1484; GFX9-NEXT:    s_waitcnt vmcnt(0)
1485; GFX9-NEXT:    v_bfi_b32 v1, v1, s2, v2
1486; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1487; GFX9-NEXT:    s_endpgm
1488;
1489; VI-LABEL: v_insertelement_v2f16_dynamic_vgpr:
1490; VI:       ; %bb.0:
1491; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1492; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
1493; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1494; VI-NEXT:    s_waitcnt lgkmcnt(0)
1495; VI-NEXT:    v_mov_b32_e32 v3, s3
1496; VI-NEXT:    v_mov_b32_e32 v1, s5
1497; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
1498; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1499; VI-NEXT:    flat_load_dword v4, v[0:1]
1500; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1501; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
1502; VI-NEXT:    flat_load_dword v3, v[0:1]
1503; VI-NEXT:    s_mov_b32 s2, 0xffff
1504; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1505; VI-NEXT:    v_mov_b32_e32 v1, s1
1506; VI-NEXT:    s_mov_b32 s0, 0x12341234
1507; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1508; VI-NEXT:    s_waitcnt vmcnt(1)
1509; VI-NEXT:    v_lshlrev_b32_e32 v2, 4, v4
1510; VI-NEXT:    v_lshlrev_b32_e64 v2, v2, s2
1511; VI-NEXT:    s_waitcnt vmcnt(0)
1512; VI-NEXT:    v_bfi_b32 v2, v2, s0, v3
1513; VI-NEXT:    flat_store_dword v[0:1], v2
1514; VI-NEXT:    s_endpgm
1515;
1516; CI-LABEL: v_insertelement_v2f16_dynamic_vgpr:
1517; CI:       ; %bb.0:
1518; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1519; CI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x4
1520; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1521; CI-NEXT:    s_waitcnt lgkmcnt(0)
1522; CI-NEXT:    v_mov_b32_e32 v3, s3
1523; CI-NEXT:    v_mov_b32_e32 v1, s5
1524; CI-NEXT:    v_add_i32_e32 v0, vcc, s4, v2
1525; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1526; CI-NEXT:    flat_load_dword v4, v[0:1]
1527; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
1528; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
1529; CI-NEXT:    flat_load_dword v3, v[0:1]
1530; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
1531; CI-NEXT:    v_mov_b32_e32 v1, s1
1532; CI-NEXT:    s_mov_b32 s0, 0x12341234
1533; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1534; CI-NEXT:    s_waitcnt vmcnt(1)
1535; CI-NEXT:    v_lshlrev_b32_e32 v2, 4, v4
1536; CI-NEXT:    v_lshl_b32_e32 v2, 0xffff, v2
1537; CI-NEXT:    s_waitcnt vmcnt(0)
1538; CI-NEXT:    v_bfi_b32 v2, v2, s0, v3
1539; CI-NEXT:    flat_store_dword v[0:1], v2
1540; CI-NEXT:    s_endpgm
1541;
1542; GFX11-LABEL: v_insertelement_v2f16_dynamic_vgpr:
1543; GFX11:       ; %bb.0:
1544; GFX11-NEXT:    s_clause 0x1
1545; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x10
1546; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
1547; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1548; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1549; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1550; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1551; GFX11-NEXT:    s_clause 0x1
1552; GFX11-NEXT:    global_load_b32 v1, v0, s[6:7]
1553; GFX11-NEXT:    global_load_b32 v2, v0, s[2:3]
1554; GFX11-NEXT:    s_waitcnt vmcnt(1)
1555; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
1556; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
1557; GFX11-NEXT:    v_lshlrev_b32_e64 v1, v1, 0xffff
1558; GFX11-NEXT:    s_waitcnt vmcnt(0)
1559; GFX11-NEXT:    v_bfi_b32 v1, v1, 0x12341234, v2
1560; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1561; GFX11-NEXT:    s_endpgm
  ; All three pointers are indexed by the sign-extended workitem id.
1562  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1563  %tid.ext = sext i32 %tid to i64
1564  %in.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i64 %tid.ext
1565  %idx.gep = getelementptr inbounds i32, ptr addrspace(1) %idx.ptr, i64 %tid.ext
1566  %out.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %out, i64 %tid.ext
  ; The insert index comes from memory rather than from a kernel argument.
1567  %idx = load i32, ptr addrspace(1) %idx.gep
1568  %vec = load <2 x half>, ptr addrspace(1) %in.gep
1569  %vecins = insertelement <2 x half> %vec, half 0xH1234, i32 %idx
1570  store <2 x half> %vecins, ptr addrspace(1) %out.gep
1571  ret void
1572}
1573
; Inserts the low 16 bits of %val (reinterpreted as half) into element 0 of a
; <4 x half> loaded from %in, then stores the vector to %out.
; The unnamed [8 x i32] argument sits between %in and %val; the checks load
; %val from kernarg offset 0x30 (0xc in the hawaii checks).
; In every checked variant only the first loaded dword (v0) is rewritten; the
; second dword (v1) is stored back unchanged.
1574define amdgpu_kernel void @v_insertelement_v4f16_0(ptr addrspace(1) %out, ptr addrspace(1) %in, [8 x i32], i32 %val) #0 {
1575; GFX9-LABEL: v_insertelement_v4f16_0:
1576; GFX9:       ; %bb.0:
1577; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1578; GFX9-NEXT:    s_load_dword s4, s[8:9], 0x30
1579; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1580; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1581; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
1582; GFX9-NEXT:    s_mov_b32 s2, 0xffff
1583; GFX9-NEXT:    v_mov_b32_e32 v3, s4
1584; GFX9-NEXT:    s_waitcnt vmcnt(0)
1585; GFX9-NEXT:    v_bfi_b32 v0, s2, v3, v0
1586; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1587; GFX9-NEXT:    s_endpgm
1588;
1589; VI-LABEL: v_insertelement_v4f16_0:
1590; VI:       ; %bb.0:
1591; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1592; VI-NEXT:    s_load_dword s4, s[8:9], 0x30
1593; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1594; VI-NEXT:    v_mov_b32_e32 v4, 0x3020504
1595; VI-NEXT:    s_waitcnt lgkmcnt(0)
1596; VI-NEXT:    v_mov_b32_e32 v1, s3
1597; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1598; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1599; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1600; VI-NEXT:    v_mov_b32_e32 v3, s1
1601; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
1602; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1603; VI-NEXT:    s_waitcnt vmcnt(0)
1604; VI-NEXT:    v_perm_b32 v0, s4, v0, v4
1605; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1606; VI-NEXT:    s_endpgm
1607;
1608; CI-LABEL: v_insertelement_v4f16_0:
1609; CI:       ; %bb.0:
1610; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1611; CI-NEXT:    s_load_dword s4, s[8:9], 0xc
1612; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1613; CI-NEXT:    s_waitcnt lgkmcnt(0)
1614; CI-NEXT:    v_mov_b32_e32 v1, s3
1615; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
1616; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1617; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1618; CI-NEXT:    v_mov_b32_e32 v3, s1
1619; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
1620; CI-NEXT:    s_mov_b32 s0, 0xffff
1621; CI-NEXT:    v_mov_b32_e32 v4, s4
1622; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1623; CI-NEXT:    s_waitcnt vmcnt(0)
1624; CI-NEXT:    v_bfi_b32 v0, s0, v4, v0
1625; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1626; CI-NEXT:    s_endpgm
1627;
1628; GFX11-LABEL: v_insertelement_v4f16_0:
1629; GFX11:       ; %bb.0:
1630; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
1631; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1632; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x30
1633; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1634; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1635; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1636; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
1637; GFX11-NEXT:    s_waitcnt vmcnt(0)
1638; GFX11-NEXT:    v_bfi_b32 v0, 0xffff, s4, v0
1639; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
1640; GFX11-NEXT:    s_endpgm
  ; Input and output vectors are addressed by the sign-extended workitem id.
1641  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1642  %tid.ext = sext i32 %tid to i64
1643  %in.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %in, i64 %tid.ext
1644  %out.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %out, i64 %tid.ext
1645  %vec = load <4 x half>, ptr addrspace(1) %in.gep
  ; The inserted half is the low 16 bits of the i32 kernel argument.
1646  %val.trunc = trunc i32 %val to i16
1647  %val.cvt = bitcast i16 %val.trunc to half
1648  %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 0
1649  store <4 x half> %vecins, ptr addrspace(1) %out.gep
1650  ret void
1651}
1652
; Inserts the low 16 bits of %val (reinterpreted as half) into element 1 of a
; <4 x half> loaded from %in, then stores the vector to %out.
; %val directly follows the two pointers, and the checks load it from kernarg
; offset 0x10 (0x4 in the hawaii checks).
; Only the first loaded dword (v0) is rewritten. The gfx900, fiji and gfx1100
; checks use v_perm_b32; the hawaii checks keep the low 16 bits of v0 and OR in
; %val shifted left by 16.
1653define amdgpu_kernel void @v_insertelement_v4f16_1(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) #0 {
1654; GFX9-LABEL: v_insertelement_v4f16_1:
1655; GFX9:       ; %bb.0:
1656; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1657; GFX9-NEXT:    s_load_dword s4, s[8:9], 0x10
1658; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1659; GFX9-NEXT:    v_mov_b32_e32 v3, 0x5040100
1660; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1661; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
1662; GFX9-NEXT:    s_waitcnt vmcnt(0)
1663; GFX9-NEXT:    v_perm_b32 v0, s4, v0, v3
1664; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1665; GFX9-NEXT:    s_endpgm
1666;
1667; VI-LABEL: v_insertelement_v4f16_1:
1668; VI:       ; %bb.0:
1669; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1670; VI-NEXT:    s_load_dword s4, s[8:9], 0x10
1671; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1672; VI-NEXT:    v_mov_b32_e32 v4, 0x1000504
1673; VI-NEXT:    s_waitcnt lgkmcnt(0)
1674; VI-NEXT:    v_mov_b32_e32 v1, s3
1675; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1676; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1677; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1678; VI-NEXT:    v_mov_b32_e32 v3, s1
1679; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
1680; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1681; VI-NEXT:    s_waitcnt vmcnt(0)
1682; VI-NEXT:    v_perm_b32 v0, v0, s4, v4
1683; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1684; VI-NEXT:    s_endpgm
1685;
1686; CI-LABEL: v_insertelement_v4f16_1:
1687; CI:       ; %bb.0:
1688; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1689; CI-NEXT:    s_load_dword s4, s[8:9], 0x4
1690; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1691; CI-NEXT:    s_waitcnt lgkmcnt(0)
1692; CI-NEXT:    v_mov_b32_e32 v1, s3
1693; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
1694; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1695; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1696; CI-NEXT:    v_mov_b32_e32 v3, s1
1697; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
1698; CI-NEXT:    s_lshl_b32 s0, s4, 16
1699; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1700; CI-NEXT:    s_waitcnt vmcnt(0)
1701; CI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1702; CI-NEXT:    v_or_b32_e32 v0, s0, v0
1703; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1704; CI-NEXT:    s_endpgm
1705;
1706; GFX11-LABEL: v_insertelement_v4f16_1:
1707; GFX11:       ; %bb.0:
1708; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
1709; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1710; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x10
1711; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1712; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1713; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1714; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
1715; GFX11-NEXT:    s_waitcnt vmcnt(0)
1716; GFX11-NEXT:    v_perm_b32 v0, s4, v0, 0x5040100
1717; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
1718; GFX11-NEXT:    s_endpgm
  ; Input and output vectors are addressed by the sign-extended workitem id.
1719  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1720  %tid.ext = sext i32 %tid to i64
1721  %in.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %in, i64 %tid.ext
1722  %out.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %out, i64 %tid.ext
1723  %vec = load <4 x half>, ptr addrspace(1) %in.gep
  ; The inserted half is the low 16 bits of the i32 kernel argument.
1724  %val.trunc = trunc i32 %val to i16
1725  %val.cvt = bitcast i16 %val.trunc to half
1726  %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 1
1727  store <4 x half> %vecins, ptr addrspace(1) %out.gep
1728  ret void
1729}
1730
; Inserts the low 16 bits of %val (reinterpreted as half) into element 2 of a
; <4 x half> loaded from %in, then stores the vector to %out.
; The unnamed [8 x i32] argument sits between %in and %val; the checks load
; %val from kernarg offset 0x30 (0xc in the hawaii checks).
; In every checked variant only the second loaded dword (v1) is rewritten; the
; first dword (v0) is stored back unchanged.
1731define amdgpu_kernel void @v_insertelement_v4f16_2(ptr addrspace(1) %out, ptr addrspace(1) %in, [8 x i32], i32 %val) #0 {
1732; GFX9-LABEL: v_insertelement_v4f16_2:
1733; GFX9:       ; %bb.0:
1734; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1735; GFX9-NEXT:    s_load_dword s4, s[8:9], 0x30
1736; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1737; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1738; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
1739; GFX9-NEXT:    s_mov_b32 s2, 0xffff
1740; GFX9-NEXT:    v_mov_b32_e32 v3, s4
1741; GFX9-NEXT:    s_waitcnt vmcnt(0)
1742; GFX9-NEXT:    v_bfi_b32 v1, s2, v3, v1
1743; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1744; GFX9-NEXT:    s_endpgm
1745;
1746; VI-LABEL: v_insertelement_v4f16_2:
1747; VI:       ; %bb.0:
1748; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1749; VI-NEXT:    s_load_dword s4, s[8:9], 0x30
1750; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1751; VI-NEXT:    v_mov_b32_e32 v4, 0x3020504
1752; VI-NEXT:    s_waitcnt lgkmcnt(0)
1753; VI-NEXT:    v_mov_b32_e32 v1, s3
1754; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1755; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1756; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1757; VI-NEXT:    v_mov_b32_e32 v3, s1
1758; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
1759; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1760; VI-NEXT:    s_waitcnt vmcnt(0)
1761; VI-NEXT:    v_perm_b32 v1, s4, v1, v4
1762; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1763; VI-NEXT:    s_endpgm
1764;
1765; CI-LABEL: v_insertelement_v4f16_2:
1766; CI:       ; %bb.0:
1767; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1768; CI-NEXT:    s_load_dword s4, s[8:9], 0xc
1769; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1770; CI-NEXT:    s_waitcnt lgkmcnt(0)
1771; CI-NEXT:    v_mov_b32_e32 v1, s3
1772; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
1773; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1774; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1775; CI-NEXT:    v_mov_b32_e32 v3, s1
1776; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
1777; CI-NEXT:    s_mov_b32 s0, 0xffff
1778; CI-NEXT:    v_mov_b32_e32 v4, s4
1779; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1780; CI-NEXT:    s_waitcnt vmcnt(0)
1781; CI-NEXT:    v_bfi_b32 v1, s0, v4, v1
1782; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1783; CI-NEXT:    s_endpgm
1784;
1785; GFX11-LABEL: v_insertelement_v4f16_2:
1786; GFX11:       ; %bb.0:
1787; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
1788; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1789; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x30
1790; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1791; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1792; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1793; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
1794; GFX11-NEXT:    s_waitcnt vmcnt(0)
1795; GFX11-NEXT:    v_bfi_b32 v1, 0xffff, s4, v1
1796; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
1797; GFX11-NEXT:    s_endpgm
  ; Input and output vectors are addressed by the sign-extended workitem id.
1798  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1799  %tid.ext = sext i32 %tid to i64
1800  %in.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %in, i64 %tid.ext
1801  %out.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %out, i64 %tid.ext
1802  %vec = load <4 x half>, ptr addrspace(1) %in.gep
  ; The inserted half is the low 16 bits of the i32 kernel argument.
1803  %val.trunc = trunc i32 %val to i16
1804  %val.cvt = bitcast i16 %val.trunc to half
1805  %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 2
1806  store <4 x half> %vecins, ptr addrspace(1) %out.gep
1807  ret void
1808}
1809
; Inserts the low 16 bits of %val (reinterpreted as half) into element 3 of a
; <4 x half> loaded from %in, then stores the vector to %out.
; %val directly follows the two pointers, and the checks load it from kernarg
; offset 0x10 (0x4 in the hawaii checks).
; Only the second loaded dword (v1) is rewritten. The gfx900, fiji and gfx1100
; checks use v_perm_b32; the hawaii checks keep the low 16 bits of v1 and OR in
; %val shifted left by 16.
1810define amdgpu_kernel void @v_insertelement_v4f16_3(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) #0 {
1811; GFX9-LABEL: v_insertelement_v4f16_3:
1812; GFX9:       ; %bb.0:
1813; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1814; GFX9-NEXT:    s_load_dword s4, s[8:9], 0x10
1815; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1816; GFX9-NEXT:    v_mov_b32_e32 v3, 0x5040100
1817; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1818; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
1819; GFX9-NEXT:    s_waitcnt vmcnt(0)
1820; GFX9-NEXT:    v_perm_b32 v1, s4, v1, v3
1821; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1822; GFX9-NEXT:    s_endpgm
1823;
1824; VI-LABEL: v_insertelement_v4f16_3:
1825; VI:       ; %bb.0:
1826; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1827; VI-NEXT:    s_load_dword s4, s[8:9], 0x10
1828; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1829; VI-NEXT:    v_mov_b32_e32 v4, 0x1000504
1830; VI-NEXT:    s_waitcnt lgkmcnt(0)
1831; VI-NEXT:    v_mov_b32_e32 v1, s3
1832; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1833; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1834; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1835; VI-NEXT:    v_mov_b32_e32 v3, s1
1836; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
1837; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1838; VI-NEXT:    s_waitcnt vmcnt(0)
1839; VI-NEXT:    v_perm_b32 v1, v1, s4, v4
1840; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1841; VI-NEXT:    s_endpgm
1842;
1843; CI-LABEL: v_insertelement_v4f16_3:
1844; CI:       ; %bb.0:
1845; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1846; CI-NEXT:    s_load_dword s4, s[8:9], 0x4
1847; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1848; CI-NEXT:    s_waitcnt lgkmcnt(0)
1849; CI-NEXT:    v_mov_b32_e32 v1, s3
1850; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
1851; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1852; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1853; CI-NEXT:    v_mov_b32_e32 v3, s1
1854; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
1855; CI-NEXT:    s_lshl_b32 s0, s4, 16
1856; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1857; CI-NEXT:    s_waitcnt vmcnt(0)
1858; CI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
1859; CI-NEXT:    v_or_b32_e32 v1, s0, v1
1860; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1861; CI-NEXT:    s_endpgm
1862;
1863; GFX11-LABEL: v_insertelement_v4f16_3:
1864; GFX11:       ; %bb.0:
1865; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
1866; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1867; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x10
1868; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1869; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1870; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1871; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
1872; GFX11-NEXT:    s_waitcnt vmcnt(0)
1873; GFX11-NEXT:    v_perm_b32 v1, s4, v1, 0x5040100
1874; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
1875; GFX11-NEXT:    s_endpgm
  ; Input and output vectors are addressed by the sign-extended workitem id.
1876  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1877  %tid.ext = sext i32 %tid to i64
1878  %in.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %in, i64 %tid.ext
1879  %out.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %out, i64 %tid.ext
1880  %vec = load <4 x half>, ptr addrspace(1) %in.gep
  ; The inserted half is the low 16 bits of the i32 kernel argument.
1881  %val.trunc = trunc i32 %val to i16
1882  %val.cvt = bitcast i16 %val.trunc to half
1883  %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 3
1884  store <4 x half> %vecins, ptr addrspace(1) %out.gep
1885  ret void
1886}
1887
; Integer counterpart of the <4 x half> constant-index tests: inserts the low
; 16 bits of %val into element 2 of a <4 x i16> loaded from %in, then stores the
; vector to %out.
; %val directly follows the two pointers, and the checks load it from kernarg
; offset 0x10 (0x4 in the hawaii checks).
; In every checked variant only the second loaded dword (v1) is rewritten.
1888define amdgpu_kernel void @v_insertelement_v4i16_2(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) #0 {
1889; GFX9-LABEL: v_insertelement_v4i16_2:
1890; GFX9:       ; %bb.0:
1891; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1892; GFX9-NEXT:    s_load_dword s4, s[8:9], 0x10
1893; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1894; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1895; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
1896; GFX9-NEXT:    s_mov_b32 s2, 0xffff
1897; GFX9-NEXT:    v_mov_b32_e32 v3, s4
1898; GFX9-NEXT:    s_waitcnt vmcnt(0)
1899; GFX9-NEXT:    v_bfi_b32 v1, s2, v3, v1
1900; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1901; GFX9-NEXT:    s_endpgm
1902;
1903; VI-LABEL: v_insertelement_v4i16_2:
1904; VI:       ; %bb.0:
1905; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1906; VI-NEXT:    s_load_dword s4, s[8:9], 0x10
1907; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1908; VI-NEXT:    v_mov_b32_e32 v4, 0x3020504
1909; VI-NEXT:    s_waitcnt lgkmcnt(0)
1910; VI-NEXT:    v_mov_b32_e32 v1, s3
1911; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1912; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1913; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1914; VI-NEXT:    v_mov_b32_e32 v3, s1
1915; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
1916; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1917; VI-NEXT:    s_waitcnt vmcnt(0)
1918; VI-NEXT:    v_perm_b32 v1, s4, v1, v4
1919; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1920; VI-NEXT:    s_endpgm
1921;
1922; CI-LABEL: v_insertelement_v4i16_2:
1923; CI:       ; %bb.0:
1924; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1925; CI-NEXT:    s_load_dword s4, s[8:9], 0x4
1926; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1927; CI-NEXT:    s_waitcnt lgkmcnt(0)
1928; CI-NEXT:    v_mov_b32_e32 v1, s3
1929; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
1930; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1931; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1932; CI-NEXT:    v_mov_b32_e32 v3, s1
1933; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
1934; CI-NEXT:    s_mov_b32 s0, 0xffff
1935; CI-NEXT:    v_mov_b32_e32 v4, s4
1936; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1937; CI-NEXT:    s_waitcnt vmcnt(0)
1938; CI-NEXT:    v_bfi_b32 v1, s0, v4, v1
1939; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1940; CI-NEXT:    s_endpgm
1941;
1942; GFX11-LABEL: v_insertelement_v4i16_2:
1943; GFX11:       ; %bb.0:
1944; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
1945; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1946; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x10
1947; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1948; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1949; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1950; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
1951; GFX11-NEXT:    s_waitcnt vmcnt(0)
1952; GFX11-NEXT:    v_bfi_b32 v1, 0xffff, s4, v1
1953; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
1954; GFX11-NEXT:    s_endpgm
  ; Input and output vectors are addressed by the sign-extended workitem id.
1955  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1956  %tid.ext = sext i32 %tid to i64
1957  %in.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %in, i64 %tid.ext
1958  %out.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %out, i64 %tid.ext
1959  %vec = load <4 x i16>, ptr addrspace(1) %in.gep
1960  %val.trunc = trunc i32 %val to i16
  ; NOTE(review): this bitcast is i16 -> i16 and does not change the type;
  ; presumably kept to mirror the half variants of this test.
1961  %val.cvt = bitcast i16 %val.trunc to i16
1962  %vecins = insertelement <4 x i16> %vec, i16 %val.cvt, i32 2
1963  store <4 x i16> %vecins, ptr addrspace(1) %out.gep
1964  ret void
1965}
1966
1967; FIXME: Better code on CI?
; Dynamic-index insert of the low 16 bits of %val into a <4 x i16> loaded from
; %in; the result is stored to %out.
; The index comes from a volatile i32 load through an undef pointer. In the
; checks this appears as a glc load from v[0:1] issued before the vector load.
; The checked code replicates %val into both halves of a dword, shifts a 64-bit
; 0xffff mask left by index << 4, and applies one v_bfi_b32 per result dword,
; using the high and low halves of the shifted mask.
1968define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) #0 {
1969; GFX9-LABEL: v_insertelement_v4i16_dynamic_vgpr:
1970; GFX9:       ; %bb.0:
1971; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1972; GFX9-NEXT:    s_load_dword s4, s[8:9], 0x10
1973; GFX9-NEXT:    global_load_dword v2, v[0:1], off glc
1974; GFX9-NEXT:    s_waitcnt vmcnt(0)
1975; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
1976; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1977; GFX9-NEXT:    global_load_dwordx2 v[0:1], v4, s[2:3]
1978; GFX9-NEXT:    s_mov_b64 s[2:3], 0xffff
1979; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
1980; GFX9-NEXT:    v_lshlrev_b64 v[2:3], v2, s[2:3]
1981; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s4, s4
1982; GFX9-NEXT:    s_waitcnt vmcnt(0)
1983; GFX9-NEXT:    v_bfi_b32 v1, v3, s2, v1
1984; GFX9-NEXT:    v_bfi_b32 v0, v2, s2, v0
1985; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
1986; GFX9-NEXT:    s_endpgm
1987;
1988; VI-LABEL: v_insertelement_v4i16_dynamic_vgpr:
1989; VI:       ; %bb.0:
1990; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1991; VI-NEXT:    s_load_dword s4, s[8:9], 0x10
1992; VI-NEXT:    flat_load_dword v4, v[0:1] glc
1993; VI-NEXT:    s_waitcnt vmcnt(0)
1994; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1995; VI-NEXT:    s_waitcnt lgkmcnt(0)
1996; VI-NEXT:    v_mov_b32_e32 v1, s3
1997; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1998; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1999; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
2000; VI-NEXT:    s_mov_b64 s[2:3], 0xffff
2001; VI-NEXT:    v_mov_b32_e32 v3, s1
2002; VI-NEXT:    s_lshl_b32 s1, s4, 16
2003; VI-NEXT:    s_and_b32 s4, s4, 0xffff
2004; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
2005; VI-NEXT:    s_or_b32 s0, s4, s1
2006; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
2007; VI-NEXT:    v_lshlrev_b32_e32 v4, 4, v4
2008; VI-NEXT:    v_lshlrev_b64 v[4:5], v4, s[2:3]
2009; VI-NEXT:    s_waitcnt vmcnt(0)
2010; VI-NEXT:    v_bfi_b32 v1, v5, s0, v1
2011; VI-NEXT:    v_bfi_b32 v0, v4, s0, v0
2012; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
2013; VI-NEXT:    s_endpgm
2014;
2015; CI-LABEL: v_insertelement_v4i16_dynamic_vgpr:
2016; CI:       ; %bb.0:
2017; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2018; CI-NEXT:    s_load_dword s4, s[8:9], 0x4
2019; CI-NEXT:    flat_load_dword v4, v[0:1] glc
2020; CI-NEXT:    s_waitcnt vmcnt(0)
2021; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
2022; CI-NEXT:    s_waitcnt lgkmcnt(0)
2023; CI-NEXT:    v_mov_b32_e32 v1, s3
2024; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
2025; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2026; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
2027; CI-NEXT:    s_mov_b64 s[2:3], 0xffff
2028; CI-NEXT:    v_mov_b32_e32 v3, s1
2029; CI-NEXT:    s_lshl_b32 s1, s4, 16
2030; CI-NEXT:    s_and_b32 s4, s4, 0xffff
2031; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
2032; CI-NEXT:    s_or_b32 s0, s4, s1
2033; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
2034; CI-NEXT:    v_lshlrev_b32_e32 v4, 4, v4
2035; CI-NEXT:    v_lshl_b64 v[4:5], s[2:3], v4
2036; CI-NEXT:    s_waitcnt vmcnt(0)
2037; CI-NEXT:    v_bfi_b32 v1, v5, s0, v1
2038; CI-NEXT:    v_bfi_b32 v0, v4, s0, v0
2039; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
2040; CI-NEXT:    s_endpgm
2041;
2042; GFX11-LABEL: v_insertelement_v4i16_dynamic_vgpr:
2043; GFX11:       ; %bb.0:
2044; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
2045; GFX11-NEXT:    global_load_b32 v2, v[0:1], off glc dlc
2046; GFX11-NEXT:    s_waitcnt vmcnt(0)
2047; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2048; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x10
2049; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
2050; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
2051; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2052; GFX11-NEXT:    global_load_b64 v[0:1], v4, s[2:3]
2053; GFX11-NEXT:    s_pack_ll_b32_b16 s2, s4, s4
2054; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
2055; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v2, 0xffff
2056; GFX11-NEXT:    s_waitcnt vmcnt(0)
2057; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
2058; GFX11-NEXT:    v_bfi_b32 v1, v3, s2, v1
2059; GFX11-NEXT:    v_bfi_b32 v0, v2, s2, v0
2060; GFX11-NEXT:    global_store_b64 v4, v[0:1], s[0:1]
2061; GFX11-NEXT:    s_endpgm
  ; Input and output vectors are addressed by the sign-extended workitem id.
2062  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
2063  %tid.ext = sext i32 %tid to i64
2064  %in.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %in, i64 %tid.ext
2065  %out.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %out, i64 %tid.ext
  ; The insert index is read with a volatile load; the address operand is undef.
2066  %idx.val = load volatile i32, ptr addrspace(1) undef
2067  %vec = load <4 x i16>, ptr addrspace(1) %in.gep
2068  %val.trunc = trunc i32 %val to i16
  ; NOTE(review): this bitcast is i16 -> i16 and does not change the type;
  ; presumably kept to mirror the half variants of this test.
2069  %val.cvt = bitcast i16 %val.trunc to i16
2070  %vecins = insertelement <4 x i16> %vec, i16 %val.cvt, i32 %idx.val
2071  store <4 x i16> %vecins, ptr addrspace(1) %out.gep
2072  ret void
2073}
2074
; Dynamic-index insert where the index is the kernel argument %idxval: the low
; 16 bits of %val (reinterpreted as half) are inserted into a <4 x half> loaded
; from %in, and the result is stored to %out.
; The checks load %val and %idxval together from kernarg offset 0x10 (0x4 in
; the hawaii checks). The 64-bit 0xffff mask is shifted with the scalar
; s_lshl_b64 by idxval << 4, %val is replicated into both halves of a dword
; (s_pack_ll_b32_b16 on gfx900/gfx1100, shift/and/or on fiji/hawaii), and one
; v_bfi_b32 is applied per result dword.
2075define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %idxval) #0 {
2076; GFX9-LABEL: v_insertelement_v4f16_dynamic_sgpr:
2077; GFX9:       ; %bb.0:
2078; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2079; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
2080; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
2081; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2082; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
2083; GFX9-NEXT:    s_lshl_b32 s2, s5, 4
2084; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s4
2085; GFX9-NEXT:    s_lshl_b64 s[2:3], 0xffff, s2
2086; GFX9-NEXT:    v_mov_b32_e32 v3, s4
2087; GFX9-NEXT:    v_mov_b32_e32 v4, s4
2088; GFX9-NEXT:    s_waitcnt vmcnt(0)
2089; GFX9-NEXT:    v_bfi_b32 v1, s3, v3, v1
2090; GFX9-NEXT:    v_bfi_b32 v0, s2, v4, v0
2091; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
2092; GFX9-NEXT:    s_endpgm
2093;
2094; VI-LABEL: v_insertelement_v4f16_dynamic_sgpr:
2095; VI:       ; %bb.0:
2096; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2097; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
2098; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
2099; VI-NEXT:    s_waitcnt lgkmcnt(0)
2100; VI-NEXT:    v_mov_b32_e32 v1, s3
2101; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
2102; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2103; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
2104; VI-NEXT:    v_mov_b32_e32 v3, s1
2105; VI-NEXT:    s_lshl_b32 s1, s4, 16
2106; VI-NEXT:    s_and_b32 s2, s4, 0xffff
2107; VI-NEXT:    s_lshl_b32 s3, s5, 4
2108; VI-NEXT:    s_or_b32 s2, s2, s1
2109; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
2110; VI-NEXT:    s_lshl_b64 s[0:1], 0xffff, s3
2111; VI-NEXT:    v_mov_b32_e32 v4, s2
2112; VI-NEXT:    v_mov_b32_e32 v5, s2
2113; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
2114; VI-NEXT:    s_waitcnt vmcnt(0)
2115; VI-NEXT:    v_bfi_b32 v1, s1, v4, v1
2116; VI-NEXT:    v_bfi_b32 v0, s0, v5, v0
2117; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
2118; VI-NEXT:    s_endpgm
2119;
2120; CI-LABEL: v_insertelement_v4f16_dynamic_sgpr:
2121; CI:       ; %bb.0:
2122; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2123; CI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x4
2124; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
2125; CI-NEXT:    s_waitcnt lgkmcnt(0)
2126; CI-NEXT:    v_mov_b32_e32 v1, s3
2127; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
2128; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2129; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
2130; CI-NEXT:    v_mov_b32_e32 v3, s1
2131; CI-NEXT:    s_and_b32 s1, s4, 0xffff
2132; CI-NEXT:    s_lshl_b32 s2, s4, 16
2133; CI-NEXT:    s_lshl_b32 s3, s5, 4
2134; CI-NEXT:    s_or_b32 s2, s1, s2
2135; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
2136; CI-NEXT:    s_lshl_b64 s[0:1], 0xffff, s3
2137; CI-NEXT:    v_mov_b32_e32 v4, s2
2138; CI-NEXT:    v_mov_b32_e32 v5, s2
2139; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
2140; CI-NEXT:    s_waitcnt vmcnt(0)
2141; CI-NEXT:    v_bfi_b32 v1, s1, v4, v1
2142; CI-NEXT:    v_bfi_b32 v0, s0, v5, v0
2143; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
2144; CI-NEXT:    s_endpgm
2145;
2146; GFX11-LABEL: v_insertelement_v4f16_dynamic_sgpr:
2147; GFX11:       ; %bb.0:
2148; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
2149; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2150; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x10
2151; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2152; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
2153; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2154; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
2155; GFX11-NEXT:    s_lshl_b32 s2, s5, 4
2156; GFX11-NEXT:    s_pack_ll_b32_b16 s4, s4, s4
2157; GFX11-NEXT:    s_lshl_b64 s[2:3], 0xffff, s2
2158; GFX11-NEXT:    s_waitcnt vmcnt(0)
2159; GFX11-NEXT:    v_bfi_b32 v1, s3, s4, v1
2160; GFX11-NEXT:    v_bfi_b32 v0, s2, s4, v0
2161; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
2162; GFX11-NEXT:    s_endpgm
  ; Input and output vectors are addressed by the sign-extended workitem id.
2163  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
2164  %tid.ext = sext i32 %tid to i64
2165  %in.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %in, i64 %tid.ext
2166  %out.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %out, i64 %tid.ext
2167  %vec = load <4 x half>, ptr addrspace(1) %in.gep
  ; The inserted half is the low 16 bits of the i32 kernel argument.
2168  %val.trunc = trunc i32 %val to i16
2169  %val.cvt = bitcast i16 %val.trunc to half
  ; The index is the kernel argument %idxval, not a value loaded from memory.
2170  %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 %idxval
2171  store <4 x half> %vecins, ptr addrspace(1) %out.gep
2172  ret void
2173}
2174
; Inserts the low 16 bits of %val (reinterpreted as half) into element 3 of an
; <8 x half> loaded from %in, then stores the vector to %out.
; Unlike the <4 x half> tests above, this definition carries no attribute-group
; reference after its parameter list.
; The checks load all four dwords (v[0:3]) and rewrite only v1: v_perm_b32 on
; gfx900/gfx1100, an SDWA v_or_b32 on fiji, and and/or on hawaii.
2175define amdgpu_kernel void @v_insertelement_v8f16_3(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) {
2176; GFX9-LABEL: v_insertelement_v8f16_3:
2177; GFX9:       ; %bb.0:
2178; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2179; GFX9-NEXT:    s_load_dword s4, s[8:9], 0x10
2180; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
2181; GFX9-NEXT:    v_mov_b32_e32 v5, 0x5040100
2182; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2183; GFX9-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3]
2184; GFX9-NEXT:    s_waitcnt vmcnt(0)
2185; GFX9-NEXT:    v_perm_b32 v1, s4, v1, v5
2186; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
2187; GFX9-NEXT:    s_endpgm
2188;
2189; VI-LABEL: v_insertelement_v8f16_3:
2190; VI:       ; %bb.0:
2191; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2192; VI-NEXT:    s_load_dword s4, s[8:9], 0x10
2193; VI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
2194; VI-NEXT:    s_waitcnt lgkmcnt(0)
2195; VI-NEXT:    v_mov_b32_e32 v1, s3
2196; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
2197; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2198; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
2199; VI-NEXT:    v_add_u32_e32 v4, vcc, s0, v4
2200; VI-NEXT:    s_lshl_b32 s0, s4, 16
2201; VI-NEXT:    v_mov_b32_e32 v5, s1
2202; VI-NEXT:    v_mov_b32_e32 v6, s0
2203; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
2204; VI-NEXT:    s_waitcnt vmcnt(0)
2205; VI-NEXT:    v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2206; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
2207; VI-NEXT:    s_endpgm
2208;
2209; CI-LABEL: v_insertelement_v8f16_3:
2210; CI:       ; %bb.0:
2211; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2212; CI-NEXT:    s_load_dword s4, s[8:9], 0x4
2213; CI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
2214; CI-NEXT:    s_waitcnt lgkmcnt(0)
2215; CI-NEXT:    v_mov_b32_e32 v1, s3
2216; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v4
2217; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2218; CI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
2219; CI-NEXT:    v_mov_b32_e32 v5, s1
2220; CI-NEXT:    v_add_i32_e32 v4, vcc, s0, v4
2221; CI-NEXT:    s_lshl_b32 s0, s4, 16
2222; CI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
2223; CI-NEXT:    s_waitcnt vmcnt(0)
2224; CI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
2225; CI-NEXT:    v_or_b32_e32 v1, s0, v1
2226; CI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
2227; CI-NEXT:    s_endpgm
2228;
2229; GFX11-LABEL: v_insertelement_v8f16_3:
2230; GFX11:       ; %bb.0:
2231; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
2232; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2233; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x10
2234; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2235; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
2236; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2237; GFX11-NEXT:    global_load_b128 v[0:3], v4, s[2:3]
2238; GFX11-NEXT:    s_waitcnt vmcnt(0)
2239; GFX11-NEXT:    v_perm_b32 v1, s4, v1, 0x5040100
2240; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
2241; GFX11-NEXT:    s_endpgm
  ; Input and output vectors are addressed by the sign-extended workitem id.
2242  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
2243  %tid.ext = sext i32 %tid to i64
2244  %in.gep = getelementptr inbounds <8 x half>, ptr addrspace(1) %in, i64 %tid.ext
2245  %out.gep = getelementptr inbounds <8 x half>, ptr addrspace(1) %out, i64 %tid.ext
2246  %vec = load <8 x half>, ptr addrspace(1) %in.gep
  ; The inserted half is the low 16 bits of the i32 kernel argument.
2247  %val.trunc = trunc i32 %val to i16
2248  %val.cvt = bitcast i16 %val.trunc to half
2249  %vecins = insertelement <8 x half> %vec, half %val.cvt, i32 3
2250  store <8 x half> %vecins, ptr addrspace(1) %out.gep
2251  ret void
2252}
2253
; Constant-index insert into <8 x i16>: the low 16 bits of %val replace element
; 6 of the vector loaded from %in at this workitem's index; the result is
; stored to the same index of %out.
; In the expected output only v3 (the fourth dword of the loaded vector) is
; rewritten, and all four targets do it with v_bfi_b32 using a 0xffff mask
; (materialized in an SGPR on gfx900, fiji and hawaii, inline on gfx1100).
2254define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) {
2255; GFX9-LABEL: v_insertelement_v8i16_6:
2256; GFX9:       ; %bb.0:
2257; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2258; GFX9-NEXT:    s_load_dword s4, s[8:9], 0x10
2259; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
2260; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2261; GFX9-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3]
2262; GFX9-NEXT:    s_mov_b32 s2, 0xffff
2263; GFX9-NEXT:    v_mov_b32_e32 v5, s4
2264; GFX9-NEXT:    s_waitcnt vmcnt(0)
2265; GFX9-NEXT:    v_bfi_b32 v3, s2, v5, v3
2266; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
2267; GFX9-NEXT:    s_endpgm
2268;
2269; VI-LABEL: v_insertelement_v8i16_6:
2270; VI:       ; %bb.0:
2271; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2272; VI-NEXT:    s_load_dword s4, s[8:9], 0x10
2273; VI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
2274; VI-NEXT:    s_waitcnt lgkmcnt(0)
2275; VI-NEXT:    v_mov_b32_e32 v1, s3
2276; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
2277; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2278; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
2279; VI-NEXT:    v_mov_b32_e32 v5, s1
2280; VI-NEXT:    v_add_u32_e32 v4, vcc, s0, v4
2281; VI-NEXT:    s_mov_b32 s0, 0xffff
2282; VI-NEXT:    v_mov_b32_e32 v6, s4
2283; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
2284; VI-NEXT:    s_waitcnt vmcnt(0)
2285; VI-NEXT:    v_bfi_b32 v3, s0, v6, v3
2286; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
2287; VI-NEXT:    s_endpgm
2288;
2289; CI-LABEL: v_insertelement_v8i16_6:
2290; CI:       ; %bb.0:
2291; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2292; CI-NEXT:    s_load_dword s4, s[8:9], 0x4
2293; CI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
2294; CI-NEXT:    s_waitcnt lgkmcnt(0)
2295; CI-NEXT:    v_mov_b32_e32 v1, s3
2296; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v4
2297; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2298; CI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
2299; CI-NEXT:    v_mov_b32_e32 v5, s1
2300; CI-NEXT:    v_add_i32_e32 v4, vcc, s0, v4
2301; CI-NEXT:    s_mov_b32 s0, 0xffff
2302; CI-NEXT:    v_mov_b32_e32 v6, s4
2303; CI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
2304; CI-NEXT:    s_waitcnt vmcnt(0)
2305; CI-NEXT:    v_bfi_b32 v3, s0, v6, v3
2306; CI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
2307; CI-NEXT:    s_endpgm
2308;
2309; GFX11-LABEL: v_insertelement_v8i16_6:
2310; GFX11:       ; %bb.0:
2311; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
2312; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2313; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x10
2314; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2315; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
2316; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2317; GFX11-NEXT:    global_load_b128 v[0:3], v4, s[2:3]
2318; GFX11-NEXT:    s_waitcnt vmcnt(0)
2319; GFX11-NEXT:    v_bfi_b32 v3, 0xffff, s4, v3
2320; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
2321; GFX11-NEXT:    s_endpgm
; Test input: both GEPs are indexed by the sign-extended workitem id, so each
; lane reads and writes its own 16-byte vector.
2322  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
2323  %tid.ext = sext i32 %tid to i64
2324  %in.gep = getelementptr inbounds <8 x i16>, ptr addrspace(1) %in, i64 %tid.ext
2325  %out.gep = getelementptr inbounds <8 x i16>, ptr addrspace(1) %out, i64 %tid.ext
2326  %vec = load <8 x i16>, ptr addrspace(1) %in.gep
2327  %val.trunc = trunc i32 %val to i16
; NOTE(review): the i16-to-i16 bitcast below is a no-op; presumably retained to
; keep the same shape as the half-typed tests - confirm before removing.
2328  %val.cvt = bitcast i16 %val.trunc to i16
2329  %vecins = insertelement <8 x i16> %vec, i16 %val.cvt, i32 6
2330  store <8 x i16> %vecins, ptr addrspace(1) %out.gep
2331  ret void
2332}
2333
; Dynamic-index insert into <8 x half>: the low 16 bits of %val, reinterpreted
; as half, replace the element selected at run time by %n in the vector loaded
; from %in at this workitem's index; the result is stored to %out.
; The expected output has no indexed write. Instead the index scalar is
; compared against each constant 0 through 7 (s_cmp_eq_u32 + s_cselect) and
; every 16-bit lane is chosen with v_cndmask, after which the halves are
; repacked into dwords: v_perm_b32 on gfx900 and gfx1100, shift + SDWA or on
; fiji. The hawaii output additionally converts each half to f32 before the
; selects and back to f16 afterwards.
2334define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %n) {
2335; GFX9-LABEL: v_insertelement_v8f16_dynamic:
2336; GFX9:       ; %bb.0:
2337; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2338; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
2339; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
2340; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2341; GFX9-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3]
2342; GFX9-NEXT:    s_cmp_eq_u32 s5, 6
2343; GFX9-NEXT:    v_mov_b32_e32 v5, s4
2344; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
2345; GFX9-NEXT:    s_cmp_eq_u32 s5, 7
2346; GFX9-NEXT:    s_mov_b32 s2, 0x5040100
2347; GFX9-NEXT:    s_waitcnt vmcnt(0)
2348; GFX9-NEXT:    v_cndmask_b32_e32 v6, v3, v5, vcc
2349; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
2350; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
2351; GFX9-NEXT:    s_cmp_eq_u32 s5, 4
2352; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
2353; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
2354; GFX9-NEXT:    s_cmp_eq_u32 s5, 5
2355; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
2356; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
2357; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
2358; GFX9-NEXT:    s_cmp_eq_u32 s5, 2
2359; GFX9-NEXT:    v_perm_b32 v3, v3, v6, s2
2360; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v5, vcc
2361; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
2362; GFX9-NEXT:    s_cmp_eq_u32 s5, 3
2363; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
2364; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
2365; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
2366; GFX9-NEXT:    s_cmp_eq_u32 s5, 0
2367; GFX9-NEXT:    v_perm_b32 v2, v6, v2, s2
2368; GFX9-NEXT:    v_cndmask_b32_e32 v6, v8, v5, vcc
2369; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
2370; GFX9-NEXT:    s_cmp_eq_u32 s5, 1
2371; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
2372; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
2373; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
2374; GFX9-NEXT:    v_cndmask_b32_e32 v5, v9, v5, vcc
2375; GFX9-NEXT:    v_perm_b32 v1, v6, v1, s2
2376; GFX9-NEXT:    v_perm_b32 v0, v5, v0, s2
2377; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
2378; GFX9-NEXT:    s_endpgm
2379;
2380; VI-LABEL: v_insertelement_v8f16_dynamic:
2381; VI:       ; %bb.0:
2382; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2383; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
2384; VI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
2385; VI-NEXT:    s_waitcnt lgkmcnt(0)
2386; VI-NEXT:    v_mov_b32_e32 v1, s3
2387; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
2388; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2389; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
2390; VI-NEXT:    v_mov_b32_e32 v5, s1
2391; VI-NEXT:    v_add_u32_e32 v4, vcc, s0, v4
2392; VI-NEXT:    s_cmp_eq_u32 s5, 6
2393; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
2394; VI-NEXT:    v_mov_b32_e32 v6, s4
2395; VI-NEXT:    s_cselect_b64 vcc, -1, 0
2396; VI-NEXT:    s_cmp_eq_u32 s5, 7
2397; VI-NEXT:    s_waitcnt vmcnt(0)
2398; VI-NEXT:    v_cndmask_b32_e32 v7, v3, v6, vcc
2399; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
2400; VI-NEXT:    s_cselect_b64 vcc, -1, 0
2401; VI-NEXT:    s_cmp_eq_u32 s5, 4
2402; VI-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
2403; VI-NEXT:    s_cselect_b64 vcc, -1, 0
2404; VI-NEXT:    s_cmp_eq_u32 s5, 5
2405; VI-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
2406; VI-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
2407; VI-NEXT:    s_cselect_b64 vcc, -1, 0
2408; VI-NEXT:    s_cmp_eq_u32 s5, 2
2409; VI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
2410; VI-NEXT:    v_cndmask_b32_e32 v8, v8, v6, vcc
2411; VI-NEXT:    s_cselect_b64 vcc, -1, 0
2412; VI-NEXT:    s_cmp_eq_u32 s5, 3
2413; VI-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
2414; VI-NEXT:    v_or_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2415; VI-NEXT:    v_lshlrev_b32_e32 v7, 16, v8
2416; VI-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
2417; VI-NEXT:    s_cselect_b64 vcc, -1, 0
2418; VI-NEXT:    s_cmp_eq_u32 s5, 0
2419; VI-NEXT:    v_or_b32_sdwa v2, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2420; VI-NEXT:    v_cndmask_b32_e32 v7, v9, v6, vcc
2421; VI-NEXT:    s_cselect_b64 vcc, -1, 0
2422; VI-NEXT:    s_cmp_eq_u32 s5, 1
2423; VI-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
2424; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
2425; VI-NEXT:    s_cselect_b64 vcc, -1, 0
2426; VI-NEXT:    v_cndmask_b32_e32 v6, v10, v6, vcc
2427; VI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
2428; VI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
2429; VI-NEXT:    v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2430; VI-NEXT:    v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2431; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
2432; VI-NEXT:    s_endpgm
2433;
2434; CI-LABEL: v_insertelement_v8f16_dynamic:
2435; CI:       ; %bb.0:
2436; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2437; CI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x4
2438; CI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
2439; CI-NEXT:    s_waitcnt lgkmcnt(0)
2440; CI-NEXT:    v_mov_b32_e32 v1, s3
2441; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v4
2442; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2443; CI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
2444; CI-NEXT:    v_mov_b32_e32 v5, s1
2445; CI-NEXT:    v_add_i32_e32 v4, vcc, s0, v4
2446; CI-NEXT:    v_cvt_f32_f16_e32 v6, s4
2447; CI-NEXT:    s_cmp_eq_u32 s5, 7
2448; CI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
2449; CI-NEXT:    s_cselect_b64 vcc, -1, 0
2450; CI-NEXT:    s_cmp_eq_u32 s5, 6
2451; CI-NEXT:    s_cselect_b64 s[0:1], -1, 0
2452; CI-NEXT:    s_cmp_eq_u32 s5, 5
2453; CI-NEXT:    s_cselect_b64 s[2:3], -1, 0
2454; CI-NEXT:    s_cmp_eq_u32 s5, 4
2455; CI-NEXT:    s_waitcnt vmcnt(0)
2456; CI-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
2457; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
2458; CI-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
2459; CI-NEXT:    v_cvt_f32_f16_e32 v7, v7
2460; CI-NEXT:    v_cvt_f32_f16_e32 v9, v9
2461; CI-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
2462; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
2463; CI-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
2464; CI-NEXT:    v_cvt_f32_f16_e32 v8, v8
2465; CI-NEXT:    v_cvt_f32_f16_e32 v10, v10
2466; CI-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[0:1]
2467; CI-NEXT:    s_cselect_b64 s[0:1], -1, 0
2468; CI-NEXT:    s_cmp_eq_u32 s5, 3
2469; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
2470; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
2471; CI-NEXT:    v_cndmask_b32_e32 v7, v7, v6, vcc
2472; CI-NEXT:    s_cselect_b64 vcc, -1, 0
2473; CI-NEXT:    s_cmp_eq_u32 s5, 2
2474; CI-NEXT:    v_cndmask_b32_e32 v9, v9, v6, vcc
2475; CI-NEXT:    s_cselect_b64 vcc, -1, 0
2476; CI-NEXT:    s_cmp_eq_u32 s5, 1
2477; CI-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
2478; CI-NEXT:    s_cselect_b64 vcc, -1, 0
2479; CI-NEXT:    s_cmp_eq_u32 s5, 0
2480; CI-NEXT:    v_cndmask_b32_e64 v8, v8, v6, s[2:3]
2481; CI-NEXT:    v_cvt_f16_f32_e32 v7, v7
2482; CI-NEXT:    v_cndmask_b32_e32 v10, v10, v6, vcc
2483; CI-NEXT:    s_cselect_b64 vcc, -1, 0
2484; CI-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
2485; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
2486; CI-NEXT:    v_cvt_f16_f32_e32 v8, v8
2487; CI-NEXT:    v_cvt_f16_f32_e32 v9, v9
2488; CI-NEXT:    v_cvt_f16_f32_e32 v10, v10
2489; CI-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
2490; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
2491; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
2492; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
2493; CI-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
2494; CI-NEXT:    v_lshlrev_b32_e32 v7, 16, v8
2495; CI-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
2496; CI-NEXT:    v_or_b32_e32 v3, v3, v6
2497; CI-NEXT:    v_lshlrev_b32_e32 v6, 16, v10
2498; CI-NEXT:    v_or_b32_e32 v2, v2, v7
2499; CI-NEXT:    v_or_b32_e32 v1, v1, v8
2500; CI-NEXT:    v_or_b32_e32 v0, v0, v6
2501; CI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
2502; CI-NEXT:    s_endpgm
2503;
2504; GFX11-LABEL: v_insertelement_v8f16_dynamic:
2505; GFX11:       ; %bb.0:
2506; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
2507; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2508; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x10
2509; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2510; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
2511; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2512; GFX11-NEXT:    global_load_b128 v[0:3], v4, s[2:3]
2513; GFX11-NEXT:    s_cmp_eq_u32 s5, 6
2514; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
2515; GFX11-NEXT:    s_cmp_eq_u32 s5, 7
2516; GFX11-NEXT:    s_waitcnt vmcnt(0)
2517; GFX11-NEXT:    v_cndmask_b32_e64 v5, v3, s4, s2
2518; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
2519; GFX11-NEXT:    s_cmp_eq_u32 s5, 4
2520; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
2521; GFX11-NEXT:    s_cselect_b32 s3, -1, 0
2522; GFX11-NEXT:    s_cmp_eq_u32 s5, 5
2523; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
2524; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, s4, s3
2525; GFX11-NEXT:    s_cselect_b32 s3, -1, 0
2526; GFX11-NEXT:    s_cmp_eq_u32 s5, 2
2527; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
2528; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, s4, s2
2529; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
2530; GFX11-NEXT:    s_cmp_eq_u32 s5, 3
2531; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s4, s2
2532; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
2533; GFX11-NEXT:    s_cmp_eq_u32 s5, 0
2534; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
2535; GFX11-NEXT:    v_cndmask_b32_e64 v7, v7, s4, s2
2536; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
2537; GFX11-NEXT:    s_cmp_eq_u32 s5, 1
2538; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, s4, s2
2539; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
2540; GFX11-NEXT:    v_cndmask_b32_e64 v6, v6, s4, s3
2541; GFX11-NEXT:    v_cndmask_b32_e64 v8, v8, s4, s2
2542; GFX11-NEXT:    v_perm_b32 v3, v3, v5, 0x5040100
2543; GFX11-NEXT:    v_perm_b32 v1, v7, v1, 0x5040100
2544; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
2545; GFX11-NEXT:    v_perm_b32 v2, v6, v2, 0x5040100
2546; GFX11-NEXT:    v_perm_b32 v0, v8, v0, 0x5040100
2547; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
2548; GFX11-NEXT:    s_endpgm
; Test input: both GEPs are indexed by the sign-extended workitem id, so each
; lane reads and writes its own 16-byte vector.
2549  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
2550  %tid.ext = sext i32 %tid to i64
2551  %in.gep = getelementptr inbounds <8 x half>, ptr addrspace(1) %in, i64 %tid.ext
2552  %out.gep = getelementptr inbounds <8 x half>, ptr addrspace(1) %out, i64 %tid.ext
2553  %vec = load <8 x half>, ptr addrspace(1) %in.gep
; Only the low 16 bits of the i32 kernel argument are used as the half payload.
2554  %val.trunc = trunc i32 %val to i16
2555  %val.cvt = bitcast i16 %val.trunc to half
; The insert position is the kernel argument %n rather than a constant.
2556  %vecins = insertelement <8 x half> %vec, half %val.cvt, i32 %n
2557  store <8 x half> %vecins, ptr addrspace(1) %out.gep
2558  ret void
2559}
2560
; Constant-index insert into <16 x half>: the low 16 bits of %val, reinterpreted
; as half, replace element 3 of the vector loaded from %in at this workitem's
; index; the result is stored to the same index of %out.
; The expected output moves the 32-byte vector as two 16-byte loads and stores
; (offsets 0 and 16). Only v1 from the first load is rewritten; the second
; half is stored back unchanged. The merge is v_perm_b32 with the 0x5040100
; selector on gfx900 and gfx1100, shift + SDWA or on fiji, and + or on hawaii.
2561define amdgpu_kernel void @v_insertelement_v16f16_3(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) {
2562; GFX9-LABEL: v_insertelement_v16f16_3:
2563; GFX9:       ; %bb.0:
2564; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2565; GFX9-NEXT:    s_load_dword s4, s[8:9], 0x10
2566; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
2567; GFX9-NEXT:    v_mov_b32_e32 v9, 0x5040100
2568; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2569; GFX9-NEXT:    global_load_dwordx4 v[0:3], v8, s[2:3]
2570; GFX9-NEXT:    global_load_dwordx4 v[4:7], v8, s[2:3] offset:16
2571; GFX9-NEXT:    s_waitcnt vmcnt(1)
2572; GFX9-NEXT:    v_perm_b32 v1, s4, v1, v9
2573; GFX9-NEXT:    s_waitcnt vmcnt(0)
2574; GFX9-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
2575; GFX9-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
2576; GFX9-NEXT:    s_endpgm
2577;
2578; VI-LABEL: v_insertelement_v16f16_3:
2579; VI:       ; %bb.0:
2580; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2581; VI-NEXT:    s_load_dword s4, s[8:9], 0x10
2582; VI-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
2583; VI-NEXT:    s_waitcnt lgkmcnt(0)
2584; VI-NEXT:    v_mov_b32_e32 v1, s3
2585; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v8
2586; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2587; VI-NEXT:    v_add_u32_e32 v4, vcc, 16, v0
2588; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
2589; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
2590; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
2591; VI-NEXT:    v_mov_b32_e32 v9, s1
2592; VI-NEXT:    v_add_u32_e32 v8, vcc, s0, v8
2593; VI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
2594; VI-NEXT:    s_lshl_b32 s1, s4, 16
2595; VI-NEXT:    v_add_u32_e32 v10, vcc, 16, v8
2596; VI-NEXT:    v_mov_b32_e32 v12, s1
2597; VI-NEXT:    v_addc_u32_e32 v11, vcc, 0, v9, vcc
2598; VI-NEXT:    s_waitcnt vmcnt(1)
2599; VI-NEXT:    v_or_b32_sdwa v1, v1, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2600; VI-NEXT:    s_waitcnt vmcnt(0)
2601; VI-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
2602; VI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
2603; VI-NEXT:    s_endpgm
2604;
2605; CI-LABEL: v_insertelement_v16f16_3:
2606; CI:       ; %bb.0:
2607; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2608; CI-NEXT:    s_load_dword s4, s[8:9], 0x4
2609; CI-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
2610; CI-NEXT:    s_waitcnt lgkmcnt(0)
2611; CI-NEXT:    v_mov_b32_e32 v0, s3
2612; CI-NEXT:    v_add_i32_e32 v4, vcc, s2, v8
2613; CI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v0, vcc
2614; CI-NEXT:    flat_load_dwordx4 v[0:3], v[4:5]
2615; CI-NEXT:    v_add_i32_e32 v4, vcc, 16, v4
2616; CI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
2617; CI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
2618; CI-NEXT:    v_mov_b32_e32 v9, s1
2619; CI-NEXT:    v_add_i32_e32 v8, vcc, s0, v8
2620; CI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
2621; CI-NEXT:    v_add_i32_e32 v10, vcc, 16, v8
2622; CI-NEXT:    s_lshl_b32 s1, s4, 16
2623; CI-NEXT:    v_addc_u32_e32 v11, vcc, 0, v9, vcc
2624; CI-NEXT:    s_waitcnt vmcnt(1)
2625; CI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
2626; CI-NEXT:    v_or_b32_e32 v1, s1, v1
2627; CI-NEXT:    s_waitcnt vmcnt(0)
2628; CI-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
2629; CI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
2630; CI-NEXT:    s_endpgm
2631;
2632; GFX11-LABEL: v_insertelement_v16f16_3:
2633; GFX11:       ; %bb.0:
2634; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
2635; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2636; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x10
2637; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2638; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
2639; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2640; GFX11-NEXT:    s_clause 0x1
2641; GFX11-NEXT:    global_load_b128 v[0:3], v8, s[2:3]
2642; GFX11-NEXT:    global_load_b128 v[4:7], v8, s[2:3] offset:16
2643; GFX11-NEXT:    s_waitcnt vmcnt(1)
2644; GFX11-NEXT:    v_perm_b32 v1, s4, v1, 0x5040100
2645; GFX11-NEXT:    s_waitcnt vmcnt(0)
2646; GFX11-NEXT:    s_clause 0x1
2647; GFX11-NEXT:    global_store_b128 v8, v[4:7], s[0:1] offset:16
2648; GFX11-NEXT:    global_store_b128 v8, v[0:3], s[0:1]
2649; GFX11-NEXT:    s_endpgm
; Test input: both GEPs are indexed by the sign-extended workitem id, so each
; lane reads and writes its own 32-byte vector.
2650  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
2651  %tid.ext = sext i32 %tid to i64
2652  %in.gep = getelementptr inbounds <16 x half>, ptr addrspace(1) %in, i64 %tid.ext
2653  %out.gep = getelementptr inbounds <16 x half>, ptr addrspace(1) %out, i64 %tid.ext
2654  %vec = load <16 x half>, ptr addrspace(1) %in.gep
; Only the low 16 bits of the i32 kernel argument are used as the half payload.
2655  %val.trunc = trunc i32 %val to i16
2656  %val.cvt = bitcast i16 %val.trunc to half
2657  %vecins = insertelement <16 x half> %vec, half %val.cvt, i32 3
2658  store <16 x half> %vecins, ptr addrspace(1) %out.gep
2659  ret void
2660}
2661
; Constant-index insert into <16 x i16>: the low 16 bits of %val replace element
; 6 of the vector loaded from %in at this workitem's index; the result is
; stored to the same index of %out.
; The expected output moves the 32-byte vector as two 16-byte loads and stores
; (offsets 0 and 16). Only v3 from the first load is rewritten; the second
; half is stored back unchanged. gfx900, hawaii and gfx1100 merge with
; v_bfi_b32 and a 0xffff mask, while fiji uses v_perm_b32 with the 0x3020504
; selector.
2662define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) {
2663; GFX9-LABEL: v_insertelement_v16i16_6:
2664; GFX9:       ; %bb.0:
2665; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2666; GFX9-NEXT:    s_load_dword s4, s[8:9], 0x10
2667; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
2668; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2669; GFX9-NEXT:    global_load_dwordx4 v[0:3], v8, s[2:3]
2670; GFX9-NEXT:    global_load_dwordx4 v[4:7], v8, s[2:3] offset:16
2671; GFX9-NEXT:    s_mov_b32 s2, 0xffff
2672; GFX9-NEXT:    v_mov_b32_e32 v9, s4
2673; GFX9-NEXT:    s_waitcnt vmcnt(1)
2674; GFX9-NEXT:    v_bfi_b32 v3, s2, v9, v3
2675; GFX9-NEXT:    s_waitcnt vmcnt(0)
2676; GFX9-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
2677; GFX9-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
2678; GFX9-NEXT:    s_endpgm
2679;
2680; VI-LABEL: v_insertelement_v16i16_6:
2681; VI:       ; %bb.0:
2682; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2683; VI-NEXT:    s_load_dword s4, s[8:9], 0x10
2684; VI-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
2685; VI-NEXT:    v_mov_b32_e32 v12, 0x3020504
2686; VI-NEXT:    s_waitcnt lgkmcnt(0)
2687; VI-NEXT:    v_mov_b32_e32 v1, s3
2688; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v8
2689; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2690; VI-NEXT:    v_add_u32_e32 v4, vcc, 16, v0
2691; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
2692; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
2693; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
2694; VI-NEXT:    v_mov_b32_e32 v9, s1
2695; VI-NEXT:    v_add_u32_e32 v8, vcc, s0, v8
2696; VI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
2697; VI-NEXT:    v_add_u32_e32 v10, vcc, 16, v8
2698; VI-NEXT:    v_addc_u32_e32 v11, vcc, 0, v9, vcc
2699; VI-NEXT:    s_waitcnt vmcnt(1)
2700; VI-NEXT:    v_perm_b32 v3, s4, v3, v12
2701; VI-NEXT:    s_waitcnt vmcnt(0)
2702; VI-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
2703; VI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
2704; VI-NEXT:    s_endpgm
2705;
2706; CI-LABEL: v_insertelement_v16i16_6:
2707; CI:       ; %bb.0:
2708; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2709; CI-NEXT:    s_load_dword s4, s[8:9], 0x4
2710; CI-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
2711; CI-NEXT:    s_waitcnt lgkmcnt(0)
2712; CI-NEXT:    v_mov_b32_e32 v1, s3
2713; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v8
2714; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2715; CI-NEXT:    v_add_i32_e32 v4, vcc, 16, v0
2716; CI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
2717; CI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
2718; CI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
2719; CI-NEXT:    v_mov_b32_e32 v9, s1
2720; CI-NEXT:    v_add_i32_e32 v8, vcc, s0, v8
2721; CI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
2722; CI-NEXT:    v_add_i32_e32 v10, vcc, 16, v8
2723; CI-NEXT:    s_mov_b32 s2, 0xffff
2724; CI-NEXT:    v_mov_b32_e32 v12, s4
2725; CI-NEXT:    v_addc_u32_e32 v11, vcc, 0, v9, vcc
2726; CI-NEXT:    s_waitcnt vmcnt(1)
2727; CI-NEXT:    v_bfi_b32 v3, s2, v12, v3
2728; CI-NEXT:    s_waitcnt vmcnt(0)
2729; CI-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
2730; CI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
2731; CI-NEXT:    s_endpgm
2732;
2733; GFX11-LABEL: v_insertelement_v16i16_6:
2734; GFX11:       ; %bb.0:
2735; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
2736; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2737; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x10
2738; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2739; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
2740; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2741; GFX11-NEXT:    s_clause 0x1
2742; GFX11-NEXT:    global_load_b128 v[0:3], v8, s[2:3]
2743; GFX11-NEXT:    global_load_b128 v[4:7], v8, s[2:3] offset:16
2744; GFX11-NEXT:    s_waitcnt vmcnt(1)
2745; GFX11-NEXT:    v_bfi_b32 v3, 0xffff, s4, v3
2746; GFX11-NEXT:    s_waitcnt vmcnt(0)
2747; GFX11-NEXT:    s_clause 0x1
2748; GFX11-NEXT:    global_store_b128 v8, v[4:7], s[0:1] offset:16
2749; GFX11-NEXT:    global_store_b128 v8, v[0:3], s[0:1]
2750; GFX11-NEXT:    s_endpgm
; Test input: both GEPs are indexed by the sign-extended workitem id, so each
; lane reads and writes its own 32-byte vector.
2751  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
2752  %tid.ext = sext i32 %tid to i64
2753  %in.gep = getelementptr inbounds <16 x i16>, ptr addrspace(1) %in, i64 %tid.ext
2754  %out.gep = getelementptr inbounds <16 x i16>, ptr addrspace(1) %out, i64 %tid.ext
2755  %vec = load <16 x i16>, ptr addrspace(1) %in.gep
2756  %val.trunc = trunc i32 %val to i16
; NOTE(review): the i16-to-i16 bitcast below is a no-op; presumably retained to
; keep the same shape as the half-typed tests - confirm before removing.
2757  %val.cvt = bitcast i16 %val.trunc to i16
2758  %vecins = insertelement <16 x i16> %vec, i16 %val.cvt, i32 6
2759  store <16 x i16> %vecins, ptr addrspace(1) %out.gep
2760  ret void
2761}
2762
2763define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %n) {
2764; GFX9-LABEL: v_insertelement_v16f16_dynamic:
2765; GFX9:       ; %bb.0:
2766; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2767; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
2768; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
2769; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2770; GFX9-NEXT:    global_load_dwordx4 v[1:4], v0, s[2:3]
2771; GFX9-NEXT:    global_load_dwordx4 v[5:8], v0, s[2:3] offset:16
2772; GFX9-NEXT:    s_cmp_eq_u32 s5, 6
2773; GFX9-NEXT:    v_mov_b32_e32 v9, s4
2774; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
2775; GFX9-NEXT:    s_cmp_eq_u32 s5, 7
2776; GFX9-NEXT:    s_mov_b32 s2, 0x5040100
2777; GFX9-NEXT:    s_waitcnt vmcnt(1)
2778; GFX9-NEXT:    v_cndmask_b32_e32 v10, v4, v9, vcc
2779; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
2780; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
2781; GFX9-NEXT:    s_cmp_eq_u32 s5, 4
2782; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v9, vcc
2783; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
2784; GFX9-NEXT:    s_cmp_eq_u32 s5, 5
2785; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
2786; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v9, vcc
2787; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
2788; GFX9-NEXT:    s_cmp_eq_u32 s5, 2
2789; GFX9-NEXT:    v_perm_b32 v4, v4, v10, s2
2790; GFX9-NEXT:    v_cndmask_b32_e32 v10, v11, v9, vcc
2791; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
2792; GFX9-NEXT:    s_cmp_eq_u32 s5, 3
2793; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 16, v2
2794; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v9, vcc
2795; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
2796; GFX9-NEXT:    s_cmp_eq_u32 s5, 0
2797; GFX9-NEXT:    v_cndmask_b32_e32 v11, v12, v9, vcc
2798; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
2799; GFX9-NEXT:    s_cmp_eq_u32 s5, 1
2800; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v1
2801; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
2802; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
2803; GFX9-NEXT:    s_cmp_eq_u32 s5, 14
2804; GFX9-NEXT:    v_cndmask_b32_e32 v12, v13, v9, vcc
2805; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
2806; GFX9-NEXT:    s_cmp_eq_u32 s5, 15
2807; GFX9-NEXT:    s_waitcnt vmcnt(0)
2808; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 16, v8
2809; GFX9-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
2810; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
2811; GFX9-NEXT:    s_cmp_eq_u32 s5, 12
2812; GFX9-NEXT:    v_perm_b32 v1, v12, v1, s2
2813; GFX9-NEXT:    v_cndmask_b32_e32 v12, v14, v9, vcc
2814; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
2815; GFX9-NEXT:    s_cmp_eq_u32 s5, 13
2816; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v7
2817; GFX9-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
2818; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
2819; GFX9-NEXT:    s_cmp_eq_u32 s5, 10
2820; GFX9-NEXT:    v_perm_b32 v8, v12, v8, s2
2821; GFX9-NEXT:    v_cndmask_b32_e32 v12, v15, v9, vcc
2822; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
2823; GFX9-NEXT:    s_cmp_eq_u32 s5, 11
2824; GFX9-NEXT:    v_perm_b32 v3, v10, v3, s2
2825; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v6
2826; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v9, vcc
2827; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
2828; GFX9-NEXT:    s_cmp_eq_u32 s5, 8
2829; GFX9-NEXT:    v_cndmask_b32_e32 v10, v10, v9, vcc
2830; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
2831; GFX9-NEXT:    s_cmp_eq_u32 s5, 9
2832; GFX9-NEXT:    v_perm_b32 v2, v11, v2, s2
2833; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v5
2834; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v9, vcc
2835; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
2836; GFX9-NEXT:    v_cndmask_b32_e32 v9, v11, v9, vcc
2837; GFX9-NEXT:    v_perm_b32 v7, v12, v7, s2
2838; GFX9-NEXT:    v_perm_b32 v6, v10, v6, s2
2839; GFX9-NEXT:    v_perm_b32 v5, v9, v5, s2
2840; GFX9-NEXT:    global_store_dwordx4 v0, v[5:8], s[0:1] offset:16
2841; GFX9-NEXT:    global_store_dwordx4 v0, v[1:4], s[0:1]
2842; GFX9-NEXT:    s_endpgm
2843;
2844; VI-LABEL: v_insertelement_v16f16_dynamic:
2845; VI:       ; %bb.0:
2846; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2847; VI-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x10
2848; VI-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
2849; VI-NEXT:    s_waitcnt lgkmcnt(0)
2850; VI-NEXT:    v_mov_b32_e32 v0, s3
2851; VI-NEXT:    v_add_u32_e32 v4, vcc, s2, v8
2852; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v0, vcc
2853; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v4
2854; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v5, vcc
2855; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
2856; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
2857; VI-NEXT:    v_mov_b32_e32 v9, s1
2858; VI-NEXT:    v_add_u32_e32 v8, vcc, s0, v8
2859; VI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
2860; VI-NEXT:    v_add_u32_e32 v10, vcc, 16, v8
2861; VI-NEXT:    s_cmp_eq_u32 s7, 14
2862; VI-NEXT:    v_addc_u32_e32 v11, vcc, 0, v9, vcc
2863; VI-NEXT:    v_mov_b32_e32 v12, s6
2864; VI-NEXT:    s_cselect_b64 vcc, -1, 0
2865; VI-NEXT:    s_cmp_eq_u32 s7, 15
2866; VI-NEXT:    s_waitcnt vmcnt(1)
2867; VI-NEXT:    v_cndmask_b32_e32 v13, v3, v12, vcc
2868; VI-NEXT:    s_cselect_b64 vcc, -1, 0
2869; VI-NEXT:    s_cmp_eq_u32 s7, 12
2870; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
2871; VI-NEXT:    s_cmp_eq_u32 s7, 13
2872; VI-NEXT:    v_lshrrev_b32_e32 v14, 16, v2
2873; VI-NEXT:    v_cndmask_b32_e64 v2, v2, v12, s[0:1]
2874; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
2875; VI-NEXT:    s_cmp_eq_u32 s7, 10
2876; VI-NEXT:    s_cselect_b64 s[2:3], -1, 0
2877; VI-NEXT:    s_cmp_eq_u32 s7, 11
2878; VI-NEXT:    v_lshrrev_b32_e32 v15, 16, v1
2879; VI-NEXT:    v_cndmask_b32_e64 v1, v1, v12, s[2:3]
2880; VI-NEXT:    s_cselect_b64 s[2:3], -1, 0
2881; VI-NEXT:    s_cmp_eq_u32 s7, 8
2882; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
2883; VI-NEXT:    s_cselect_b64 s[4:5], -1, 0
2884; VI-NEXT:    v_cndmask_b32_e64 v15, v15, v12, s[2:3]
2885; VI-NEXT:    s_cmp_eq_u32 s7, 9
2886; VI-NEXT:    v_lshrrev_b32_e32 v16, 16, v0
2887; VI-NEXT:    v_cndmask_b32_e32 v3, v3, v12, vcc
2888; VI-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
2889; VI-NEXT:    s_cselect_b64 vcc, -1, 0
2890; VI-NEXT:    s_cmp_eq_u32 s7, 6
2891; VI-NEXT:    v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2892; VI-NEXT:    v_cndmask_b32_e32 v15, v16, v12, vcc
2893; VI-NEXT:    s_cselect_b64 vcc, -1, 0
2894; VI-NEXT:    s_cmp_eq_u32 s7, 7
2895; VI-NEXT:    v_cndmask_b32_e64 v0, v0, v12, s[4:5]
2896; VI-NEXT:    s_waitcnt vmcnt(0)
2897; VI-NEXT:    v_lshrrev_b32_e32 v17, 16, v7
2898; VI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
2899; VI-NEXT:    v_cndmask_b32_e64 v14, v14, v12, s[0:1]
2900; VI-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
2901; VI-NEXT:    v_cndmask_b32_e32 v7, v7, v12, vcc
2902; VI-NEXT:    s_cselect_b64 vcc, -1, 0
2903; VI-NEXT:    s_cmp_eq_u32 s7, 4
2904; VI-NEXT:    v_or_b32_sdwa v3, v13, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2905; VI-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
2906; VI-NEXT:    v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2907; VI-NEXT:    v_cndmask_b32_e32 v15, v17, v12, vcc
2908; VI-NEXT:    s_cselect_b64 vcc, -1, 0
2909; VI-NEXT:    s_cmp_eq_u32 s7, 5
2910; VI-NEXT:    v_or_b32_sdwa v2, v2, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2911; VI-NEXT:    v_lshrrev_b32_e32 v13, 16, v6
2912; VI-NEXT:    v_cndmask_b32_e32 v6, v6, v12, vcc
2913; VI-NEXT:    s_cselect_b64 vcc, -1, 0
2914; VI-NEXT:    s_cmp_eq_u32 s7, 2
2915; VI-NEXT:    v_cndmask_b32_e32 v13, v13, v12, vcc
2916; VI-NEXT:    s_cselect_b64 vcc, -1, 0
2917; VI-NEXT:    s_cmp_eq_u32 s7, 3
2918; VI-NEXT:    v_lshrrev_b32_e32 v14, 16, v5
2919; VI-NEXT:    v_cndmask_b32_e32 v5, v5, v12, vcc
2920; VI-NEXT:    s_cselect_b64 vcc, -1, 0
2921; VI-NEXT:    s_cmp_eq_u32 s7, 0
2922; VI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
2923; VI-NEXT:    v_cndmask_b32_e32 v14, v14, v12, vcc
2924; VI-NEXT:    s_cselect_b64 vcc, -1, 0
2925; VI-NEXT:    s_cmp_eq_u32 s7, 1
2926; VI-NEXT:    v_or_b32_sdwa v6, v6, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2927; VI-NEXT:    v_lshrrev_b32_e32 v13, 16, v4
2928; VI-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc
2929; VI-NEXT:    s_cselect_b64 vcc, -1, 0
2930; VI-NEXT:    v_cndmask_b32_e32 v12, v13, v12, vcc
2931; VI-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
2932; VI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
2933; VI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
2934; VI-NEXT:    v_or_b32_sdwa v7, v7, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2935; VI-NEXT:    v_or_b32_sdwa v5, v5, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2936; VI-NEXT:    v_or_b32_sdwa v4, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2937; VI-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
2938; VI-NEXT:    flat_store_dwordx4 v[10:11], v[0:3]
2939; VI-NEXT:    s_endpgm
2940;
2941; CI-LABEL: v_insertelement_v16f16_dynamic:
2942; CI:       ; %bb.0:
2943; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2944; CI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x4
2945; CI-NEXT:    v_lshlrev_b32_e32 v4, 5, v0
2946; CI-NEXT:    s_waitcnt lgkmcnt(0)
2947; CI-NEXT:    v_mov_b32_e32 v1, s3
2948; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v4
2949; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2950; CI-NEXT:    v_add_i32_e32 v2, vcc, 16, v0
2951; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
2952; CI-NEXT:    flat_load_dwordx4 v[7:10], v[2:3]
2953; CI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
2954; CI-NEXT:    v_mov_b32_e32 v5, s1
2955; CI-NEXT:    v_add_i32_e32 v4, vcc, s0, v4
2956; CI-NEXT:    v_cvt_f32_f16_e32 v6, s4
2957; CI-NEXT:    s_cmp_eq_u32 s5, 15
2958; CI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
2959; CI-NEXT:    s_cselect_b64 vcc, -1, 0
2960; CI-NEXT:    s_cmp_eq_u32 s5, 14
2961; CI-NEXT:    s_cselect_b64 s[0:1], -1, 0
2962; CI-NEXT:    s_cmp_eq_u32 s5, 13
2963; CI-NEXT:    s_cselect_b64 s[2:3], -1, 0
2964; CI-NEXT:    s_cmp_eq_u32 s5, 12
2965; CI-NEXT:    s_waitcnt vmcnt(1)
2966; CI-NEXT:    v_lshrrev_b32_e32 v11, 16, v10
2967; CI-NEXT:    v_cvt_f32_f16_e32 v10, v10
2968; CI-NEXT:    v_lshrrev_b32_e32 v12, 16, v9
2969; CI-NEXT:    v_lshrrev_b32_e32 v13, 16, v8
2970; CI-NEXT:    v_cvt_f32_f16_e32 v11, v11
2971; CI-NEXT:    v_cvt_f32_f16_e32 v12, v12
2972; CI-NEXT:    v_cvt_f32_f16_e32 v9, v9
2973; CI-NEXT:    v_cvt_f32_f16_e32 v13, v13
2974; CI-NEXT:    v_cvt_f32_f16_e32 v8, v8
2975; CI-NEXT:    v_cndmask_b32_e64 v10, v10, v6, s[0:1]
2976; CI-NEXT:    s_cselect_b64 s[0:1], -1, 0
2977; CI-NEXT:    s_cmp_eq_u32 s5, 11
2978; CI-NEXT:    v_cndmask_b32_e32 v11, v11, v6, vcc
2979; CI-NEXT:    v_cndmask_b32_e64 v12, v12, v6, s[2:3]
2980; CI-NEXT:    s_cselect_b64 vcc, -1, 0
2981; CI-NEXT:    s_cmp_eq_u32 s5, 10
2982; CI-NEXT:    v_cndmask_b32_e64 v9, v9, v6, s[0:1]
2983; CI-NEXT:    v_cndmask_b32_e32 v13, v13, v6, vcc
2984; CI-NEXT:    v_cvt_f16_f32_e32 v12, v12
2985; CI-NEXT:    s_cselect_b64 vcc, -1, 0
2986; CI-NEXT:    v_cvt_f16_f32_e32 v9, v9
2987; CI-NEXT:    v_cvt_f16_f32_e32 v13, v13
2988; CI-NEXT:    v_cndmask_b32_e32 v8, v8, v6, vcc
2989; CI-NEXT:    v_cvt_f16_f32_e32 v8, v8
2990; CI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
2991; CI-NEXT:    v_lshrrev_b32_e32 v14, 16, v7
2992; CI-NEXT:    v_or_b32_e32 v9, v9, v12
2993; CI-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
2994; CI-NEXT:    v_or_b32_e32 v8, v8, v12
2995; CI-NEXT:    v_cvt_f32_f16_e32 v12, v14
2996; CI-NEXT:    v_cvt_f32_f16_e32 v7, v7
2997; CI-NEXT:    s_waitcnt vmcnt(0)
2998; CI-NEXT:    v_lshrrev_b32_e32 v15, 16, v3
2999; CI-NEXT:    v_cvt_f32_f16_e32 v13, v15
3000; CI-NEXT:    s_cmp_eq_u32 s5, 9
3001; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
3002; CI-NEXT:    v_lshrrev_b32_e32 v16, 16, v2
3003; CI-NEXT:    s_cselect_b64 vcc, -1, 0
3004; CI-NEXT:    s_cmp_eq_u32 s5, 8
3005; CI-NEXT:    v_cvt_f32_f16_e32 v14, v16
3006; CI-NEXT:    v_cndmask_b32_e32 v12, v12, v6, vcc
3007; CI-NEXT:    s_cselect_b64 vcc, -1, 0
3008; CI-NEXT:    s_cmp_eq_u32 s5, 7
3009; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
3010; CI-NEXT:    v_cndmask_b32_e32 v7, v7, v6, vcc
3011; CI-NEXT:    s_cselect_b64 vcc, -1, 0
3012; CI-NEXT:    s_cmp_eq_u32 s5, 6
3013; CI-NEXT:    v_cndmask_b32_e32 v13, v13, v6, vcc
3014; CI-NEXT:    s_cselect_b64 vcc, -1, 0
3015; CI-NEXT:    s_cmp_eq_u32 s5, 5
3016; CI-NEXT:    v_cvt_f16_f32_e32 v11, v11
3017; CI-NEXT:    v_cvt_f16_f32_e32 v12, v12
3018; CI-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
3019; CI-NEXT:    s_cselect_b64 vcc, -1, 0
3020; CI-NEXT:    s_cmp_eq_u32 s5, 4
3021; CI-NEXT:    v_cvt_f16_f32_e32 v10, v10
3022; CI-NEXT:    v_cvt_f16_f32_e32 v7, v7
3023; CI-NEXT:    v_cvt_f16_f32_e32 v13, v13
3024; CI-NEXT:    v_cndmask_b32_e32 v14, v14, v6, vcc
3025; CI-NEXT:    s_cselect_b64 vcc, -1, 0
3026; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
3027; CI-NEXT:    v_cvt_f16_f32_e32 v14, v14
3028; CI-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
3029; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
3030; CI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
3031; CI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
3032; CI-NEXT:    v_or_b32_e32 v10, v10, v11
3033; CI-NEXT:    v_lshrrev_b32_e32 v11, 16, v1
3034; CI-NEXT:    v_or_b32_e32 v7, v7, v12
3035; CI-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
3036; CI-NEXT:    v_cvt_f32_f16_e32 v11, v11
3037; CI-NEXT:    v_or_b32_e32 v3, v3, v12
3038; CI-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
3039; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
3040; CI-NEXT:    v_or_b32_e32 v2, v2, v12
3041; CI-NEXT:    v_lshrrev_b32_e32 v12, 16, v0
3042; CI-NEXT:    s_cmp_eq_u32 s5, 3
3043; CI-NEXT:    v_cvt_f32_f16_e32 v12, v12
3044; CI-NEXT:    s_cselect_b64 vcc, -1, 0
3045; CI-NEXT:    s_cmp_eq_u32 s5, 2
3046; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
3047; CI-NEXT:    v_cndmask_b32_e32 v11, v11, v6, vcc
3048; CI-NEXT:    s_cselect_b64 vcc, -1, 0
3049; CI-NEXT:    s_cmp_eq_u32 s5, 1
3050; CI-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
3051; CI-NEXT:    s_cselect_b64 vcc, -1, 0
3052; CI-NEXT:    s_cmp_eq_u32 s5, 0
3053; CI-NEXT:    v_cvt_f16_f32_e32 v11, v11
3054; CI-NEXT:    v_cndmask_b32_e32 v12, v12, v6, vcc
3055; CI-NEXT:    s_cselect_b64 vcc, -1, 0
3056; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
3057; CI-NEXT:    v_cvt_f16_f32_e32 v12, v12
3058; CI-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
3059; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
3060; CI-NEXT:    v_lshlrev_b32_e32 v6, 16, v11
3061; CI-NEXT:    v_or_b32_e32 v1, v1, v6
3062; CI-NEXT:    v_lshlrev_b32_e32 v6, 16, v12
3063; CI-NEXT:    v_or_b32_e32 v0, v0, v6
3064; CI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
3065; CI-NEXT:    s_nop 0
3066; CI-NEXT:    v_add_i32_e32 v0, vcc, 16, v4
3067; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v5, vcc
3068; CI-NEXT:    flat_store_dwordx4 v[0:1], v[7:10]
3069; CI-NEXT:    s_endpgm
3070;
3071; GFX11-LABEL: v_insertelement_v16f16_dynamic:
3072; GFX11:       ; %bb.0:
3073; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
3074; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
3075; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x10
3076; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3077; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
3078; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
3079; GFX11-NEXT:    s_clause 0x1
3080; GFX11-NEXT:    global_load_b128 v[0:3], v8, s[2:3]
3081; GFX11-NEXT:    global_load_b128 v[4:7], v8, s[2:3] offset:16
3082; GFX11-NEXT:    s_cmp_eq_u32 s5, 6
3083; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
3084; GFX11-NEXT:    s_cmp_eq_u32 s5, 7
3085; GFX11-NEXT:    s_waitcnt vmcnt(1)
3086; GFX11-NEXT:    v_cndmask_b32_e64 v9, v3, s4, s2
3087; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
3088; GFX11-NEXT:    s_cmp_eq_u32 s5, 4
3089; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
3090; GFX11-NEXT:    s_cselect_b32 s3, -1, 0
3091; GFX11-NEXT:    s_cmp_eq_u32 s5, 5
3092; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
3093; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, s4, s3
3094; GFX11-NEXT:    s_cselect_b32 s3, -1, 0
3095; GFX11-NEXT:    s_cmp_eq_u32 s5, 2
3096; GFX11-NEXT:    v_lshrrev_b32_e32 v11, 16, v1
3097; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, s4, s2
3098; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
3099; GFX11-NEXT:    s_cmp_eq_u32 s5, 3
3100; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s4, s2
3101; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
3102; GFX11-NEXT:    s_cmp_eq_u32 s5, 0
3103; GFX11-NEXT:    v_lshrrev_b32_e32 v12, 16, v0
3104; GFX11-NEXT:    v_cndmask_b32_e64 v11, v11, s4, s2
3105; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
3106; GFX11-NEXT:    s_cmp_eq_u32 s5, 1
3107; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, s4, s2
3108; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
3109; GFX11-NEXT:    s_cmp_eq_u32 s5, 14
3110; GFX11-NEXT:    s_waitcnt vmcnt(0)
3111; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 16, v7
3112; GFX11-NEXT:    v_cndmask_b32_e64 v10, v10, s4, s3
3113; GFX11-NEXT:    v_perm_b32 v3, v3, v9, 0x5040100
3114; GFX11-NEXT:    v_cndmask_b32_e64 v9, v12, s4, s2
3115; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
3116; GFX11-NEXT:    s_cmp_eq_u32 s5, 15
3117; GFX11-NEXT:    v_cndmask_b32_e64 v7, v7, s4, s2
3118; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
3119; GFX11-NEXT:    s_cmp_eq_u32 s5, 12
3120; GFX11-NEXT:    v_lshrrev_b32_e32 v14, 16, v6
3121; GFX11-NEXT:    v_perm_b32 v2, v10, v2, 0x5040100
3122; GFX11-NEXT:    v_cndmask_b32_e64 v10, v13, s4, s2
3123; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
3124; GFX11-NEXT:    s_cmp_eq_u32 s5, 13
3125; GFX11-NEXT:    v_cndmask_b32_e64 v6, v6, s4, s2
3126; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
3127; GFX11-NEXT:    s_cmp_eq_u32 s5, 10
3128; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 16, v5
3129; GFX11-NEXT:    v_cndmask_b32_e64 v12, v14, s4, s2
3130; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
3131; GFX11-NEXT:    s_cmp_eq_u32 s5, 11
3132; GFX11-NEXT:    v_cndmask_b32_e64 v5, v5, s4, s2
3133; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
3134; GFX11-NEXT:    s_cmp_eq_u32 s5, 8
3135; GFX11-NEXT:    v_lshrrev_b32_e32 v16, 16, v4
3136; GFX11-NEXT:    v_cndmask_b32_e64 v13, v15, s4, s2
3137; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
3138; GFX11-NEXT:    s_cmp_eq_u32 s5, 9
3139; GFX11-NEXT:    v_cndmask_b32_e64 v4, v4, s4, s2
3140; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
3141; GFX11-NEXT:    v_perm_b32 v7, v10, v7, 0x5040100
3142; GFX11-NEXT:    v_cndmask_b32_e64 v14, v16, s4, s2
3143; GFX11-NEXT:    v_perm_b32 v6, v12, v6, 0x5040100
3144; GFX11-NEXT:    v_perm_b32 v5, v13, v5, 0x5040100
3145; GFX11-NEXT:    v_perm_b32 v1, v11, v1, 0x5040100
3146; GFX11-NEXT:    v_perm_b32 v0, v9, v0, 0x5040100
3147; GFX11-NEXT:    v_perm_b32 v4, v14, v4, 0x5040100
3148; GFX11-NEXT:    s_clause 0x1
3149; GFX11-NEXT:    global_store_b128 v8, v[4:7], s[0:1] offset:16
3150; GFX11-NEXT:    global_store_b128 v8, v[0:3], s[0:1]
3151; GFX11-NEXT:    s_endpgm
3152  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
3153  %tid.ext = sext i32 %tid to i64
3154  %in.gep = getelementptr inbounds <16 x half>, ptr addrspace(1) %in, i64 %tid.ext
3155  %out.gep = getelementptr inbounds <16 x half>, ptr addrspace(1) %out, i64 %tid.ext
3156  %vec = load <16 x half>, ptr addrspace(1) %in.gep
3157  %val.trunc = trunc i32 %val to i16
3158  %val.cvt = bitcast i16 %val.trunc to half
3159  %vecins = insertelement <16 x half> %vec, half %val.cvt, i32 %n
3160  store <16 x half> %vecins, ptr addrspace(1) %out.gep
3161  ret void
3162}
3163
3164
; Declaration of the intrinsic that the test bodies call to produce %tid
; (see `%tid = call i32 @llvm.amdgcn.workitem.id.x() #1`). It takes no
; arguments, returns an i32, and carries attribute group #1.
3165declare i32 @llvm.amdgcn.workitem.id.x() #1
3166
; Attribute group #0: referenced by the kernel definitions in this file
; (e.g. `define amdgpu_kernel void @s_insertelement_v2i16_0(...) #0`).
3167attributes #0 = { nounwind }
; Attribute group #1: referenced by the workitem.id.x declaration above and
; by its call sites.
3168attributes #1 = { nounwind readnone }
3169