xref: /llvm-project/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll (revision 3277c7cd28154e33637a168acb26cea7ac1f7fff)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
3; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck --check-prefix=GFX8 %s
4; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s
5; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
6; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s
7
; Dynamic insertelement into a <2 x i16> that is loaded through an inreg
; addrspace(4) pointer; the inserted value and the index are both inreg.
; The updated vector is stored to the null addrspace(1) pointer.
; The checks below expect the insert to be done purely with scalar ALU ops on
; the packed dword:
;   shift  = (idx & 1) << 4
;   result = (vec & ~(0xffff << shift)) | ((val & 0xffff) << shift)
8define amdgpu_ps void @insertelement_s_v2i16_s_s(ptr addrspace(4) inreg %ptr, i16 inreg %val, i32 inreg %idx) {
9; GFX9-LABEL: insertelement_s_v2i16_s_s:
10; GFX9:       ; %bb.0:
11; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
12; GFX9-NEXT:    s_and_b32 s1, s5, 1
13; GFX9-NEXT:    s_lshl_b32 s1, s1, 4
14; GFX9-NEXT:    s_and_b32 s2, s4, 0xffff
15; GFX9-NEXT:    s_lshl_b32 s2, s2, s1
16; GFX9-NEXT:    s_lshl_b32 s1, 0xffff, s1
17; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
18; GFX9-NEXT:    s_andn2_b32 s0, s0, s1
19; GFX9-NEXT:    s_or_b32 s0, s0, s2
20; GFX9-NEXT:    v_mov_b32_e32 v0, 0
21; GFX9-NEXT:    v_mov_b32_e32 v1, 0
22; GFX9-NEXT:    v_mov_b32_e32 v2, s0
23; GFX9-NEXT:    global_store_dword v[0:1], v2, off
24; GFX9-NEXT:    s_endpgm
25;
26; GFX8-LABEL: insertelement_s_v2i16_s_s:
27; GFX8:       ; %bb.0:
28; GFX8-NEXT:    s_load_dword s0, s[2:3], 0x0
29; GFX8-NEXT:    s_and_b32 s1, s5, 1
30; GFX8-NEXT:    s_lshl_b32 s1, s1, 4
31; GFX8-NEXT:    s_and_b32 s2, s4, 0xffff
32; GFX8-NEXT:    s_lshl_b32 s2, s2, s1
33; GFX8-NEXT:    s_lshl_b32 s1, 0xffff, s1
34; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
35; GFX8-NEXT:    s_andn2_b32 s0, s0, s1
36; GFX8-NEXT:    s_or_b32 s0, s0, s2
37; GFX8-NEXT:    v_mov_b32_e32 v0, 0
38; GFX8-NEXT:    v_mov_b32_e32 v1, 0
39; GFX8-NEXT:    v_mov_b32_e32 v2, s0
40; GFX8-NEXT:    flat_store_dword v[0:1], v2
41; GFX8-NEXT:    s_endpgm
42;
43; GFX7-LABEL: insertelement_s_v2i16_s_s:
44; GFX7:       ; %bb.0:
45; GFX7-NEXT:    s_load_dword s0, s[2:3], 0x0
46; GFX7-NEXT:    s_and_b32 s1, s5, 1
47; GFX7-NEXT:    s_lshl_b32 s1, s1, 4
48; GFX7-NEXT:    s_and_b32 s2, s4, 0xffff
49; GFX7-NEXT:    s_lshl_b32 s2, s2, s1
50; GFX7-NEXT:    s_lshl_b32 s1, 0xffff, s1
51; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
52; GFX7-NEXT:    s_andn2_b32 s0, s0, s1
53; GFX7-NEXT:    s_or_b32 s2, s0, s2
54; GFX7-NEXT:    s_mov_b64 s[0:1], 0
55; GFX7-NEXT:    v_mov_b32_e32 v0, s2
56; GFX7-NEXT:    s_mov_b32 s2, -1
57; GFX7-NEXT:    s_mov_b32 s3, 0xf000
58; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
59; GFX7-NEXT:    s_endpgm
60;
61; GFX10-LABEL: insertelement_s_v2i16_s_s:
62; GFX10:       ; %bb.0:
63; GFX10-NEXT:    s_load_dword s0, s[2:3], 0x0
64; GFX10-NEXT:    s_and_b32 s1, s5, 1
65; GFX10-NEXT:    s_and_b32 s2, s4, 0xffff
66; GFX10-NEXT:    s_lshl_b32 s1, s1, 4
67; GFX10-NEXT:    v_mov_b32_e32 v0, 0
68; GFX10-NEXT:    s_lshl_b32 s3, 0xffff, s1
69; GFX10-NEXT:    s_lshl_b32 s1, s2, s1
70; GFX10-NEXT:    v_mov_b32_e32 v1, 0
71; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
72; GFX10-NEXT:    s_andn2_b32 s0, s0, s3
73; GFX10-NEXT:    s_or_b32 s0, s0, s1
74; GFX10-NEXT:    v_mov_b32_e32 v2, s0
75; GFX10-NEXT:    global_store_dword v[0:1], v2, off
76; GFX10-NEXT:    s_endpgm
77;
78; GFX11-LABEL: insertelement_s_v2i16_s_s:
79; GFX11:       ; %bb.0:
80; GFX11-NEXT:    s_load_b32 s0, s[2:3], 0x0
81; GFX11-NEXT:    s_and_b32 s1, s5, 1
82; GFX11-NEXT:    s_and_b32 s2, s4, 0xffff
83; GFX11-NEXT:    s_lshl_b32 s1, s1, 4
84; GFX11-NEXT:    v_mov_b32_e32 v0, 0
85; GFX11-NEXT:    s_lshl_b32 s3, 0xffff, s1
86; GFX11-NEXT:    s_lshl_b32 s1, s2, s1
87; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
88; GFX11-NEXT:    s_and_not1_b32 s0, s0, s3
89; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
90; GFX11-NEXT:    s_or_b32 s0, s0, s1
91; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
92; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
93; GFX11-NEXT:    s_endpgm
94  %vec = load <2 x i16>, ptr addrspace(4) %ptr
95  %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx
96  store <2 x i16> %insert, ptr addrspace(1) null
97  ret void
98}
99
; Dynamic insertelement into a <2 x i16> that is loaded through a non-inreg
; addrspace(1) pointer; the inserted value and the index are both inreg.
; The updated vector is stored to the null addrspace(1) pointer.
; The checks below expect the shifted value and the inverted lane mask to be
; computed with scalar ALU ops, then merged into the loaded dword with vector
; and/or instructions (a single v_and_or_b32 where the expected output has it).
100define amdgpu_ps void @insertelement_v_v2i16_s_s(ptr addrspace(1) %ptr, i16 inreg %val, i32 inreg %idx) {
101; GFX9-LABEL: insertelement_v_v2i16_s_s:
102; GFX9:       ; %bb.0:
103; GFX9-NEXT:    global_load_dword v2, v[0:1], off
104; GFX9-NEXT:    s_and_b32 s0, s3, 1
105; GFX9-NEXT:    s_lshl_b32 s0, s0, 4
106; GFX9-NEXT:    s_and_b32 s1, s2, 0xffff
107; GFX9-NEXT:    s_lshl_b32 s1, s1, s0
108; GFX9-NEXT:    s_lshl_b32 s0, 0xffff, s0
109; GFX9-NEXT:    s_not_b32 s0, s0
110; GFX9-NEXT:    v_mov_b32_e32 v3, s1
111; GFX9-NEXT:    v_mov_b32_e32 v0, 0
112; GFX9-NEXT:    v_mov_b32_e32 v1, 0
113; GFX9-NEXT:    s_waitcnt vmcnt(0)
114; GFX9-NEXT:    v_and_or_b32 v2, v2, s0, v3
115; GFX9-NEXT:    global_store_dword v[0:1], v2, off
116; GFX9-NEXT:    s_endpgm
117;
118; GFX8-LABEL: insertelement_v_v2i16_s_s:
119; GFX8:       ; %bb.0:
120; GFX8-NEXT:    flat_load_dword v0, v[0:1]
121; GFX8-NEXT:    s_and_b32 s0, s3, 1
122; GFX8-NEXT:    s_and_b32 s1, s2, 0xffff
123; GFX8-NEXT:    s_lshl_b32 s0, s0, 4
124; GFX8-NEXT:    s_lshl_b32 s1, s1, s0
125; GFX8-NEXT:    s_lshl_b32 s0, 0xffff, s0
126; GFX8-NEXT:    s_not_b32 s0, s0
127; GFX8-NEXT:    s_waitcnt vmcnt(0)
128; GFX8-NEXT:    v_and_b32_e32 v2, s0, v0
129; GFX8-NEXT:    v_mov_b32_e32 v0, 0
130; GFX8-NEXT:    v_mov_b32_e32 v1, 0
131; GFX8-NEXT:    v_or_b32_e32 v2, s1, v2
132; GFX8-NEXT:    flat_store_dword v[0:1], v2
133; GFX8-NEXT:    s_endpgm
134;
135; GFX7-LABEL: insertelement_v_v2i16_s_s:
136; GFX7:       ; %bb.0:
137; GFX7-NEXT:    s_mov_b32 s6, 0
138; GFX7-NEXT:    s_mov_b32 s7, 0xf000
139; GFX7-NEXT:    s_mov_b64 s[4:5], 0
140; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
141; GFX7-NEXT:    s_and_b32 s0, s3, 1
142; GFX7-NEXT:    s_and_b32 s1, s2, 0xffff
143; GFX7-NEXT:    s_lshl_b32 s0, s0, 4
144; GFX7-NEXT:    s_lshl_b32 s1, s1, s0
145; GFX7-NEXT:    s_lshl_b32 s0, 0xffff, s0
146; GFX7-NEXT:    s_not_b32 s0, s0
147; GFX7-NEXT:    s_mov_b64 s[4:5], 0
148; GFX7-NEXT:    s_mov_b32 s6, -1
149; GFX7-NEXT:    s_waitcnt vmcnt(0)
150; GFX7-NEXT:    v_and_b32_e32 v0, s0, v0
151; GFX7-NEXT:    v_or_b32_e32 v0, s1, v0
152; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
153; GFX7-NEXT:    s_endpgm
154;
155; GFX10-LABEL: insertelement_v_v2i16_s_s:
156; GFX10:       ; %bb.0:
157; GFX10-NEXT:    global_load_dword v2, v[0:1], off
158; GFX10-NEXT:    s_and_b32 s0, s3, 1
159; GFX10-NEXT:    s_and_b32 s1, s2, 0xffff
160; GFX10-NEXT:    s_lshl_b32 s0, s0, 4
161; GFX10-NEXT:    v_mov_b32_e32 v0, 0
162; GFX10-NEXT:    s_lshl_b32 s2, 0xffff, s0
163; GFX10-NEXT:    s_lshl_b32 s0, s1, s0
164; GFX10-NEXT:    s_not_b32 s1, s2
165; GFX10-NEXT:    v_mov_b32_e32 v1, 0
166; GFX10-NEXT:    s_waitcnt vmcnt(0)
167; GFX10-NEXT:    v_and_or_b32 v2, v2, s1, s0
168; GFX10-NEXT:    global_store_dword v[0:1], v2, off
169; GFX10-NEXT:    s_endpgm
170;
171; GFX11-LABEL: insertelement_v_v2i16_s_s:
172; GFX11:       ; %bb.0:
173; GFX11-NEXT:    global_load_b32 v2, v[0:1], off
174; GFX11-NEXT:    s_and_b32 s0, s3, 1
175; GFX11-NEXT:    s_and_b32 s1, s2, 0xffff
176; GFX11-NEXT:    s_lshl_b32 s0, s0, 4
177; GFX11-NEXT:    v_mov_b32_e32 v0, 0
178; GFX11-NEXT:    s_lshl_b32 s2, 0xffff, s0
179; GFX11-NEXT:    s_lshl_b32 s0, s1, s0
180; GFX11-NEXT:    s_not_b32 s1, s2
181; GFX11-NEXT:    v_mov_b32_e32 v1, 0
182; GFX11-NEXT:    s_waitcnt vmcnt(0)
183; GFX11-NEXT:    v_and_or_b32 v2, v2, s1, s0
184; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
185; GFX11-NEXT:    s_endpgm
186  %vec = load <2 x i16>, ptr addrspace(1 ) %ptr
187  %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx
188  store <2 x i16> %insert, ptr addrspace(1) null
189  ret void
190}
191
; Dynamic insertelement into a <2 x i16> that is loaded through an inreg
; addrspace(4) pointer; the inserted value is not inreg, the index is inreg.
; The updated vector is stored to the null addrspace(1) pointer.
; The checks below expect the old lane to be cleared with a scalar and-not
; (s_andn2_b32 / s_and_not1_b32), while the value is shifted into place and
; OR'd in with vector instructions.
192define amdgpu_ps void @insertelement_s_v2i16_v_s(ptr addrspace(4) inreg %ptr, i16 %val, i32 inreg %idx) {
193; GFX9-LABEL: insertelement_s_v2i16_v_s:
194; GFX9:       ; %bb.0:
195; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
196; GFX9-NEXT:    s_and_b32 s1, s4, 1
197; GFX9-NEXT:    s_lshl_b32 s1, s1, 4
198; GFX9-NEXT:    s_lshl_b32 s2, 0xffff, s1
199; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v0
200; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
201; GFX9-NEXT:    s_andn2_b32 s0, s0, s2
202; GFX9-NEXT:    v_mov_b32_e32 v3, s0
203; GFX9-NEXT:    v_mov_b32_e32 v0, 0
204; GFX9-NEXT:    v_mov_b32_e32 v1, 0
205; GFX9-NEXT:    v_lshl_or_b32 v2, v2, s1, v3
206; GFX9-NEXT:    global_store_dword v[0:1], v2, off
207; GFX9-NEXT:    s_endpgm
208;
209; GFX8-LABEL: insertelement_s_v2i16_v_s:
210; GFX8:       ; %bb.0:
211; GFX8-NEXT:    s_load_dword s0, s[2:3], 0x0
212; GFX8-NEXT:    s_and_b32 s1, s4, 1
213; GFX8-NEXT:    s_lshl_b32 s1, s1, 4
214; GFX8-NEXT:    v_mov_b32_e32 v1, s1
215; GFX8-NEXT:    s_lshl_b32 s1, 0xffff, s1
216; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
217; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
218; GFX8-NEXT:    s_andn2_b32 s0, s0, s1
219; GFX8-NEXT:    v_mov_b32_e32 v0, 0
220; GFX8-NEXT:    v_mov_b32_e32 v1, 0
221; GFX8-NEXT:    v_or_b32_e32 v2, s0, v2
222; GFX8-NEXT:    flat_store_dword v[0:1], v2
223; GFX8-NEXT:    s_endpgm
224;
225; GFX7-LABEL: insertelement_s_v2i16_v_s:
226; GFX7:       ; %bb.0:
227; GFX7-NEXT:    s_load_dword s0, s[2:3], 0x0
228; GFX7-NEXT:    s_and_b32 s1, s4, 1
229; GFX7-NEXT:    s_lshl_b32 s1, s1, 4
230; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
231; GFX7-NEXT:    v_lshlrev_b32_e32 v0, s1, v0
232; GFX7-NEXT:    s_lshl_b32 s1, 0xffff, s1
233; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
234; GFX7-NEXT:    s_andn2_b32 s0, s0, s1
235; GFX7-NEXT:    v_or_b32_e32 v0, s0, v0
236; GFX7-NEXT:    s_mov_b64 s[0:1], 0
237; GFX7-NEXT:    s_mov_b32 s2, -1
238; GFX7-NEXT:    s_mov_b32 s3, 0xf000
239; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
240; GFX7-NEXT:    s_endpgm
241;
242; GFX10-LABEL: insertelement_s_v2i16_v_s:
243; GFX10:       ; %bb.0:
244; GFX10-NEXT:    s_load_dword s0, s[2:3], 0x0
245; GFX10-NEXT:    s_and_b32 s1, s4, 1
246; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v0
247; GFX10-NEXT:    s_lshl_b32 s1, s1, 4
248; GFX10-NEXT:    v_mov_b32_e32 v0, 0
249; GFX10-NEXT:    s_lshl_b32 s2, 0xffff, s1
250; GFX10-NEXT:    v_mov_b32_e32 v1, 0
251; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
252; GFX10-NEXT:    s_andn2_b32 s0, s0, s2
253; GFX10-NEXT:    v_lshl_or_b32 v2, v2, s1, s0
254; GFX10-NEXT:    global_store_dword v[0:1], v2, off
255; GFX10-NEXT:    s_endpgm
256;
257; GFX11-LABEL: insertelement_s_v2i16_v_s:
258; GFX11:       ; %bb.0:
259; GFX11-NEXT:    s_load_b32 s0, s[2:3], 0x0
260; GFX11-NEXT:    s_and_b32 s1, s4, 1
261; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v0
262; GFX11-NEXT:    s_lshl_b32 s1, s1, 4
263; GFX11-NEXT:    v_mov_b32_e32 v0, 0
264; GFX11-NEXT:    v_mov_b32_e32 v1, 0
265; GFX11-NEXT:    s_lshl_b32 s2, 0xffff, s1
266; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
267; GFX11-NEXT:    s_and_not1_b32 s0, s0, s2
268; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
269; GFX11-NEXT:    v_lshl_or_b32 v2, v2, s1, s0
270; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
271; GFX11-NEXT:    s_endpgm
272  %vec = load <2 x i16>, ptr addrspace(4) %ptr
273  %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx
274  store <2 x i16> %insert, ptr addrspace(1) null
275  ret void
276}
277
; Dynamic insertelement into a <2 x i16> that is loaded through an inreg
; addrspace(4) pointer; the inserted value is inreg, the index is not inreg.
; The updated vector is stored to the null addrspace(1) pointer.
; The checks below expect the index masking/scaling, both shifts, the mask
; inversion and the final merge to be vector instructions; only the 0xffff
; masking of the value stays scalar (s_and_b32).
278define amdgpu_ps void @insertelement_s_v2i16_s_v(ptr addrspace(4) inreg %ptr, i16 inreg %val, i32 %idx) {
279; GFX9-LABEL: insertelement_s_v2i16_s_v:
280; GFX9:       ; %bb.0:
281; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
282; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
283; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
284; GFX9-NEXT:    s_and_b32 s1, s4, 0xffff
285; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff
286; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v0, s1
287; GFX9-NEXT:    v_lshlrev_b32_e32 v0, v0, v1
288; GFX9-NEXT:    v_not_b32_e32 v3, v0
289; GFX9-NEXT:    v_mov_b32_e32 v0, 0
290; GFX9-NEXT:    v_mov_b32_e32 v1, 0
291; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
292; GFX9-NEXT:    v_and_or_b32 v2, s0, v3, v2
293; GFX9-NEXT:    global_store_dword v[0:1], v2, off
294; GFX9-NEXT:    s_endpgm
295;
296; GFX8-LABEL: insertelement_s_v2i16_s_v:
297; GFX8:       ; %bb.0:
298; GFX8-NEXT:    s_load_dword s0, s[2:3], 0x0
299; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
300; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
301; GFX8-NEXT:    s_and_b32 s1, s4, 0xffff
302; GFX8-NEXT:    v_mov_b32_e32 v1, 0xffff
303; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v0, s1
304; GFX8-NEXT:    v_lshlrev_b32_e32 v0, v0, v1
305; GFX8-NEXT:    v_not_b32_e32 v0, v0
306; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
307; GFX8-NEXT:    v_and_b32_e32 v3, s0, v0
308; GFX8-NEXT:    v_mov_b32_e32 v0, 0
309; GFX8-NEXT:    v_mov_b32_e32 v1, 0
310; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
311; GFX8-NEXT:    flat_store_dword v[0:1], v2
312; GFX8-NEXT:    s_endpgm
313;
314; GFX7-LABEL: insertelement_s_v2i16_s_v:
315; GFX7:       ; %bb.0:
316; GFX7-NEXT:    s_load_dword s0, s[2:3], 0x0
317; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
318; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
319; GFX7-NEXT:    s_and_b32 s1, s4, 0xffff
320; GFX7-NEXT:    v_lshl_b32_e32 v1, s1, v0
321; GFX7-NEXT:    v_lshl_b32_e32 v0, 0xffff, v0
322; GFX7-NEXT:    v_not_b32_e32 v0, v0
323; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
324; GFX7-NEXT:    v_and_b32_e32 v0, s0, v0
325; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
326; GFX7-NEXT:    s_mov_b64 s[0:1], 0
327; GFX7-NEXT:    s_mov_b32 s2, -1
328; GFX7-NEXT:    s_mov_b32 s3, 0xf000
329; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
330; GFX7-NEXT:    s_endpgm
331;
332; GFX10-LABEL: insertelement_s_v2i16_s_v:
333; GFX10:       ; %bb.0:
334; GFX10-NEXT:    s_load_dword s0, s[2:3], 0x0
335; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
336; GFX10-NEXT:    s_and_b32 s1, s4, 0xffff
337; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
338; GFX10-NEXT:    v_lshlrev_b32_e64 v1, v0, 0xffff
339; GFX10-NEXT:    v_lshlrev_b32_e64 v2, v0, s1
340; GFX10-NEXT:    v_not_b32_e32 v3, v1
341; GFX10-NEXT:    v_mov_b32_e32 v0, 0
342; GFX10-NEXT:    v_mov_b32_e32 v1, 0
343; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
344; GFX10-NEXT:    v_and_or_b32 v2, s0, v3, v2
345; GFX10-NEXT:    global_store_dword v[0:1], v2, off
346; GFX10-NEXT:    s_endpgm
347;
348; GFX11-LABEL: insertelement_s_v2i16_s_v:
349; GFX11:       ; %bb.0:
350; GFX11-NEXT:    s_load_b32 s0, s[2:3], 0x0
351; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
352; GFX11-NEXT:    s_and_b32 s1, s4, 0xffff
353; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
354; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
355; GFX11-NEXT:    v_lshlrev_b32_e64 v1, v0, 0xffff
356; GFX11-NEXT:    v_lshlrev_b32_e64 v2, v0, s1
357; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
358; GFX11-NEXT:    v_not_b32_e32 v3, v1
359; GFX11-NEXT:    v_mov_b32_e32 v0, 0
360; GFX11-NEXT:    v_mov_b32_e32 v1, 0
361; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
362; GFX11-NEXT:    v_and_or_b32 v2, s0, v3, v2
363; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
364; GFX11-NEXT:    s_endpgm
365  %vec = load <2 x i16>, ptr addrspace(4) %ptr
366  %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx
367  store <2 x i16> %insert, ptr addrspace(1) null
368  ret void
369}
370
; Dynamic insertelement into a <2 x i16> that is loaded through an inreg
; addrspace(4) pointer; neither the inserted value nor the index is inreg.
; The updated vector is stored to the null addrspace(1) pointer.
; The checks below expect a scalar load of the packed dword, with all of the
; insert arithmetic (index mask/scale, shifts, mask inversion, merge) done by
; vector instructions.
371define amdgpu_ps void @insertelement_s_v2i16_v_v(ptr addrspace(4) inreg %ptr, i16 %val, i32 %idx) {
372; GFX9-LABEL: insertelement_s_v2i16_v_v:
373; GFX9:       ; %bb.0:
374; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
375; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
376; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
377; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff
378; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
379; GFX9-NEXT:    v_lshlrev_b32_e32 v0, v1, v2
380; GFX9-NEXT:    v_not_b32_e32 v2, v0
381; GFX9-NEXT:    v_mov_b32_e32 v0, 0
382; GFX9-NEXT:    v_mov_b32_e32 v1, 0
383; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
384; GFX9-NEXT:    v_and_or_b32 v2, s0, v2, v3
385; GFX9-NEXT:    global_store_dword v[0:1], v2, off
386; GFX9-NEXT:    s_endpgm
387;
388; GFX8-LABEL: insertelement_s_v2i16_v_v:
389; GFX8:       ; %bb.0:
390; GFX8-NEXT:    s_load_dword s0, s[2:3], 0x0
391; GFX8-NEXT:    v_and_b32_e32 v1, 1, v1
392; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
393; GFX8-NEXT:    v_mov_b32_e32 v2, 0xffff
394; GFX8-NEXT:    v_lshlrev_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
395; GFX8-NEXT:    v_lshlrev_b32_e32 v0, v1, v2
396; GFX8-NEXT:    v_not_b32_e32 v0, v0
397; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
398; GFX8-NEXT:    v_and_b32_e32 v2, s0, v0
399; GFX8-NEXT:    v_mov_b32_e32 v0, 0
400; GFX8-NEXT:    v_mov_b32_e32 v1, 0
401; GFX8-NEXT:    v_or_b32_e32 v2, v2, v3
402; GFX8-NEXT:    flat_store_dword v[0:1], v2
403; GFX8-NEXT:    s_endpgm
404;
405; GFX7-LABEL: insertelement_s_v2i16_v_v:
406; GFX7:       ; %bb.0:
407; GFX7-NEXT:    s_load_dword s0, s[2:3], 0x0
408; GFX7-NEXT:    v_and_b32_e32 v1, 1, v1
409; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
410; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
411; GFX7-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
412; GFX7-NEXT:    v_lshl_b32_e32 v1, 0xffff, v1
413; GFX7-NEXT:    v_not_b32_e32 v1, v1
414; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
415; GFX7-NEXT:    v_and_b32_e32 v1, s0, v1
416; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
417; GFX7-NEXT:    s_mov_b64 s[0:1], 0
418; GFX7-NEXT:    s_mov_b32 s2, -1
419; GFX7-NEXT:    s_mov_b32 s3, 0xf000
420; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
421; GFX7-NEXT:    s_endpgm
422;
423; GFX10-LABEL: insertelement_s_v2i16_v_v:
424; GFX10:       ; %bb.0:
425; GFX10-NEXT:    s_load_dword s0, s[2:3], 0x0
426; GFX10-NEXT:    v_and_b32_e32 v1, 1, v1
427; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
428; GFX10-NEXT:    v_lshlrev_b32_e64 v2, v1, 0xffff
429; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
430; GFX10-NEXT:    v_mov_b32_e32 v0, 0
431; GFX10-NEXT:    v_mov_b32_e32 v1, 0
432; GFX10-NEXT:    v_not_b32_e32 v2, v2
433; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
434; GFX10-NEXT:    v_and_or_b32 v2, s0, v2, v3
435; GFX10-NEXT:    global_store_dword v[0:1], v2, off
436; GFX10-NEXT:    s_endpgm
437;
438; GFX11-LABEL: insertelement_s_v2i16_v_v:
439; GFX11:       ; %bb.0:
440; GFX11-NEXT:    s_load_b32 s0, s[2:3], 0x0
441; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
442; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
443; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
444; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
445; GFX11-NEXT:    v_lshlrev_b32_e64 v2, v1, 0xffff
446; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
447; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_lshlrev_b32 v3, v1, v0
448; GFX11-NEXT:    v_mov_b32_e32 v1, 0
449; GFX11-NEXT:    v_not_b32_e32 v2, v2
450; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
451; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
452; GFX11-NEXT:    v_and_or_b32 v2, s0, v2, v3
453; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
454; GFX11-NEXT:    s_endpgm
455  %vec = load <2 x i16>, ptr addrspace(4) %ptr
456  %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx
457  store <2 x i16> %insert, ptr addrspace(1) null
458  ret void
459}
460
; Dynamic insertelement into a <2 x i16> that is loaded through a non-inreg
; addrspace(1) pointer; the inserted value is inreg, the index is not inreg.
; The updated vector is stored to the null addrspace(1) pointer.
; The checks below expect the 0xffff masking of the value to be a scalar
; s_and_b32, with the index handling, shifts, mask inversion and merge done by
; vector instructions.
461define amdgpu_ps void @insertelement_v_v2i16_s_v(ptr addrspace(1) %ptr, i16 inreg %val, i32 %idx) {
462; GFX9-LABEL: insertelement_v_v2i16_s_v:
463; GFX9:       ; %bb.0:
464; GFX9-NEXT:    global_load_dword v3, v[0:1], off
465; GFX9-NEXT:    v_and_b32_e32 v0, 1, v2
466; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
467; GFX9-NEXT:    s_and_b32 s0, s2, 0xffff
468; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff
469; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v0, s0
470; GFX9-NEXT:    v_lshlrev_b32_e32 v0, v0, v1
471; GFX9-NEXT:    v_not_b32_e32 v4, v0
472; GFX9-NEXT:    v_mov_b32_e32 v0, 0
473; GFX9-NEXT:    v_mov_b32_e32 v1, 0
474; GFX9-NEXT:    s_waitcnt vmcnt(0)
475; GFX9-NEXT:    v_and_or_b32 v2, v3, v4, v2
476; GFX9-NEXT:    global_store_dword v[0:1], v2, off
477; GFX9-NEXT:    s_endpgm
478;
479; GFX8-LABEL: insertelement_v_v2i16_s_v:
480; GFX8:       ; %bb.0:
481; GFX8-NEXT:    flat_load_dword v0, v[0:1]
482; GFX8-NEXT:    v_and_b32_e32 v2, 1, v2
483; GFX8-NEXT:    v_mov_b32_e32 v1, 0xffff
484; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
485; GFX8-NEXT:    v_lshlrev_b32_e32 v1, v2, v1
486; GFX8-NEXT:    s_and_b32 s0, s2, 0xffff
487; GFX8-NEXT:    v_not_b32_e32 v1, v1
488; GFX8-NEXT:    v_lshlrev_b32_e64 v3, v2, s0
489; GFX8-NEXT:    s_waitcnt vmcnt(0)
490; GFX8-NEXT:    v_and_b32_e32 v2, v0, v1
491; GFX8-NEXT:    v_mov_b32_e32 v0, 0
492; GFX8-NEXT:    v_mov_b32_e32 v1, 0
493; GFX8-NEXT:    v_or_b32_e32 v2, v2, v3
494; GFX8-NEXT:    flat_store_dword v[0:1], v2
495; GFX8-NEXT:    s_endpgm
496;
497; GFX7-LABEL: insertelement_v_v2i16_s_v:
498; GFX7:       ; %bb.0:
499; GFX7-NEXT:    s_mov_b32 s6, 0
500; GFX7-NEXT:    s_mov_b32 s7, 0xf000
501; GFX7-NEXT:    s_mov_b64 s[4:5], 0
502; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
503; GFX7-NEXT:    v_and_b32_e32 v1, 1, v2
504; GFX7-NEXT:    s_and_b32 s0, s2, 0xffff
505; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
506; GFX7-NEXT:    v_lshl_b32_e32 v2, s0, v1
507; GFX7-NEXT:    v_lshl_b32_e32 v1, 0xffff, v1
508; GFX7-NEXT:    v_not_b32_e32 v1, v1
509; GFX7-NEXT:    s_mov_b64 s[4:5], 0
510; GFX7-NEXT:    s_mov_b32 s6, -1
511; GFX7-NEXT:    s_waitcnt vmcnt(0)
512; GFX7-NEXT:    v_and_b32_e32 v0, v0, v1
513; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
514; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
515; GFX7-NEXT:    s_endpgm
516;
517; GFX10-LABEL: insertelement_v_v2i16_s_v:
518; GFX10:       ; %bb.0:
519; GFX10-NEXT:    global_load_dword v3, v[0:1], off
520; GFX10-NEXT:    v_and_b32_e32 v0, 1, v2
521; GFX10-NEXT:    s_and_b32 s0, s2, 0xffff
522; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
523; GFX10-NEXT:    v_lshlrev_b32_e64 v1, v0, 0xffff
524; GFX10-NEXT:    v_lshlrev_b32_e64 v2, v0, s0
525; GFX10-NEXT:    v_not_b32_e32 v4, v1
526; GFX10-NEXT:    v_mov_b32_e32 v0, 0
527; GFX10-NEXT:    v_mov_b32_e32 v1, 0
528; GFX10-NEXT:    s_waitcnt vmcnt(0)
529; GFX10-NEXT:    v_and_or_b32 v2, v3, v4, v2
530; GFX10-NEXT:    global_store_dword v[0:1], v2, off
531; GFX10-NEXT:    s_endpgm
532;
533; GFX11-LABEL: insertelement_v_v2i16_s_v:
534; GFX11:       ; %bb.0:
535; GFX11-NEXT:    global_load_b32 v3, v[0:1], off
536; GFX11-NEXT:    v_and_b32_e32 v0, 1, v2
537; GFX11-NEXT:    s_and_b32 s0, s2, 0xffff
538; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
539; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
540; GFX11-NEXT:    v_lshlrev_b32_e64 v1, v0, 0xffff
541; GFX11-NEXT:    v_lshlrev_b32_e64 v2, v0, s0
542; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
543; GFX11-NEXT:    v_not_b32_e32 v4, v1
544; GFX11-NEXT:    v_mov_b32_e32 v0, 0
545; GFX11-NEXT:    v_mov_b32_e32 v1, 0
546; GFX11-NEXT:    s_waitcnt vmcnt(0)
547; GFX11-NEXT:    v_and_or_b32 v2, v3, v4, v2
548; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
549; GFX11-NEXT:    s_endpgm
550  %vec = load <2 x i16>, ptr addrspace(1) %ptr
551  %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx
552  store <2 x i16> %insert, ptr addrspace(1) null
553  ret void
554}
555
; Dynamic insertelement into a <2 x i16> that is loaded through a non-inreg
; addrspace(1) pointer; the inserted value is not inreg, the index is inreg.
; The updated vector is stored to the null addrspace(1) pointer.
; The checks below expect the index masking/scaling and the inverted lane mask
; (s_lshl_b32 + s_not_b32) to be scalar, while the value shift and the merge
; into the loaded dword are vector instructions.
556define amdgpu_ps void @insertelement_v_v2i16_v_s(ptr addrspace(1) %ptr, i16 %val, i32 inreg %idx) {
557; GFX9-LABEL: insertelement_v_v2i16_v_s:
558; GFX9:       ; %bb.0:
559; GFX9-NEXT:    global_load_dword v3, v[0:1], off
560; GFX9-NEXT:    s_and_b32 s0, s2, 1
561; GFX9-NEXT:    s_lshl_b32 s0, s0, 4
562; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
563; GFX9-NEXT:    s_lshl_b32 s0, 0xffff, s0
564; GFX9-NEXT:    s_not_b32 s0, s0
565; GFX9-NEXT:    v_mov_b32_e32 v0, 0
566; GFX9-NEXT:    v_mov_b32_e32 v1, 0
567; GFX9-NEXT:    s_waitcnt vmcnt(0)
568; GFX9-NEXT:    v_and_or_b32 v2, v3, s0, v2
569; GFX9-NEXT:    global_store_dword v[0:1], v2, off
570; GFX9-NEXT:    s_endpgm
571;
572; GFX8-LABEL: insertelement_v_v2i16_v_s:
573; GFX8:       ; %bb.0:
574; GFX8-NEXT:    flat_load_dword v0, v[0:1]
575; GFX8-NEXT:    s_and_b32 s0, s2, 1
576; GFX8-NEXT:    s_lshl_b32 s0, s0, 4
577; GFX8-NEXT:    v_mov_b32_e32 v1, s0
578; GFX8-NEXT:    s_lshl_b32 s0, 0xffff, s0
579; GFX8-NEXT:    s_not_b32 s0, s0
580; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
581; GFX8-NEXT:    s_waitcnt vmcnt(0)
582; GFX8-NEXT:    v_and_b32_e32 v3, s0, v0
583; GFX8-NEXT:    v_mov_b32_e32 v0, 0
584; GFX8-NEXT:    v_mov_b32_e32 v1, 0
585; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
586; GFX8-NEXT:    flat_store_dword v[0:1], v2
587; GFX8-NEXT:    s_endpgm
588;
589; GFX7-LABEL: insertelement_v_v2i16_v_s:
590; GFX7:       ; %bb.0:
591; GFX7-NEXT:    s_mov_b32 s6, 0
592; GFX7-NEXT:    s_mov_b32 s7, 0xf000
593; GFX7-NEXT:    s_mov_b64 s[4:5], 0
594; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
595; GFX7-NEXT:    s_and_b32 s0, s2, 1
596; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v2
597; GFX7-NEXT:    s_lshl_b32 s0, s0, 4
598; GFX7-NEXT:    v_lshlrev_b32_e32 v1, s0, v1
599; GFX7-NEXT:    s_lshl_b32 s0, 0xffff, s0
600; GFX7-NEXT:    s_not_b32 s0, s0
601; GFX7-NEXT:    s_mov_b64 s[4:5], 0
602; GFX7-NEXT:    s_mov_b32 s6, -1
603; GFX7-NEXT:    s_waitcnt vmcnt(0)
604; GFX7-NEXT:    v_and_b32_e32 v0, s0, v0
605; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
606; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
607; GFX7-NEXT:    s_endpgm
608;
609; GFX10-LABEL: insertelement_v_v2i16_v_s:
610; GFX10:       ; %bb.0:
611; GFX10-NEXT:    global_load_dword v3, v[0:1], off
612; GFX10-NEXT:    s_and_b32 s0, s2, 1
613; GFX10-NEXT:    v_mov_b32_e32 v0, 0
614; GFX10-NEXT:    s_lshl_b32 s0, s0, 4
615; GFX10-NEXT:    v_mov_b32_e32 v1, 0
616; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
617; GFX10-NEXT:    s_lshl_b32 s0, 0xffff, s0
618; GFX10-NEXT:    s_not_b32 s0, s0
619; GFX10-NEXT:    s_waitcnt vmcnt(0)
620; GFX10-NEXT:    v_and_or_b32 v2, v3, s0, v2
621; GFX10-NEXT:    global_store_dword v[0:1], v2, off
622; GFX10-NEXT:    s_endpgm
623;
624; GFX11-LABEL: insertelement_v_v2i16_v_s:
625; GFX11:       ; %bb.0:
626; GFX11-NEXT:    global_load_b32 v3, v[0:1], off
627; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v2
628; GFX11-NEXT:    s_and_b32 s0, s2, 1
629; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
630; GFX11-NEXT:    s_lshl_b32 s0, s0, 4
631; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
632; GFX11-NEXT:    v_lshlrev_b32_e32 v2, s0, v0
633; GFX11-NEXT:    s_lshl_b32 s0, 0xffff, s0
634; GFX11-NEXT:    v_mov_b32_e32 v0, 0
635; GFX11-NEXT:    v_mov_b32_e32 v1, 0
636; GFX11-NEXT:    s_not_b32 s0, s0
637; GFX11-NEXT:    s_waitcnt vmcnt(0)
638; GFX11-NEXT:    v_and_or_b32 v2, v3, s0, v2
639; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
640; GFX11-NEXT:    s_endpgm
641  %vec = load <2 x i16>, ptr addrspace(1) %ptr
642  %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx
643  store <2 x i16> %insert, ptr addrspace(1) null
644  ret void
645}
646
; Dynamic insertelement into a <2 x i16> that is loaded through a non-inreg
; addrspace(1) pointer; neither the inserted value nor the index is inreg.
; The updated vector is stored to the null addrspace(1) pointer.
; The checks below expect all of the insert arithmetic (index mask/scale,
; shifts, mask inversion, merge into the loaded dword) to be vector
; instructions.
647define amdgpu_ps void @insertelement_v_v2i16_v_v(ptr addrspace(1) %ptr, i16 %val, i32 %idx) {
648; GFX9-LABEL: insertelement_v_v2i16_v_v:
649; GFX9:       ; %bb.0:
650; GFX9-NEXT:    global_load_dword v4, v[0:1], off
651; GFX9-NEXT:    v_and_b32_e32 v0, 1, v3
652; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
653; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff
654; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
655; GFX9-NEXT:    v_lshlrev_b32_e32 v0, v0, v1
656; GFX9-NEXT:    v_not_b32_e32 v3, v0
657; GFX9-NEXT:    v_mov_b32_e32 v0, 0
658; GFX9-NEXT:    v_mov_b32_e32 v1, 0
659; GFX9-NEXT:    s_waitcnt vmcnt(0)
660; GFX9-NEXT:    v_and_or_b32 v2, v4, v3, v2
661; GFX9-NEXT:    global_store_dword v[0:1], v2, off
662; GFX9-NEXT:    s_endpgm
663;
664; GFX8-LABEL: insertelement_v_v2i16_v_v:
665; GFX8:       ; %bb.0:
666; GFX8-NEXT:    flat_load_dword v0, v[0:1]
667; GFX8-NEXT:    v_and_b32_e32 v3, 1, v3
668; GFX8-NEXT:    v_mov_b32_e32 v1, 0xffff
669; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 4, v3
670; GFX8-NEXT:    v_lshlrev_b32_e32 v1, v3, v1
671; GFX8-NEXT:    v_not_b32_e32 v1, v1
672; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
673; GFX8-NEXT:    s_waitcnt vmcnt(0)
674; GFX8-NEXT:    v_and_b32_e32 v3, v0, v1
675; GFX8-NEXT:    v_mov_b32_e32 v0, 0
676; GFX8-NEXT:    v_mov_b32_e32 v1, 0
677; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
678; GFX8-NEXT:    flat_store_dword v[0:1], v2
679; GFX8-NEXT:    s_endpgm
680;
681; GFX7-LABEL: insertelement_v_v2i16_v_v:
682; GFX7:       ; %bb.0:
683; GFX7-NEXT:    s_mov_b32 s2, 0
684; GFX7-NEXT:    s_mov_b32 s3, 0xf000
685; GFX7-NEXT:    s_mov_b64 s[0:1], 0
686; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
687; GFX7-NEXT:    v_and_b32_e32 v1, 1, v3
688; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
689; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
690; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v1, v2
691; GFX7-NEXT:    v_lshl_b32_e32 v1, 0xffff, v1
692; GFX7-NEXT:    v_not_b32_e32 v1, v1
693; GFX7-NEXT:    s_mov_b64 s[0:1], 0
694; GFX7-NEXT:    s_mov_b32 s2, -1
695; GFX7-NEXT:    s_waitcnt vmcnt(0)
696; GFX7-NEXT:    v_and_b32_e32 v0, v0, v1
697; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
698; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
699; GFX7-NEXT:    s_endpgm
700;
701; GFX10-LABEL: insertelement_v_v2i16_v_v:
702; GFX10:       ; %bb.0:
703; GFX10-NEXT:    global_load_dword v4, v[0:1], off
704; GFX10-NEXT:    v_and_b32_e32 v0, 1, v3
705; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
706; GFX10-NEXT:    v_lshlrev_b32_e64 v1, v0, 0xffff
707; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
708; GFX10-NEXT:    v_not_b32_e32 v3, v1
709; GFX10-NEXT:    v_mov_b32_e32 v0, 0
710; GFX10-NEXT:    v_mov_b32_e32 v1, 0
711; GFX10-NEXT:    s_waitcnt vmcnt(0)
712; GFX10-NEXT:    v_and_or_b32 v2, v4, v3, v2
713; GFX10-NEXT:    global_store_dword v[0:1], v2, off
714; GFX10-NEXT:    s_endpgm
715;
716; GFX11-LABEL: insertelement_v_v2i16_v_v:
717; GFX11:       ; %bb.0:
718; GFX11-NEXT:    global_load_b32 v4, v[0:1], off
719; GFX11-NEXT:    v_and_b32_e32 v0, 1, v3
720; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v2
721; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
722; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
723; GFX11-NEXT:    v_lshlrev_b32_e64 v2, v0, 0xffff
724; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
725; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_lshlrev_b32 v3, v0, v1
726; GFX11-NEXT:    v_mov_b32_e32 v1, 0
727; GFX11-NEXT:    v_not_b32_e32 v2, v2
728; GFX11-NEXT:    s_waitcnt vmcnt(0)
729; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
730; GFX11-NEXT:    v_and_or_b32 v2, v4, v2, v3
731; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
732; GFX11-NEXT:    s_endpgm
733  %vec = load <2 x i16>, ptr addrspace(1) %ptr
734  %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx
735  store <2 x i16> %insert, ptr addrspace(1) null
736  ret void
737}
738
739; FIXME: 3 element load/store legalization; the <3 x i16> variants below are commented out pending that.
740; define amdgpu_ps void @insertelement_s_v3i16_s_s(ptr addrspace(4) inreg %ptr, i16 inreg %val, i32 inreg %idx) {
741;   %vec = load <3 x i16>, ptr addrspace(4) %ptr
742;   %insert = insertelement <3 x i16> %vec, i16 %val, i32 %idx
743;   store <3 x i16> %insert, ptr addrspace(1) null
744;   ret void
745; }
746
747; define amdgpu_ps void @insertelement_v_v3i16_s_s(ptr addrspace(1) %ptr, i16 inreg %val, i32 inreg %idx) {
748;   %vec = load <3 x i16>, ptr addrspace(1) %ptr
749;   %insert = insertelement <3 x i16> %vec, i16 %val, i32 %idx
750;   store <3 x i16> %insert, ptr addrspace(1) null
751;   ret void
752; }
753
754; define amdgpu_ps void @insertelement_s_v3i16_v_s(ptr addrspace(4) inreg %ptr, i16 %val, i32 inreg %idx) {
755;   %vec = load <3 x i16>, ptr addrspace(4) %ptr
756;   %insert = insertelement <3 x i16> %vec, i16 %val, i32 %idx
757;   store <3 x i16> %insert, ptr addrspace(1) null
758;   ret void
759; }
760
761; define amdgpu_ps void @insertelement_s_v3i16_s_v(ptr addrspace(4) inreg %ptr, i16 inreg %val, i32 %idx) {
762;   %vec = load <3 x i16>, ptr addrspace(4) %ptr
763;   %insert = insertelement <3 x i16> %vec, i16 %val, i32 %idx
764;   store <3 x i16> %insert, ptr addrspace(1) null
765;   ret void
766; }
767
768; define amdgpu_ps void @insertelement_s_v3i16_v_v(ptr addrspace(4) inreg %ptr, i16 %val, i32 %idx) {
769;   %vec = load <3 x i16>, ptr addrspace(4) %ptr
770;   %insert = insertelement <3 x i16> %vec, i16 %val, i32 %idx
771;   store <3 x i16> %insert, ptr addrspace(1) null
772;   ret void
773; }
774
775; define amdgpu_ps void @insertelement_v_v3i16_s_v(ptr addrspace(1) %ptr, i16 inreg %val, i32 %idx) {
776;   %vec = load <3 x i16>, ptr addrspace(1) %ptr
777;   %insert = insertelement <3 x i16> %vec, i16 %val, i32 %idx
778;   store <3 x i16> %insert, ptr addrspace(1) null
779;   ret void
780; }
781
782; define amdgpu_ps void @insertelement_v_v3i16_v_s(ptr addrspace(1) %ptr, i16 %val, i32 inreg %idx) {
783;   %vec = load <3 x i16>, ptr addrspace(1) %ptr
784;   %insert = insertelement <3 x i16> %vec, i16 %val, i32 %idx
785;   store <3 x i16> %insert, ptr addrspace(1) null
786;   ret void
787; }
788
789; define amdgpu_ps void @insertelement_v_v3i16_v_v(ptr addrspace(1) %ptr, i16 %val, i32 %idx) {
790;   %vec = load <3 x i16>, ptr addrspace(1) %ptr
791;   %insert = insertelement <3 x i16> %vec, i16 %val, i32 %idx
792;   store <3 x i16> %insert, ptr addrspace(1) null
793;   ret void
794; }
795
; Dynamic-index insertelement into a <4 x i16> loaded from the addrspace(1)
; pointer %ptr; the updated vector is stored to a null addrspace(1) pointer.
; Operand placement for this variant: %ptr is a plain argument, while %val
; and %idx are inreg (the checks below read the pointer from v[0:1], the
; value from s2 and the index from s3).
; The assertions are autogenerated by utils/update_llc_test_checks.py (see the
; NOTE at the top of the file).
796define amdgpu_ps void @insertelement_v_v4i16_s_s(ptr addrspace(1) %ptr, i16 inreg %val, i32 inreg %idx) {
797; GFX9-LABEL: insertelement_v_v4i16_s_s:
798; GFX9:       ; %bb.0:
799; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
800; GFX9-NEXT:    s_and_b32 s1, s3, 1
801; GFX9-NEXT:    s_lshr_b32 s0, s3, 1
802; GFX9-NEXT:    s_and_b32 s2, s2, 0xffff
803; GFX9-NEXT:    s_lshl_b32 s1, s1, 4
804; GFX9-NEXT:    s_lshl_b32 s2, s2, s1
805; GFX9-NEXT:    s_lshl_b32 s1, 0xffff, s1
806; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 1
807; GFX9-NEXT:    s_not_b32 s1, s1
808; GFX9-NEXT:    v_mov_b32_e32 v4, s2
809; GFX9-NEXT:    v_mov_b32_e32 v2, 0
810; GFX9-NEXT:    v_mov_b32_e32 v3, 0
811; GFX9-NEXT:    s_waitcnt vmcnt(0)
812; GFX9-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
813; GFX9-NEXT:    v_and_or_b32 v4, v5, s1, v4
814; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s0, 0
815; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
816; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
817; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
818; GFX9-NEXT:    s_endpgm
819;
820; GFX8-LABEL: insertelement_v_v4i16_s_s:
821; GFX8:       ; %bb.0:
822; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
823; GFX8-NEXT:    s_and_b32 s1, s3, 1
824; GFX8-NEXT:    s_lshr_b32 s0, s3, 1
825; GFX8-NEXT:    s_and_b32 s2, s2, 0xffff
826; GFX8-NEXT:    s_lshl_b32 s1, s1, 4
827; GFX8-NEXT:    s_lshl_b32 s2, s2, s1
828; GFX8-NEXT:    s_lshl_b32 s1, 0xffff, s1
829; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 1
830; GFX8-NEXT:    s_not_b32 s1, s1
831; GFX8-NEXT:    v_mov_b32_e32 v2, 0
832; GFX8-NEXT:    v_mov_b32_e32 v3, 0
833; GFX8-NEXT:    s_waitcnt vmcnt(0)
834; GFX8-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
835; GFX8-NEXT:    v_and_b32_e32 v4, s1, v4
836; GFX8-NEXT:    v_or_b32_e32 v4, s2, v4
837; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s0, 0
838; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
839; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
840; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
841; GFX8-NEXT:    s_endpgm
842;
843; GFX7-LABEL: insertelement_v_v4i16_s_s:
844; GFX7:       ; %bb.0:
845; GFX7-NEXT:    s_mov_b32 s6, 0
846; GFX7-NEXT:    s_mov_b32 s7, 0xf000
847; GFX7-NEXT:    s_mov_b64 s[4:5], 0
848; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
849; GFX7-NEXT:    s_and_b32 s1, s3, 1
850; GFX7-NEXT:    s_lshr_b32 s0, s3, 1
851; GFX7-NEXT:    s_and_b32 s2, s2, 0xffff
852; GFX7-NEXT:    s_lshl_b32 s1, s1, 4
853; GFX7-NEXT:    s_lshl_b32 s2, s2, s1
854; GFX7-NEXT:    s_lshl_b32 s1, 0xffff, s1
855; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 1
856; GFX7-NEXT:    s_not_b32 s1, s1
857; GFX7-NEXT:    s_mov_b64 s[4:5], 0
858; GFX7-NEXT:    s_mov_b32 s6, -1
859; GFX7-NEXT:    s_waitcnt vmcnt(0)
860; GFX7-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
861; GFX7-NEXT:    v_and_b32_e32 v2, s1, v2
862; GFX7-NEXT:    v_or_b32_e32 v2, s2, v2
863; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], s0, 0
864; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
865; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
866; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
867; GFX7-NEXT:    s_endpgm
868;
869; GFX10-LABEL: insertelement_v_v4i16_s_s:
870; GFX10:       ; %bb.0:
871; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
872; GFX10-NEXT:    s_lshr_b32 s0, s3, 1
873; GFX10-NEXT:    s_and_b32 s1, s3, 1
874; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s0, 1
875; GFX10-NEXT:    s_lshl_b32 s1, s1, 4
876; GFX10-NEXT:    s_and_b32 s2, s2, 0xffff
877; GFX10-NEXT:    s_lshl_b32 s3, 0xffff, s1
878; GFX10-NEXT:    s_lshl_b32 s1, s2, s1
879; GFX10-NEXT:    s_not_b32 s2, s3
880; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s0, 0
881; GFX10-NEXT:    s_waitcnt vmcnt(0)
882; GFX10-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc_lo
883; GFX10-NEXT:    v_and_or_b32 v4, v2, s2, s1
884; GFX10-NEXT:    v_mov_b32_e32 v2, 0
885; GFX10-NEXT:    v_mov_b32_e32 v3, 0
886; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
887; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
888; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
889; GFX10-NEXT:    s_endpgm
890;
891; GFX11-LABEL: insertelement_v_v4i16_s_s:
892; GFX11:       ; %bb.0:
893; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
894; GFX11-NEXT:    s_lshr_b32 s0, s3, 1
895; GFX11-NEXT:    s_and_b32 s1, s3, 1
896; GFX11-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s0, 1
897; GFX11-NEXT:    s_lshl_b32 s1, s1, 4
898; GFX11-NEXT:    s_and_b32 s2, s2, 0xffff
899; GFX11-NEXT:    s_lshl_b32 s3, 0xffff, s1
900; GFX11-NEXT:    s_lshl_b32 s1, s2, s1
901; GFX11-NEXT:    s_not_b32 s2, s3
902; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, s0, 0
903; GFX11-NEXT:    s_waitcnt vmcnt(0)
904; GFX11-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc_lo
905; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
906; GFX11-NEXT:    v_and_or_b32 v4, v2, s2, s1
907; GFX11-NEXT:    v_mov_b32_e32 v2, 0
908; GFX11-NEXT:    v_mov_b32_e32 v3, 0
909; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
910; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
911; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
912; GFX11-NEXT:    s_endpgm
913  %vec = load <4 x i16>, ptr addrspace(1) %ptr
914  %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx
915  store <4 x i16> %insert, ptr addrspace(1) null
916  ret void
917}
918
; Dynamic-index insertelement into a <4 x i16> loaded from the addrspace(4)
; pointer %ptr; the updated vector is stored to a null addrspace(1) pointer.
; Operand placement for this variant: %ptr and %idx are inreg, %val is a
; plain argument (the checks below read the pointer from s[2:3], the index
; from s4 and the value from v0).
; The assertions are autogenerated by utils/update_llc_test_checks.py (see the
; NOTE at the top of the file).
919define amdgpu_ps void @insertelement_s_v4i16_v_s(ptr addrspace(4) inreg %ptr, i16 %val, i32 inreg %idx) {
920; GFX9-LABEL: insertelement_s_v4i16_v_s:
921; GFX9:       ; %bb.0:
922; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
923; GFX9-NEXT:    s_lshr_b32 s2, s4, 1
924; GFX9-NEXT:    s_cmp_eq_u32 s2, 1
925; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
926; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 0
927; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
928; GFX9-NEXT:    s_cselect_b32 s3, s1, s0
929; GFX9-NEXT:    s_and_b32 s4, s4, 1
930; GFX9-NEXT:    s_lshl_b32 s4, s4, 4
931; GFX9-NEXT:    s_lshl_b32 s5, 0xffff, s4
932; GFX9-NEXT:    s_andn2_b32 s3, s3, s5
933; GFX9-NEXT:    v_mov_b32_e32 v1, s3
934; GFX9-NEXT:    v_lshl_or_b32 v4, v0, s4, v1
935; GFX9-NEXT:    v_mov_b32_e32 v0, s0
936; GFX9-NEXT:    v_mov_b32_e32 v1, s1
937; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
938; GFX9-NEXT:    v_mov_b32_e32 v2, 0
939; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 1
940; GFX9-NEXT:    v_mov_b32_e32 v3, 0
941; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
942; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
943; GFX9-NEXT:    s_endpgm
944;
945; GFX8-LABEL: insertelement_s_v4i16_v_s:
946; GFX8:       ; %bb.0:
947; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
948; GFX8-NEXT:    s_lshr_b32 s2, s4, 1
949; GFX8-NEXT:    s_cmp_eq_u32 s2, 1
950; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 0
951; GFX8-NEXT:    v_mov_b32_e32 v2, 0
952; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
953; GFX8-NEXT:    s_cselect_b32 s3, s1, s0
954; GFX8-NEXT:    s_and_b32 s4, s4, 1
955; GFX8-NEXT:    s_lshl_b32 s4, s4, 4
956; GFX8-NEXT:    v_mov_b32_e32 v1, s4
957; GFX8-NEXT:    s_lshl_b32 s4, 0xffff, s4
958; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
959; GFX8-NEXT:    s_andn2_b32 s3, s3, s4
960; GFX8-NEXT:    v_or_b32_e32 v4, s3, v0
961; GFX8-NEXT:    v_mov_b32_e32 v0, s0
962; GFX8-NEXT:    v_mov_b32_e32 v1, s1
963; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
964; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 1
965; GFX8-NEXT:    v_mov_b32_e32 v3, 0
966; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
967; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
968; GFX8-NEXT:    s_endpgm
969;
970; GFX7-LABEL: insertelement_s_v4i16_v_s:
971; GFX7:       ; %bb.0:
972; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
973; GFX7-NEXT:    s_lshr_b32 s2, s4, 1
974; GFX7-NEXT:    s_cmp_eq_u32 s2, 1
975; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
976; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 0
977; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
978; GFX7-NEXT:    s_cselect_b32 s3, s1, s0
979; GFX7-NEXT:    s_and_b32 s4, s4, 1
980; GFX7-NEXT:    s_lshl_b32 s4, s4, 4
981; GFX7-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
982; GFX7-NEXT:    s_lshl_b32 s4, 0xffff, s4
983; GFX7-NEXT:    s_andn2_b32 s3, s3, s4
984; GFX7-NEXT:    v_or_b32_e32 v2, s3, v0
985; GFX7-NEXT:    v_mov_b32_e32 v0, s0
986; GFX7-NEXT:    v_mov_b32_e32 v1, s1
987; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
988; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 1
989; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
990; GFX7-NEXT:    s_mov_b64 s[0:1], 0
991; GFX7-NEXT:    s_mov_b32 s2, -1
992; GFX7-NEXT:    s_mov_b32 s3, 0xf000
993; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
994; GFX7-NEXT:    s_endpgm
995;
996; GFX10-LABEL: insertelement_s_v4i16_v_s:
997; GFX10:       ; %bb.0:
998; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
999; GFX10-NEXT:    s_lshr_b32 s2, s4, 1
1000; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v0
1001; GFX10-NEXT:    s_cmp_eq_u32 s2, 1
1002; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s2, 0
1003; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1004; GFX10-NEXT:    s_cselect_b32 s3, s1, s0
1005; GFX10-NEXT:    s_and_b32 s4, s4, 1
1006; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1007; GFX10-NEXT:    s_lshl_b32 s4, s4, 4
1008; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1009; GFX10-NEXT:    s_lshl_b32 s5, 0xffff, s4
1010; GFX10-NEXT:    s_andn2_b32 s3, s3, s5
1011; GFX10-NEXT:    v_lshl_or_b32 v4, v2, s4, s3
1012; GFX10-NEXT:    v_mov_b32_e32 v2, 0
1013; GFX10-NEXT:    v_mov_b32_e32 v3, 0
1014; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
1015; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s2, 1
1016; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
1017; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
1018; GFX10-NEXT:    s_endpgm
1019;
1020; GFX11-LABEL: insertelement_s_v4i16_v_s:
1021; GFX11:       ; %bb.0:
1022; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
1023; GFX11-NEXT:    s_lshr_b32 s2, s4, 1
1024; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v0
1025; GFX11-NEXT:    s_cmp_eq_u32 s2, 1
1026; GFX11-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s2, 0
1027; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1028; GFX11-NEXT:    s_cselect_b32 s3, s1, s0
1029; GFX11-NEXT:    s_and_b32 s4, s4, 1
1030; GFX11-NEXT:    v_mov_b32_e32 v0, s0
1031; GFX11-NEXT:    s_lshl_b32 s4, s4, 4
1032; GFX11-NEXT:    v_mov_b32_e32 v1, s1
1033; GFX11-NEXT:    s_lshl_b32 s5, 0xffff, s4
1034; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
1035; GFX11-NEXT:    s_and_not1_b32 s3, s3, s5
1036; GFX11-NEXT:    v_lshl_or_b32 v4, v2, s4, s3
1037; GFX11-NEXT:    v_mov_b32_e32 v2, 0
1038; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1039; GFX11-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_cndmask_b32 v0, v0, v4
1040; GFX11-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s2, 1
1041; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
1042; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
1043; GFX11-NEXT:    s_endpgm
1044  %vec = load <4 x i16>, ptr addrspace(4) %ptr
1045  %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx
1046  store <4 x i16> %insert, ptr addrspace(1) null
1047  ret void
1048}
1049
; Dynamic-index insertelement into a <4 x i16> loaded from the addrspace(4)
; pointer %ptr; the updated vector is stored to a null addrspace(1) pointer.
; Operand placement for this variant: %ptr and %val are inreg, %idx is a
; plain argument (the checks below read the pointer from s[2:3], the value
; from s4 and the index from v0).
; The assertions are autogenerated by utils/update_llc_test_checks.py (see the
; NOTE at the top of the file).
1050define amdgpu_ps void @insertelement_s_v4i16_s_v(ptr addrspace(4) inreg %ptr, i16 inreg %val, i32 %idx) {
1051; GFX9-LABEL: insertelement_s_v4i16_s_v:
1052; GFX9:       ; %bb.0:
1053; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
1054; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 1, v0
1055; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
1056; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
1057; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
1058; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1059; GFX9-NEXT:    v_mov_b32_e32 v1, s0
1060; GFX9-NEXT:    v_mov_b32_e32 v3, s1
1061; GFX9-NEXT:    s_and_b32 s2, s4, 0xffff
1062; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffff
1063; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
1064; GFX9-NEXT:    v_lshlrev_b32_e64 v3, v0, s2
1065; GFX9-NEXT:    v_lshlrev_b32_e32 v0, v0, v4
1066; GFX9-NEXT:    v_not_b32_e32 v0, v0
1067; GFX9-NEXT:    v_and_or_b32 v4, v1, v0, v3
1068; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1069; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1070; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
1071; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1072; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
1073; GFX9-NEXT:    v_mov_b32_e32 v3, 0
1074; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
1075; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
1076; GFX9-NEXT:    s_endpgm
1077;
1078; GFX8-LABEL: insertelement_s_v4i16_s_v:
1079; GFX8:       ; %bb.0:
1080; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
1081; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 1, v0
1082; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
1083; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
1084; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
1085; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1086; GFX8-NEXT:    v_mov_b32_e32 v1, s0
1087; GFX8-NEXT:    v_mov_b32_e32 v3, s1
1088; GFX8-NEXT:    s_and_b32 s2, s4, 0xffff
1089; GFX8-NEXT:    v_mov_b32_e32 v4, 0xffff
1090; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
1091; GFX8-NEXT:    v_lshlrev_b32_e64 v3, v0, s2
1092; GFX8-NEXT:    v_lshlrev_b32_e32 v0, v0, v4
1093; GFX8-NEXT:    v_not_b32_e32 v0, v0
1094; GFX8-NEXT:    v_and_b32_e32 v0, v1, v0
1095; GFX8-NEXT:    v_or_b32_e32 v4, v0, v3
1096; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1097; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1098; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
1099; GFX8-NEXT:    v_mov_b32_e32 v2, 0
1100; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
1101; GFX8-NEXT:    v_mov_b32_e32 v3, 0
1102; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
1103; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1104; GFX8-NEXT:    s_endpgm
1105;
1106; GFX7-LABEL: insertelement_s_v4i16_s_v:
1107; GFX7:       ; %bb.0:
1108; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
1109; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 1, v0
1110; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
1111; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
1112; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
1113; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1114; GFX7-NEXT:    v_mov_b32_e32 v1, s0
1115; GFX7-NEXT:    v_mov_b32_e32 v3, s1
1116; GFX7-NEXT:    s_and_b32 s2, s4, 0xffff
1117; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
1118; GFX7-NEXT:    v_lshl_b32_e32 v3, s2, v0
1119; GFX7-NEXT:    v_lshl_b32_e32 v0, 0xffff, v0
1120; GFX7-NEXT:    v_not_b32_e32 v0, v0
1121; GFX7-NEXT:    v_and_b32_e32 v0, v1, v0
1122; GFX7-NEXT:    v_or_b32_e32 v3, v0, v3
1123; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1124; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1125; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
1126; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
1127; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
1128; GFX7-NEXT:    s_mov_b64 s[0:1], 0
1129; GFX7-NEXT:    s_mov_b32 s2, -1
1130; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1131; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1132; GFX7-NEXT:    s_endpgm
1133;
1134; GFX10-LABEL: insertelement_s_v4i16_s_v:
1135; GFX10:       ; %bb.0:
1136; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
1137; GFX10-NEXT:    v_and_b32_e32 v1, 1, v0
1138; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 1, v0
1139; GFX10-NEXT:    s_and_b32 s2, s4, 0xffff
1140; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
1141; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v4
1142; GFX10-NEXT:    v_lshlrev_b32_e64 v2, v1, 0xffff
1143; GFX10-NEXT:    v_lshlrev_b32_e64 v3, v1, s2
1144; GFX10-NEXT:    v_not_b32_e32 v2, v2
1145; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1146; GFX10-NEXT:    v_mov_b32_e32 v0, s1
1147; GFX10-NEXT:    v_cndmask_b32_e32 v5, s0, v0, vcc_lo
1148; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1149; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1150; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 0, v4
1151; GFX10-NEXT:    v_and_or_b32 v5, v5, v2, v3
1152; GFX10-NEXT:    v_mov_b32_e32 v2, 0
1153; GFX10-NEXT:    v_mov_b32_e32 v3, 0
1154; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s0
1155; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
1156; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
1157; GFX10-NEXT:    s_endpgm
1158;
1159; GFX11-LABEL: insertelement_s_v4i16_s_v:
1160; GFX11:       ; %bb.0:
1161; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
1162; GFX11-NEXT:    v_and_b32_e32 v1, 1, v0
1163; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 1, v0
1164; GFX11-NEXT:    s_and_b32 s2, s4, 0xffff
1165; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
1166; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v4
1167; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1168; GFX11-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_lshlrev_b32 v1, 4, v1
1169; GFX11-NEXT:    v_cndmask_b32_e32 v5, s0, v0, vcc_lo
1170; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
1171; GFX11-NEXT:    v_lshlrev_b32_e64 v2, v1, 0xffff
1172; GFX11-NEXT:    v_lshlrev_b32_e64 v3, v1, s2
1173; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
1174; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 0, v4
1175; GFX11-NEXT:    v_not_b32_e32 v2, v2
1176; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
1177; GFX11-NEXT:    v_and_or_b32 v5, v5, v2, v3
1178; GFX11-NEXT:    v_mov_b32_e32 v2, 0
1179; GFX11-NEXT:    v_mov_b32_e32 v3, 0
1180; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s0
1181; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
1182; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
1183; GFX11-NEXT:    s_endpgm
1184  %vec = load <4 x i16>, ptr addrspace(4) %ptr
1185  %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx
1186  store <4 x i16> %insert, ptr addrspace(1) null
1187  ret void
1188}
1189
; Dynamic-index insertelement into a <4 x i16> loaded from the addrspace(4)
; pointer %ptr; the updated vector is stored to a null addrspace(1) pointer.
; Operand placement for this variant: only %ptr is inreg; %val and %idx are
; plain arguments (the checks below read the pointer from s[2:3], the value
; from v0 and the index from v1).
; The assertions are autogenerated by utils/update_llc_test_checks.py (see the
; NOTE at the top of the file).
1190define amdgpu_ps void @insertelement_s_v4i16_v_v(ptr addrspace(4) inreg %ptr, i16 %val, i32 %idx) {
1191; GFX9-LABEL: insertelement_s_v4i16_v_v:
1192; GFX9:       ; %bb.0:
1193; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
1194; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 1, v1
1195; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
1196; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
1197; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
1198; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1199; GFX9-NEXT:    v_mov_b32_e32 v3, s0
1200; GFX9-NEXT:    v_mov_b32_e32 v4, s1
1201; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
1202; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffff
1203; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
1204; GFX9-NEXT:    v_lshlrev_b32_e32 v1, v1, v4
1205; GFX9-NEXT:    v_not_b32_e32 v1, v1
1206; GFX9-NEXT:    v_and_or_b32 v4, v3, v1, v0
1207; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1208; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1209; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
1210; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1211; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
1212; GFX9-NEXT:    v_mov_b32_e32 v3, 0
1213; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
1214; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
1215; GFX9-NEXT:    s_endpgm
1216;
1217; GFX8-LABEL: insertelement_s_v4i16_v_v:
1218; GFX8:       ; %bb.0:
1219; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
1220; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 1, v1
1221; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
1222; GFX8-NEXT:    v_and_b32_e32 v1, 1, v1
1223; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
1224; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1225; GFX8-NEXT:    v_mov_b32_e32 v3, s0
1226; GFX8-NEXT:    v_mov_b32_e32 v4, s1
1227; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
1228; GFX8-NEXT:    v_mov_b32_e32 v4, 0xffff
1229; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
1230; GFX8-NEXT:    v_lshlrev_b32_e32 v1, v1, v4
1231; GFX8-NEXT:    v_not_b32_e32 v1, v1
1232; GFX8-NEXT:    v_and_b32_e32 v1, v3, v1
1233; GFX8-NEXT:    v_or_b32_e32 v4, v1, v0
1234; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1235; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1236; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
1237; GFX8-NEXT:    v_mov_b32_e32 v2, 0
1238; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
1239; GFX8-NEXT:    v_mov_b32_e32 v3, 0
1240; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
1241; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1242; GFX8-NEXT:    s_endpgm
1243;
1244; GFX7-LABEL: insertelement_s_v4i16_v_v:
1245; GFX7:       ; %bb.0:
1246; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
1247; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 1, v1
1248; GFX7-NEXT:    v_and_b32_e32 v1, 1, v1
1249; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
1250; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1251; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1252; GFX7-NEXT:    v_mov_b32_e32 v3, s0
1253; GFX7-NEXT:    v_mov_b32_e32 v4, s1
1254; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
1255; GFX7-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
1256; GFX7-NEXT:    v_lshl_b32_e32 v1, 0xffff, v1
1257; GFX7-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
1258; GFX7-NEXT:    v_not_b32_e32 v1, v1
1259; GFX7-NEXT:    v_and_b32_e32 v1, v3, v1
1260; GFX7-NEXT:    v_or_b32_e32 v3, v1, v0
1261; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1262; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1263; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
1264; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
1265; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
1266; GFX7-NEXT:    s_mov_b64 s[0:1], 0
1267; GFX7-NEXT:    s_mov_b32 s2, -1
1268; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1269; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1270; GFX7-NEXT:    s_endpgm
1271;
1272; GFX10-LABEL: insertelement_s_v4i16_v_v:
1273; GFX10:       ; %bb.0:
1274; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
1275; GFX10-NEXT:    v_and_b32_e32 v2, 1, v1
1276; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 1, v1
1277; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
1278; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v4
1279; GFX10-NEXT:    v_lshlrev_b32_e64 v3, v2, 0xffff
1280; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
1281; GFX10-NEXT:    v_not_b32_e32 v3, v3
1282; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1283; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1284; GFX10-NEXT:    v_cndmask_b32_e32 v5, s0, v1, vcc_lo
1285; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1286; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1287; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 0, v4
1288; GFX10-NEXT:    v_and_or_b32 v5, v5, v3, v2
1289; GFX10-NEXT:    v_mov_b32_e32 v2, 0
1290; GFX10-NEXT:    v_mov_b32_e32 v3, 0
1291; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s0
1292; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
1293; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
1294; GFX10-NEXT:    s_endpgm
1295;
1296; GFX11-LABEL: insertelement_s_v4i16_v_v:
1297; GFX11:       ; %bb.0:
1298; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
1299; GFX11-NEXT:    v_and_b32_e32 v2, 1, v1
1300; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 1, v1
1301; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1302; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
1303; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v4
1304; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1305; GFX11-NEXT:    v_dual_mov_b32 v1, s1 :: v_dual_lshlrev_b32 v2, 4, v2
1306; GFX11-NEXT:    v_cndmask_b32_e32 v5, s0, v1, vcc_lo
1307; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
1308; GFX11-NEXT:    v_lshlrev_b32_e64 v3, v2, 0xffff
1309; GFX11-NEXT:    v_lshlrev_b32_e32 v2, v2, v0
1310; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
1311; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 0, v4
1312; GFX11-NEXT:    v_not_b32_e32 v3, v3
1313; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
1314; GFX11-NEXT:    v_and_or_b32 v5, v5, v3, v2
1315; GFX11-NEXT:    v_mov_b32_e32 v2, 0
1316; GFX11-NEXT:    v_mov_b32_e32 v3, 0
1317; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s0
1318; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
1319; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
1320; GFX11-NEXT:    s_endpgm
1321  %vec = load <4 x i16>, ptr addrspace(4) %ptr
1322  %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx
1323  store <4 x i16> %insert, ptr addrspace(1) null
1324  ret void
1325}
1326
; Dynamic-index insertelement into a <4 x i16> loaded from the addrspace(1)
; pointer %ptr; the updated vector is stored to a null addrspace(1) pointer.
; Operand placement for this variant: only %val is inreg; %ptr and %idx are
; plain arguments (the checks below read the pointer from v[0:1], the value
; from s2 and the index from v2).
; The assertions are autogenerated by utils/update_llc_test_checks.py (see the
; NOTE at the top of the file).
1327define amdgpu_ps void @insertelement_v_v4i16_s_v(ptr addrspace(1) %ptr, i16 inreg %val, i32 %idx) {
1328; GFX9-LABEL: insertelement_v_v4i16_s_v:
1329; GFX9:       ; %bb.0:
1330; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
1331; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 1, v2
1332; GFX9-NEXT:    v_and_b32_e32 v2, 1, v2
1333; GFX9-NEXT:    v_mov_b32_e32 v5, 0xffff
1334; GFX9-NEXT:    s_and_b32 s0, s2, 0xffff
1335; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
1336; GFX9-NEXT:    v_lshlrev_b32_e64 v7, v2, s0
1337; GFX9-NEXT:    v_lshlrev_b32_e32 v2, v2, v5
1338; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
1339; GFX9-NEXT:    v_not_b32_e32 v2, v2
1340; GFX9-NEXT:    v_mov_b32_e32 v3, 0
1341; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v6
1342; GFX9-NEXT:    v_mov_b32_e32 v4, 0
1343; GFX9-NEXT:    s_waitcnt vmcnt(0)
1344; GFX9-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
1345; GFX9-NEXT:    v_and_or_b32 v2, v5, v2, v7
1346; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
1347; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
1348; GFX9-NEXT:    global_store_dwordx2 v[3:4], v[0:1], off
1349; GFX9-NEXT:    s_endpgm
1350;
1351; GFX8-LABEL: insertelement_v_v4i16_s_v:
1352; GFX8:       ; %bb.0:
1353; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1354; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 1, v2
1355; GFX8-NEXT:    v_and_b32_e32 v2, 1, v2
1356; GFX8-NEXT:    v_mov_b32_e32 v5, 0xffff
1357; GFX8-NEXT:    s_and_b32 s0, s2, 0xffff
1358; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
1359; GFX8-NEXT:    v_lshlrev_b32_e64 v7, v2, s0
1360; GFX8-NEXT:    v_lshlrev_b32_e32 v2, v2, v5
1361; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
1362; GFX8-NEXT:    v_not_b32_e32 v2, v2
1363; GFX8-NEXT:    v_mov_b32_e32 v3, 0
1364; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v6
1365; GFX8-NEXT:    v_mov_b32_e32 v4, 0
1366; GFX8-NEXT:    s_waitcnt vmcnt(0)
1367; GFX8-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
1368; GFX8-NEXT:    v_and_b32_e32 v2, v5, v2
1369; GFX8-NEXT:    v_or_b32_e32 v2, v2, v7
1370; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
1371; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
1372; GFX8-NEXT:    flat_store_dwordx2 v[3:4], v[0:1]
1373; GFX8-NEXT:    s_endpgm
1374;
1375; GFX7-LABEL: insertelement_v_v4i16_s_v:
1376; GFX7:       ; %bb.0:
1377; GFX7-NEXT:    s_mov_b32 s6, 0
1378; GFX7-NEXT:    s_mov_b32 s7, 0xf000
1379; GFX7-NEXT:    s_mov_b64 s[4:5], 0
1380; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
1381; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 1, v2
1382; GFX7-NEXT:    v_and_b32_e32 v2, 1, v2
1383; GFX7-NEXT:    s_and_b32 s0, s2, 0xffff
1384; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
1385; GFX7-NEXT:    v_lshl_b32_e32 v4, s0, v2
1386; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v2
1387; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
1388; GFX7-NEXT:    v_not_b32_e32 v2, v2
1389; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v3
1390; GFX7-NEXT:    s_mov_b64 s[4:5], 0
1391; GFX7-NEXT:    s_mov_b32 s6, -1
1392; GFX7-NEXT:    s_waitcnt vmcnt(0)
1393; GFX7-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
1394; GFX7-NEXT:    v_and_b32_e32 v2, v5, v2
1395; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
1396; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
1397; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
1398; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1399; GFX7-NEXT:    s_endpgm
1400;
1401; GFX10-LABEL: insertelement_v_v4i16_s_v:
1402; GFX10:       ; %bb.0:
1403; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
1404; GFX10-NEXT:    v_and_b32_e32 v3, 1, v2
1405; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 1, v2
1406; GFX10-NEXT:    s_and_b32 s0, s2, 0xffff
1407; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 4, v3
1408; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v5
1409; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
1410; GFX10-NEXT:    v_lshlrev_b32_e64 v2, v3, s0
1411; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 0, v5
1412; GFX10-NEXT:    v_not_b32_e32 v3, v4
1413; GFX10-NEXT:    s_waitcnt vmcnt(0)
1414; GFX10-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc_lo
1415; GFX10-NEXT:    v_and_or_b32 v4, v4, v3, v2
1416; GFX10-NEXT:    v_mov_b32_e32 v2, 0
1417; GFX10-NEXT:    v_mov_b32_e32 v3, 0
1418; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
1419; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
1420; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
1421; GFX10-NEXT:    s_endpgm
1422;
1423; GFX11-LABEL: insertelement_v_v4i16_s_v:
1424; GFX11:       ; %bb.0:
1425; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
1426; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 1, v2
1427; GFX11-NEXT:    s_and_b32 s0, s2, 0xffff
1428; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
1429; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v5
1430; GFX11-NEXT:    v_and_b32_e32 v3, 1, v2
1431; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 4, v3
1432; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
1433; GFX11-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
1434; GFX11-NEXT:    v_lshlrev_b32_e64 v2, v3, s0
1435; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 0, v5
1436; GFX11-NEXT:    v_not_b32_e32 v3, v4
1437; GFX11-NEXT:    s_waitcnt vmcnt(0)
1438; GFX11-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc_lo
1439; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
1440; GFX11-NEXT:    v_and_or_b32 v4, v4, v3, v2
1441; GFX11-NEXT:    v_mov_b32_e32 v2, 0
1442; GFX11-NEXT:    v_mov_b32_e32 v3, 0
1443; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
1444; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
1445; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
1446; GFX11-NEXT:    s_endpgm
1447  %vec = load <4 x i16>, ptr addrspace(1) %ptr
1448  %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx
1449  store <4 x i16> %insert, ptr addrspace(1) null
1450  ret void
1451}
1452
; <4 x i16> insertelement with a dynamic index: the vector is loaded through an
; addrspace(1) pointer held in v[0:1], %val arrives in v2 and %idx is passed
; inreg (the checks below read it from s2).
; Expected lowering on every target: idx >> 1 picks which of the two loaded
; dwords is modified, (idx & 1) << 4 is the bit offset of the 16-bit lane inside
; that dword, and a 0xffff mask shifted by that offset is inverted to clear the
; lane before the shifted value is OR-ed in. v_cndmask then writes the merged
; dword back over dword 0 or dword 1, and the result is stored to a null
; addrspace(1) pointer.
1453define amdgpu_ps void @insertelement_v_v4i16_v_s(ptr addrspace(1) %ptr, i16 %val, i32 inreg %idx) {
1454; GFX9-LABEL: insertelement_v_v4i16_v_s:
1455; GFX9:       ; %bb.0:
1456; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
1457; GFX9-NEXT:    s_and_b32 s1, s2, 1
1458; GFX9-NEXT:    s_lshr_b32 s0, s2, 1
1459; GFX9-NEXT:    s_lshl_b32 s1, s1, 4
1460; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
1461; GFX9-NEXT:    s_lshl_b32 s1, 0xffff, s1
1462; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 1
1463; GFX9-NEXT:    s_not_b32 s1, s1
1464; GFX9-NEXT:    v_mov_b32_e32 v3, 0
1465; GFX9-NEXT:    v_mov_b32_e32 v4, 0
1466; GFX9-NEXT:    s_waitcnt vmcnt(0)
1467; GFX9-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
1468; GFX9-NEXT:    v_and_or_b32 v2, v5, s1, v2
1469; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s0, 0
1470; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
1471; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
1472; GFX9-NEXT:    global_store_dwordx2 v[3:4], v[0:1], off
1473; GFX9-NEXT:    s_endpgm
1474;
1475; GFX8-LABEL: insertelement_v_v4i16_v_s:
1476; GFX8:       ; %bb.0:
1477; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1478; GFX8-NEXT:    s_and_b32 s1, s2, 1
1479; GFX8-NEXT:    s_lshr_b32 s0, s2, 1
1480; GFX8-NEXT:    s_lshl_b32 s1, s1, 4
1481; GFX8-NEXT:    v_mov_b32_e32 v5, s1
1482; GFX8-NEXT:    s_lshl_b32 s1, 0xffff, s1
1483; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 1
1484; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
1485; GFX8-NEXT:    s_not_b32 s1, s1
1486; GFX8-NEXT:    v_mov_b32_e32 v3, 0
1487; GFX8-NEXT:    v_mov_b32_e32 v4, 0
1488; GFX8-NEXT:    s_waitcnt vmcnt(0)
1489; GFX8-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
1490; GFX8-NEXT:    v_and_b32_e32 v5, s1, v5
1491; GFX8-NEXT:    v_or_b32_e32 v2, v5, v2
1492; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s0, 0
1493; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
1494; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
1495; GFX8-NEXT:    flat_store_dwordx2 v[3:4], v[0:1]
1496; GFX8-NEXT:    s_endpgm
1497;
1498; GFX7-LABEL: insertelement_v_v4i16_v_s:
1499; GFX7:       ; %bb.0:
1500; GFX7-NEXT:    s_mov_b32 s6, 0
1501; GFX7-NEXT:    s_mov_b32 s7, 0xf000
1502; GFX7-NEXT:    s_mov_b64 s[4:5], 0
1503; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
1504; GFX7-NEXT:    s_and_b32 s1, s2, 1
1505; GFX7-NEXT:    s_lshr_b32 s0, s2, 1
1506; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
1507; GFX7-NEXT:    s_lshl_b32 s1, s1, 4
1508; GFX7-NEXT:    v_lshlrev_b32_e32 v2, s1, v2
1509; GFX7-NEXT:    s_lshl_b32 s1, 0xffff, s1
1510; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 1
1511; GFX7-NEXT:    s_not_b32 s1, s1
1512; GFX7-NEXT:    s_mov_b64 s[4:5], 0
1513; GFX7-NEXT:    s_mov_b32 s6, -1
1514; GFX7-NEXT:    s_waitcnt vmcnt(0)
1515; GFX7-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
1516; GFX7-NEXT:    v_and_b32_e32 v3, s1, v3
1517; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
1518; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], s0, 0
1519; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
1520; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
1521; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1522; GFX7-NEXT:    s_endpgm
1523;
1524; GFX10-LABEL: insertelement_v_v4i16_v_s:
1525; GFX10:       ; %bb.0:
1526; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
1527; GFX10-NEXT:    s_lshr_b32 s1, s2, 1
1528; GFX10-NEXT:    s_and_b32 s0, s2, 1
1529; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s1, 1
1530; GFX10-NEXT:    s_lshl_b32 s0, s0, 4
1531; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
1532; GFX10-NEXT:    s_lshl_b32 s0, 0xffff, s0
1533; GFX10-NEXT:    s_not_b32 s0, s0
1534; GFX10-NEXT:    s_waitcnt vmcnt(0)
1535; GFX10-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc_lo
1536; GFX10-NEXT:    v_and_or_b32 v4, v3, s0, v2
1537; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s1, 0
1538; GFX10-NEXT:    v_mov_b32_e32 v2, 0
1539; GFX10-NEXT:    v_mov_b32_e32 v3, 0
1540; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
1541; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
1542; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
1543; GFX10-NEXT:    s_endpgm
1544;
1545; GFX11-LABEL: insertelement_v_v4i16_v_s:
1546; GFX11:       ; %bb.0:
1547; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
1548; GFX11-NEXT:    s_lshr_b32 s1, s2, 1
1549; GFX11-NEXT:    s_and_b32 s0, s2, 1
1550; GFX11-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s1, 1
1551; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
1552; GFX11-NEXT:    s_lshl_b32 s0, s0, 4
1553; GFX11-NEXT:    s_waitcnt vmcnt(0)
1554; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
1555; GFX11-NEXT:    v_dual_cndmask_b32 v3, v0, v1 :: v_dual_lshlrev_b32 v2, s0, v2
1556; GFX11-NEXT:    s_lshl_b32 s0, 0xffff, s0
1557; GFX11-NEXT:    s_not_b32 s0, s0
1558; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
1559; GFX11-NEXT:    v_and_or_b32 v4, v3, s0, v2
1560; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, s1, 0
1561; GFX11-NEXT:    v_mov_b32_e32 v2, 0
1562; GFX11-NEXT:    v_mov_b32_e32 v3, 0
1563; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1564; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
1565; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
1566; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
1567; GFX11-NEXT:    s_endpgm
1568  %vec = load <4 x i16>, ptr addrspace(1) %ptr
1569  %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx
1570  store <4 x i16> %insert, ptr addrspace(1) null
1571  ret void
1572}
1573
; <4 x i16> insertelement with a dynamic index and no inreg arguments: the
; pointer is in v[0:1], %val in v2 and %idx in v3 in the checks below.
; Because the index is a VGPR, the lane mask is built with vector shifts
; (v_lshlrev of 0xffff followed by v_not) rather than scalar ones. idx >> 1
; is compared against 0 and 1 to drive the v_cndmask selects that read the
; affected dword and write the merged dword back. The result is stored to a
; null addrspace(1) pointer.
1574define amdgpu_ps void @insertelement_v_v4i16_v_v(ptr addrspace(1) %ptr, i16 %val, i32 %idx) {
1575; GFX9-LABEL: insertelement_v_v4i16_v_v:
1576; GFX9:       ; %bb.0:
1577; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
1578; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 1, v3
1579; GFX9-NEXT:    v_and_b32_e32 v3, 1, v3
1580; GFX9-NEXT:    v_mov_b32_e32 v6, 0xffff
1581; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 4, v3
1582; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
1583; GFX9-NEXT:    v_lshlrev_b32_e32 v3, v3, v6
1584; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
1585; GFX9-NEXT:    v_not_b32_e32 v3, v3
1586; GFX9-NEXT:    v_mov_b32_e32 v4, 0
1587; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v7
1588; GFX9-NEXT:    v_mov_b32_e32 v5, 0
1589; GFX9-NEXT:    s_waitcnt vmcnt(0)
1590; GFX9-NEXT:    v_cndmask_b32_e32 v6, v0, v1, vcc
1591; GFX9-NEXT:    v_and_or_b32 v2, v6, v3, v2
1592; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
1593; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
1594; GFX9-NEXT:    global_store_dwordx2 v[4:5], v[0:1], off
1595; GFX9-NEXT:    s_endpgm
1596;
1597; GFX8-LABEL: insertelement_v_v4i16_v_v:
1598; GFX8:       ; %bb.0:
1599; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1600; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 1, v3
1601; GFX8-NEXT:    v_and_b32_e32 v3, 1, v3
1602; GFX8-NEXT:    v_mov_b32_e32 v6, 0xffff
1603; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 4, v3
1604; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
1605; GFX8-NEXT:    v_lshlrev_b32_e32 v3, v3, v6
1606; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
1607; GFX8-NEXT:    v_not_b32_e32 v3, v3
1608; GFX8-NEXT:    v_mov_b32_e32 v4, 0
1609; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v7
1610; GFX8-NEXT:    v_mov_b32_e32 v5, 0
1611; GFX8-NEXT:    s_waitcnt vmcnt(0)
1612; GFX8-NEXT:    v_cndmask_b32_e32 v6, v0, v1, vcc
1613; GFX8-NEXT:    v_and_b32_e32 v3, v6, v3
1614; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
1615; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
1616; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
1617; GFX8-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
1618; GFX8-NEXT:    s_endpgm
1619;
1620; GFX7-LABEL: insertelement_v_v4i16_v_v:
1621; GFX7:       ; %bb.0:
1622; GFX7-NEXT:    s_mov_b32 s6, 0
1623; GFX7-NEXT:    s_mov_b32 s7, 0xf000
1624; GFX7-NEXT:    s_mov_b64 s[4:5], 0
1625; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
1626; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 1, v3
1627; GFX7-NEXT:    v_and_b32_e32 v3, 1, v3
1628; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
1629; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 4, v3
1630; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v3, v2
1631; GFX7-NEXT:    v_lshl_b32_e32 v3, 0xffff, v3
1632; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
1633; GFX7-NEXT:    v_not_b32_e32 v3, v3
1634; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v4
1635; GFX7-NEXT:    s_mov_b64 s[4:5], 0
1636; GFX7-NEXT:    s_mov_b32 s6, -1
1637; GFX7-NEXT:    s_waitcnt vmcnt(0)
1638; GFX7-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
1639; GFX7-NEXT:    v_and_b32_e32 v3, v5, v3
1640; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
1641; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
1642; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
1643; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1644; GFX7-NEXT:    s_endpgm
1645;
1646; GFX10-LABEL: insertelement_v_v4i16_v_v:
1647; GFX10:       ; %bb.0:
1648; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
1649; GFX10-NEXT:    v_and_b32_e32 v4, 1, v3
1650; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 1, v3
1651; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 4, v4
1652; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
1653; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 0, v6
1654; GFX10-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
1655; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
1656; GFX10-NEXT:    v_not_b32_e32 v3, v5
1657; GFX10-NEXT:    s_waitcnt vmcnt(0)
1658; GFX10-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc_lo
1659; GFX10-NEXT:    v_and_or_b32 v4, v4, v3, v2
1660; GFX10-NEXT:    v_mov_b32_e32 v2, 0
1661; GFX10-NEXT:    v_mov_b32_e32 v3, 0
1662; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
1663; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
1664; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
1665; GFX10-NEXT:    s_endpgm
1666;
1667; GFX11-LABEL: insertelement_v_v4i16_v_v:
1668; GFX11:       ; %bb.0:
1669; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
1670; GFX11-NEXT:    v_and_b32_e32 v4, 1, v3
1671; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 1, v3
1672; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1673; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 4, v4
1674; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
1675; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
1676; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 0, v6
1677; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
1678; GFX11-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
1679; GFX11-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
1680; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
1681; GFX11-NEXT:    v_not_b32_e32 v3, v5
1682; GFX11-NEXT:    s_waitcnt vmcnt(0)
1683; GFX11-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc_lo
1684; GFX11-NEXT:    v_and_or_b32 v4, v4, v3, v2
1685; GFX11-NEXT:    v_mov_b32_e32 v2, 0
1686; GFX11-NEXT:    v_mov_b32_e32 v3, 0
1687; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
1688; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
1689; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
1690; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
1691; GFX11-NEXT:    s_endpgm
1692  %vec = load <4 x i16>, ptr addrspace(1) %ptr
1693  %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx
1694  store <4 x i16> %insert, ptr addrspace(1) null
1695  ret void
1696}
1697
; <8 x i16> insertelement with every argument passed inreg: the addrspace(4)
; pointer is read from s[2:3] by a scalar load, %val from s4 and %idx from s5
; in the checks below.
; The expected code stays on the scalar unit until the final store: idx >> 1 is
; compared against 1, 2 and 3 with s_cmp_eq_u32 and an s_cselect_b32 chain picks
; the affected dword out of the four loaded ones. The 16-bit lane at bit offset
; (idx & 1) << 4 is cleared with an and-not of the shifted 0xffff mask and the
; shifted value is OR-ed in. A second compare/select sequence writes the merged
; dword back into exactly one of s0..s3, which are then copied to VGPRs for the
; store to a null addrspace(1) pointer.
1698define amdgpu_ps void @insertelement_s_v8i16_s_s(ptr addrspace(4) inreg %ptr, i16 inreg %val, i32 inreg %idx) {
1699; GFX9-LABEL: insertelement_s_v8i16_s_s:
1700; GFX9:       ; %bb.0:
1701; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
1702; GFX9-NEXT:    s_lshr_b32 s6, s5, 1
1703; GFX9-NEXT:    s_cmp_eq_u32 s6, 1
1704; GFX9-NEXT:    v_mov_b32_e32 v4, 0
1705; GFX9-NEXT:    v_mov_b32_e32 v5, 0
1706; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1707; GFX9-NEXT:    s_cselect_b32 s7, s1, s0
1708; GFX9-NEXT:    s_cmp_eq_u32 s6, 2
1709; GFX9-NEXT:    s_cselect_b32 s7, s2, s7
1710; GFX9-NEXT:    s_cmp_eq_u32 s6, 3
1711; GFX9-NEXT:    s_cselect_b32 s7, s3, s7
1712; GFX9-NEXT:    s_and_b32 s5, s5, 1
1713; GFX9-NEXT:    s_lshl_b32 s5, s5, 4
1714; GFX9-NEXT:    s_and_b32 s4, s4, 0xffff
1715; GFX9-NEXT:    s_lshl_b32 s4, s4, s5
1716; GFX9-NEXT:    s_lshl_b32 s5, 0xffff, s5
1717; GFX9-NEXT:    s_andn2_b32 s5, s7, s5
1718; GFX9-NEXT:    s_or_b32 s4, s5, s4
1719; GFX9-NEXT:    s_cmp_eq_u32 s6, 0
1720; GFX9-NEXT:    s_cselect_b32 s0, s4, s0
1721; GFX9-NEXT:    s_cmp_eq_u32 s6, 1
1722; GFX9-NEXT:    s_cselect_b32 s1, s4, s1
1723; GFX9-NEXT:    s_cmp_eq_u32 s6, 2
1724; GFX9-NEXT:    s_cselect_b32 s2, s4, s2
1725; GFX9-NEXT:    s_cmp_eq_u32 s6, 3
1726; GFX9-NEXT:    s_cselect_b32 s3, s4, s3
1727; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1728; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1729; GFX9-NEXT:    v_mov_b32_e32 v2, s2
1730; GFX9-NEXT:    v_mov_b32_e32 v3, s3
1731; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
1732; GFX9-NEXT:    s_endpgm
1733;
1734; GFX8-LABEL: insertelement_s_v8i16_s_s:
1735; GFX8:       ; %bb.0:
1736; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
1737; GFX8-NEXT:    s_lshr_b32 s6, s5, 1
1738; GFX8-NEXT:    s_cmp_eq_u32 s6, 1
1739; GFX8-NEXT:    v_mov_b32_e32 v4, 0
1740; GFX8-NEXT:    v_mov_b32_e32 v5, 0
1741; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1742; GFX8-NEXT:    s_cselect_b32 s7, s1, s0
1743; GFX8-NEXT:    s_cmp_eq_u32 s6, 2
1744; GFX8-NEXT:    s_cselect_b32 s7, s2, s7
1745; GFX8-NEXT:    s_cmp_eq_u32 s6, 3
1746; GFX8-NEXT:    s_cselect_b32 s7, s3, s7
1747; GFX8-NEXT:    s_and_b32 s5, s5, 1
1748; GFX8-NEXT:    s_lshl_b32 s5, s5, 4
1749; GFX8-NEXT:    s_and_b32 s4, s4, 0xffff
1750; GFX8-NEXT:    s_lshl_b32 s4, s4, s5
1751; GFX8-NEXT:    s_lshl_b32 s5, 0xffff, s5
1752; GFX8-NEXT:    s_andn2_b32 s5, s7, s5
1753; GFX8-NEXT:    s_or_b32 s4, s5, s4
1754; GFX8-NEXT:    s_cmp_eq_u32 s6, 0
1755; GFX8-NEXT:    s_cselect_b32 s0, s4, s0
1756; GFX8-NEXT:    s_cmp_eq_u32 s6, 1
1757; GFX8-NEXT:    s_cselect_b32 s1, s4, s1
1758; GFX8-NEXT:    s_cmp_eq_u32 s6, 2
1759; GFX8-NEXT:    s_cselect_b32 s2, s4, s2
1760; GFX8-NEXT:    s_cmp_eq_u32 s6, 3
1761; GFX8-NEXT:    s_cselect_b32 s3, s4, s3
1762; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1763; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1764; GFX8-NEXT:    v_mov_b32_e32 v2, s2
1765; GFX8-NEXT:    v_mov_b32_e32 v3, s3
1766; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
1767; GFX8-NEXT:    s_endpgm
1768;
1769; GFX7-LABEL: insertelement_s_v8i16_s_s:
1770; GFX7:       ; %bb.0:
1771; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
1772; GFX7-NEXT:    s_lshr_b32 s6, s5, 1
1773; GFX7-NEXT:    s_cmp_eq_u32 s6, 1
1774; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1775; GFX7-NEXT:    s_cselect_b32 s7, s1, s0
1776; GFX7-NEXT:    s_cmp_eq_u32 s6, 2
1777; GFX7-NEXT:    s_cselect_b32 s7, s2, s7
1778; GFX7-NEXT:    s_cmp_eq_u32 s6, 3
1779; GFX7-NEXT:    s_cselect_b32 s7, s3, s7
1780; GFX7-NEXT:    s_and_b32 s5, s5, 1
1781; GFX7-NEXT:    s_lshl_b32 s5, s5, 4
1782; GFX7-NEXT:    s_and_b32 s4, s4, 0xffff
1783; GFX7-NEXT:    s_lshl_b32 s4, s4, s5
1784; GFX7-NEXT:    s_lshl_b32 s5, 0xffff, s5
1785; GFX7-NEXT:    s_andn2_b32 s5, s7, s5
1786; GFX7-NEXT:    s_or_b32 s4, s5, s4
1787; GFX7-NEXT:    s_cmp_eq_u32 s6, 0
1788; GFX7-NEXT:    s_cselect_b32 s0, s4, s0
1789; GFX7-NEXT:    s_cmp_eq_u32 s6, 1
1790; GFX7-NEXT:    s_cselect_b32 s1, s4, s1
1791; GFX7-NEXT:    s_cmp_eq_u32 s6, 2
1792; GFX7-NEXT:    s_cselect_b32 s2, s4, s2
1793; GFX7-NEXT:    s_cmp_eq_u32 s6, 3
1794; GFX7-NEXT:    s_cselect_b32 s3, s4, s3
1795; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1796; GFX7-NEXT:    s_mov_b64 s[4:5], 0
1797; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1798; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1799; GFX7-NEXT:    v_mov_b32_e32 v3, s3
1800; GFX7-NEXT:    s_mov_b32 s6, -1
1801; GFX7-NEXT:    s_mov_b32 s7, 0xf000
1802; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1803; GFX7-NEXT:    s_endpgm
1804;
1805; GFX10-LABEL: insertelement_s_v8i16_s_s:
1806; GFX10:       ; %bb.0:
1807; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
1808; GFX10-NEXT:    s_lshr_b32 s6, s5, 1
1809; GFX10-NEXT:    v_mov_b32_e32 v4, 0
1810; GFX10-NEXT:    s_cmp_eq_u32 s6, 1
1811; GFX10-NEXT:    v_mov_b32_e32 v5, 0
1812; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1813; GFX10-NEXT:    s_cselect_b32 s7, s1, s0
1814; GFX10-NEXT:    s_cmp_eq_u32 s6, 2
1815; GFX10-NEXT:    s_cselect_b32 s7, s2, s7
1816; GFX10-NEXT:    s_cmp_eq_u32 s6, 3
1817; GFX10-NEXT:    s_cselect_b32 s7, s3, s7
1818; GFX10-NEXT:    s_and_b32 s5, s5, 1
1819; GFX10-NEXT:    s_and_b32 s4, s4, 0xffff
1820; GFX10-NEXT:    s_lshl_b32 s5, s5, 4
1821; GFX10-NEXT:    s_lshl_b32 s8, 0xffff, s5
1822; GFX10-NEXT:    s_lshl_b32 s4, s4, s5
1823; GFX10-NEXT:    s_andn2_b32 s5, s7, s8
1824; GFX10-NEXT:    s_or_b32 s4, s5, s4
1825; GFX10-NEXT:    s_cmp_eq_u32 s6, 0
1826; GFX10-NEXT:    s_cselect_b32 s0, s4, s0
1827; GFX10-NEXT:    s_cmp_eq_u32 s6, 1
1828; GFX10-NEXT:    s_cselect_b32 s1, s4, s1
1829; GFX10-NEXT:    s_cmp_eq_u32 s6, 2
1830; GFX10-NEXT:    s_cselect_b32 s2, s4, s2
1831; GFX10-NEXT:    s_cmp_eq_u32 s6, 3
1832; GFX10-NEXT:    s_cselect_b32 s3, s4, s3
1833; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1834; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1835; GFX10-NEXT:    v_mov_b32_e32 v2, s2
1836; GFX10-NEXT:    v_mov_b32_e32 v3, s3
1837; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
1838; GFX10-NEXT:    s_endpgm
1839;
1840; GFX11-LABEL: insertelement_s_v8i16_s_s:
1841; GFX11:       ; %bb.0:
1842; GFX11-NEXT:    s_load_b128 s[0:3], s[2:3], 0x0
1843; GFX11-NEXT:    s_lshr_b32 s6, s5, 1
1844; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1845; GFX11-NEXT:    s_cmp_eq_u32 s6, 1
1846; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1847; GFX11-NEXT:    s_cselect_b32 s7, s1, s0
1848; GFX11-NEXT:    s_cmp_eq_u32 s6, 2
1849; GFX11-NEXT:    s_cselect_b32 s7, s2, s7
1850; GFX11-NEXT:    s_cmp_eq_u32 s6, 3
1851; GFX11-NEXT:    s_cselect_b32 s7, s3, s7
1852; GFX11-NEXT:    s_and_b32 s5, s5, 1
1853; GFX11-NEXT:    s_and_b32 s4, s4, 0xffff
1854; GFX11-NEXT:    s_lshl_b32 s5, s5, 4
1855; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
1856; GFX11-NEXT:    s_lshl_b32 s8, 0xffff, s5
1857; GFX11-NEXT:    s_lshl_b32 s4, s4, s5
1858; GFX11-NEXT:    s_and_not1_b32 s5, s7, s8
1859; GFX11-NEXT:    s_or_b32 s4, s5, s4
1860; GFX11-NEXT:    s_cmp_eq_u32 s6, 0
1861; GFX11-NEXT:    s_cselect_b32 s0, s4, s0
1862; GFX11-NEXT:    s_cmp_eq_u32 s6, 1
1863; GFX11-NEXT:    s_cselect_b32 s1, s4, s1
1864; GFX11-NEXT:    s_cmp_eq_u32 s6, 2
1865; GFX11-NEXT:    s_cselect_b32 s2, s4, s2
1866; GFX11-NEXT:    s_cmp_eq_u32 s6, 3
1867; GFX11-NEXT:    s_cselect_b32 s3, s4, s3
1868; GFX11-NEXT:    v_mov_b32_e32 v4, 0
1869; GFX11-NEXT:    v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v0, s0
1870; GFX11-NEXT:    v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
1871; GFX11-NEXT:    v_mov_b32_e32 v3, s3
1872; GFX11-NEXT:    global_store_b128 v[4:5], v[0:3], off
1873; GFX11-NEXT:    s_endpgm
1874  %vec = load <8 x i16>, ptr addrspace(4) %ptr
1875  %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx
1876  store <8 x i16> %insert, ptr addrspace(1) null
1877  ret void
1878}
1879
; <8 x i16> insertelement with a dynamic index: the vector is loaded through an
; addrspace(1) pointer held in v[0:1], while %val and %idx are passed inreg
; (read from s2 and s3 in the checks below).
; The shifted value and the inverted 0xffff lane mask are computed with scalar
; instructions. Because the loaded dwords live in VGPRs, idx >> 1 is compared
; against 0..3 with v_cmp_eq_u32 and v_cndmask chains read the affected dword
; and write the merged dword back into exactly one of v0..v3. The result is
; stored to a null addrspace(1) pointer.
1880define amdgpu_ps void @insertelement_v_v8i16_s_s(ptr addrspace(1) %ptr, i16 inreg %val, i32 inreg %idx) {
1881; GFX9-LABEL: insertelement_v_v8i16_s_s:
1882; GFX9:       ; %bb.0:
1883; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
1884; GFX9-NEXT:    s_and_b32 s0, s3, 1
1885; GFX9-NEXT:    s_lshr_b32 s4, s3, 1
1886; GFX9-NEXT:    s_and_b32 s1, s2, 0xffff
1887; GFX9-NEXT:    s_lshl_b32 s0, s0, 4
1888; GFX9-NEXT:    s_lshl_b32 s1, s1, s0
1889; GFX9-NEXT:    s_lshl_b32 s0, 0xffff, s0
1890; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
1891; GFX9-NEXT:    s_not_b32 s5, s0
1892; GFX9-NEXT:    v_mov_b32_e32 v6, s1
1893; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s4, 2
1894; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], s4, 3
1895; GFX9-NEXT:    v_mov_b32_e32 v4, 0
1896; GFX9-NEXT:    v_mov_b32_e32 v5, 0
1897; GFX9-NEXT:    s_waitcnt vmcnt(0)
1898; GFX9-NEXT:    v_cndmask_b32_e32 v7, v0, v1, vcc
1899; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v2, s[0:1]
1900; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v3, s[2:3]
1901; GFX9-NEXT:    v_and_or_b32 v6, v7, s5, v6
1902; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, 0
1903; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[4:5]
1904; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
1905; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
1906; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[2:3]
1907; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
1908; GFX9-NEXT:    s_endpgm
1909;
1910; GFX8-LABEL: insertelement_v_v8i16_s_s:
1911; GFX8:       ; %bb.0:
1912; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
1913; GFX8-NEXT:    s_and_b32 s0, s3, 1
1914; GFX8-NEXT:    s_lshr_b32 s4, s3, 1
1915; GFX8-NEXT:    s_and_b32 s1, s2, 0xffff
1916; GFX8-NEXT:    s_lshl_b32 s0, s0, 4
1917; GFX8-NEXT:    s_lshl_b32 s5, s1, s0
1918; GFX8-NEXT:    s_lshl_b32 s0, 0xffff, s0
1919; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
1920; GFX8-NEXT:    s_not_b32 s6, s0
1921; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s4, 2
1922; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], s4, 3
1923; GFX8-NEXT:    v_mov_b32_e32 v4, 0
1924; GFX8-NEXT:    v_mov_b32_e32 v5, 0
1925; GFX8-NEXT:    s_waitcnt vmcnt(0)
1926; GFX8-NEXT:    v_cndmask_b32_e32 v6, v0, v1, vcc
1927; GFX8-NEXT:    v_cndmask_b32_e64 v6, v6, v2, s[0:1]
1928; GFX8-NEXT:    v_cndmask_b32_e64 v6, v6, v3, s[2:3]
1929; GFX8-NEXT:    v_and_b32_e32 v6, s6, v6
1930; GFX8-NEXT:    v_or_b32_e32 v6, s5, v6
1931; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, 0
1932; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[4:5]
1933; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
1934; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
1935; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[2:3]
1936; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
1937; GFX8-NEXT:    s_endpgm
1938;
1939; GFX7-LABEL: insertelement_v_v8i16_s_s:
1940; GFX7:       ; %bb.0:
1941; GFX7-NEXT:    s_mov_b32 s10, 0
1942; GFX7-NEXT:    s_mov_b32 s11, 0xf000
1943; GFX7-NEXT:    s_mov_b64 s[8:9], 0
1944; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
1945; GFX7-NEXT:    s_and_b32 s0, s3, 1
1946; GFX7-NEXT:    s_lshr_b32 s4, s3, 1
1947; GFX7-NEXT:    s_and_b32 s1, s2, 0xffff
1948; GFX7-NEXT:    s_lshl_b32 s0, s0, 4
1949; GFX7-NEXT:    s_lshl_b32 s5, s1, s0
1950; GFX7-NEXT:    s_lshl_b32 s0, 0xffff, s0
1951; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
1952; GFX7-NEXT:    s_not_b32 s6, s0
1953; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], s4, 2
1954; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], s4, 3
1955; GFX7-NEXT:    s_mov_b64 s[8:9], 0
1956; GFX7-NEXT:    s_mov_b32 s10, -1
1957; GFX7-NEXT:    s_waitcnt vmcnt(0)
1958; GFX7-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
1959; GFX7-NEXT:    v_cndmask_b32_e64 v4, v4, v2, s[0:1]
1960; GFX7-NEXT:    v_cndmask_b32_e64 v4, v4, v3, s[2:3]
1961; GFX7-NEXT:    v_and_b32_e32 v4, s6, v4
1962; GFX7-NEXT:    v_or_b32_e32 v4, s5, v4
1963; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, 0
1964; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[4:5]
1965; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
1966; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
1967; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[2:3]
1968; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
1969; GFX7-NEXT:    s_endpgm
1970;
1971; GFX10-LABEL: insertelement_v_v8i16_s_s:
1972; GFX10:       ; %bb.0:
1973; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
1974; GFX10-NEXT:    s_lshr_b32 s4, s3, 1
1975; GFX10-NEXT:    s_and_b32 s1, s3, 1
1976; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s4, 1
1977; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s4, 2
1978; GFX10-NEXT:    s_lshl_b32 s3, s1, 4
1979; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, s4, 3
1980; GFX10-NEXT:    s_and_b32 s2, s2, 0xffff
1981; GFX10-NEXT:    s_lshl_b32 s5, 0xffff, s3
1982; GFX10-NEXT:    s_lshl_b32 s2, s2, s3
1983; GFX10-NEXT:    s_not_b32 s3, s5
1984; GFX10-NEXT:    s_waitcnt vmcnt(0)
1985; GFX10-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc_lo
1986; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v2, s0
1987; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v3, s1
1988; GFX10-NEXT:    v_and_or_b32 v6, v4, s3, s2
1989; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, s4, 0
1990; GFX10-NEXT:    v_mov_b32_e32 v4, 0
1991; GFX10-NEXT:    v_mov_b32_e32 v5, 0
1992; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc_lo
1993; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s2
1994; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s0
1995; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s1
1996; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
1997; GFX10-NEXT:    s_endpgm
1998;
1999; GFX11-LABEL: insertelement_v_v8i16_s_s:
2000; GFX11:       ; %bb.0:
2001; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
2002; GFX11-NEXT:    s_lshr_b32 s4, s3, 1
2003; GFX11-NEXT:    s_and_b32 s1, s3, 1
2004; GFX11-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s4, 1
2005; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, s4, 2
2006; GFX11-NEXT:    s_lshl_b32 s3, s1, 4
2007; GFX11-NEXT:    v_cmp_eq_u32_e64 s1, s4, 3
2008; GFX11-NEXT:    s_and_b32 s2, s2, 0xffff
2009; GFX11-NEXT:    s_lshl_b32 s5, 0xffff, s3
2010; GFX11-NEXT:    s_lshl_b32 s2, s2, s3
2011; GFX11-NEXT:    s_not_b32 s3, s5
2012; GFX11-NEXT:    s_waitcnt vmcnt(0)
2013; GFX11-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc_lo
2014; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2015; GFX11-NEXT:    v_cndmask_b32_e64 v4, v4, v2, s0
2016; GFX11-NEXT:    v_cndmask_b32_e64 v4, v4, v3, s1
2017; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
2018; GFX11-NEXT:    v_and_or_b32 v6, v4, s3, s2
2019; GFX11-NEXT:    v_cmp_eq_u32_e64 s2, s4, 0
2020; GFX11-NEXT:    v_mov_b32_e32 v4, 0
2021; GFX11-NEXT:    v_mov_b32_e32 v5, 0
2022; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc_lo
2023; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
2024; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s2
2025; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s0
2026; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s1
2027; GFX11-NEXT:    global_store_b128 v[4:5], v[0:3], off
2028; GFX11-NEXT:    s_endpgm
2029  %vec = load <8 x i16>, ptr addrspace(1) %ptr
2030  %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx
2031  store <8 x i16> %insert, ptr addrspace(1) null
2032  ret void
2033}
2034
2035define amdgpu_ps void @insertelement_s_v8i16_v_s(ptr addrspace(4) inreg %ptr, i16 %val, i32 inreg %idx) {
2036; GFX9-LABEL: insertelement_s_v8i16_v_s:
2037; GFX9:       ; %bb.0:
2038; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
2039; GFX9-NEXT:    s_lshr_b32 s5, s4, 1
2040; GFX9-NEXT:    s_cmp_eq_u32 s5, 1
2041; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
2042; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 0
2043; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2044; GFX9-NEXT:    s_cselect_b32 s6, s1, s0
2045; GFX9-NEXT:    s_cmp_eq_u32 s5, 2
2046; GFX9-NEXT:    s_cselect_b32 s6, s2, s6
2047; GFX9-NEXT:    s_cmp_eq_u32 s5, 3
2048; GFX9-NEXT:    s_cselect_b32 s6, s3, s6
2049; GFX9-NEXT:    s_and_b32 s4, s4, 1
2050; GFX9-NEXT:    s_lshl_b32 s4, s4, 4
2051; GFX9-NEXT:    s_lshl_b32 s7, 0xffff, s4
2052; GFX9-NEXT:    s_andn2_b32 s6, s6, s7
2053; GFX9-NEXT:    v_mov_b32_e32 v1, s6
2054; GFX9-NEXT:    v_lshl_or_b32 v6, v0, s4, v1
2055; GFX9-NEXT:    v_mov_b32_e32 v0, s0
2056; GFX9-NEXT:    v_mov_b32_e32 v1, s1
2057; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
2058; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 1
2059; GFX9-NEXT:    v_mov_b32_e32 v2, s2
2060; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
2061; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 2
2062; GFX9-NEXT:    v_mov_b32_e32 v3, s3
2063; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
2064; GFX9-NEXT:    v_mov_b32_e32 v4, 0
2065; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 3
2066; GFX9-NEXT:    v_mov_b32_e32 v5, 0
2067; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
2068; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
2069; GFX9-NEXT:    s_endpgm
2070;
2071; GFX8-LABEL: insertelement_s_v8i16_v_s:
2072; GFX8:       ; %bb.0:
2073; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
2074; GFX8-NEXT:    s_lshr_b32 s5, s4, 1
2075; GFX8-NEXT:    s_cmp_eq_u32 s5, 1
2076; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 0
2077; GFX8-NEXT:    v_mov_b32_e32 v4, 0
2078; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2079; GFX8-NEXT:    s_cselect_b32 s6, s1, s0
2080; GFX8-NEXT:    s_cmp_eq_u32 s5, 2
2081; GFX8-NEXT:    s_cselect_b32 s6, s2, s6
2082; GFX8-NEXT:    s_cmp_eq_u32 s5, 3
2083; GFX8-NEXT:    s_cselect_b32 s6, s3, s6
2084; GFX8-NEXT:    s_and_b32 s4, s4, 1
2085; GFX8-NEXT:    s_lshl_b32 s4, s4, 4
2086; GFX8-NEXT:    v_mov_b32_e32 v1, s4
2087; GFX8-NEXT:    s_lshl_b32 s4, 0xffff, s4
2088; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
2089; GFX8-NEXT:    s_andn2_b32 s4, s6, s4
2090; GFX8-NEXT:    v_or_b32_e32 v6, s4, v0
2091; GFX8-NEXT:    v_mov_b32_e32 v0, s0
2092; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2093; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
2094; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 1
2095; GFX8-NEXT:    v_mov_b32_e32 v2, s2
2096; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
2097; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 2
2098; GFX8-NEXT:    v_mov_b32_e32 v3, s3
2099; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
2100; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 3
2101; GFX8-NEXT:    v_mov_b32_e32 v5, 0
2102; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
2103; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
2104; GFX8-NEXT:    s_endpgm
2105;
2106; GFX7-LABEL: insertelement_s_v8i16_v_s:
2107; GFX7:       ; %bb.0:
2108; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
2109; GFX7-NEXT:    s_lshr_b32 s5, s4, 1
2110; GFX7-NEXT:    s_cmp_eq_u32 s5, 1
2111; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
2112; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 0
2113; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2114; GFX7-NEXT:    s_cselect_b32 s6, s1, s0
2115; GFX7-NEXT:    s_cmp_eq_u32 s5, 2
2116; GFX7-NEXT:    s_cselect_b32 s6, s2, s6
2117; GFX7-NEXT:    s_cmp_eq_u32 s5, 3
2118; GFX7-NEXT:    s_cselect_b32 s6, s3, s6
2119; GFX7-NEXT:    s_and_b32 s4, s4, 1
2120; GFX7-NEXT:    s_lshl_b32 s4, s4, 4
2121; GFX7-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
2122; GFX7-NEXT:    s_lshl_b32 s4, 0xffff, s4
2123; GFX7-NEXT:    s_andn2_b32 s4, s6, s4
2124; GFX7-NEXT:    v_or_b32_e32 v4, s4, v0
2125; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2126; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2127; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
2128; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 1
2129; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2130; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
2131; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 2
2132; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2133; GFX7-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
2134; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 3
2135; GFX7-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
2136; GFX7-NEXT:    s_mov_b64 s[0:1], 0
2137; GFX7-NEXT:    s_mov_b32 s2, -1
2138; GFX7-NEXT:    s_mov_b32 s3, 0xf000
2139; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2140; GFX7-NEXT:    s_endpgm
2141;
2142; GFX10-LABEL: insertelement_s_v8i16_v_s:
2143; GFX10:       ; %bb.0:
2144; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
2145; GFX10-NEXT:    s_lshr_b32 s5, s4, 1
2146; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff, v0
2147; GFX10-NEXT:    s_cmp_eq_u32 s5, 1
2148; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s5, 0
2149; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2150; GFX10-NEXT:    s_cselect_b32 s6, s1, s0
2151; GFX10-NEXT:    s_cmp_eq_u32 s5, 2
2152; GFX10-NEXT:    v_mov_b32_e32 v0, s0
2153; GFX10-NEXT:    s_cselect_b32 s6, s2, s6
2154; GFX10-NEXT:    s_cmp_eq_u32 s5, 3
2155; GFX10-NEXT:    v_mov_b32_e32 v1, s1
2156; GFX10-NEXT:    s_cselect_b32 s6, s3, s6
2157; GFX10-NEXT:    s_and_b32 s4, s4, 1
2158; GFX10-NEXT:    v_mov_b32_e32 v2, s2
2159; GFX10-NEXT:    s_lshl_b32 s4, s4, 4
2160; GFX10-NEXT:    v_mov_b32_e32 v3, s3
2161; GFX10-NEXT:    s_lshl_b32 s7, 0xffff, s4
2162; GFX10-NEXT:    s_andn2_b32 s6, s6, s7
2163; GFX10-NEXT:    v_lshl_or_b32 v6, v4, s4, s6
2164; GFX10-NEXT:    v_mov_b32_e32 v4, 0
2165; GFX10-NEXT:    v_mov_b32_e32 v5, 0
2166; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc_lo
2167; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s5, 1
2168; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc_lo
2169; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s5, 2
2170; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
2171; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s5, 3
2172; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc_lo
2173; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
2174; GFX10-NEXT:    s_endpgm
2175;
2176; GFX11-LABEL: insertelement_s_v8i16_v_s:
2177; GFX11:       ; %bb.0:
2178; GFX11-NEXT:    s_load_b128 s[0:3], s[2:3], 0x0
2179; GFX11-NEXT:    s_lshr_b32 s5, s4, 1
2180; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v0
2181; GFX11-NEXT:    s_cmp_eq_u32 s5, 1
2182; GFX11-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s5, 0
2183; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2184; GFX11-NEXT:    s_cselect_b32 s6, s1, s0
2185; GFX11-NEXT:    s_cmp_eq_u32 s5, 2
2186; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
2187; GFX11-NEXT:    s_cselect_b32 s6, s2, s6
2188; GFX11-NEXT:    s_cmp_eq_u32 s5, 3
2189; GFX11-NEXT:    v_mov_b32_e32 v1, s1
2190; GFX11-NEXT:    s_cselect_b32 s6, s3, s6
2191; GFX11-NEXT:    s_and_b32 s4, s4, 1
2192; GFX11-NEXT:    v_mov_b32_e32 v2, s2
2193; GFX11-NEXT:    s_lshl_b32 s4, s4, 4
2194; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
2195; GFX11-NEXT:    s_lshl_b32 s7, 0xffff, s4
2196; GFX11-NEXT:    s_and_not1_b32 s6, s6, s7
2197; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2198; GFX11-NEXT:    v_lshl_or_b32 v6, v4, s4, s6
2199; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc_lo
2200; GFX11-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s5, 1
2201; GFX11-NEXT:    v_mov_b32_e32 v4, 0
2202; GFX11-NEXT:    v_mov_b32_e32 v5, 0
2203; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc_lo
2204; GFX11-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s5, 2
2205; GFX11-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
2206; GFX11-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s5, 3
2207; GFX11-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc_lo
2208; GFX11-NEXT:    global_store_b128 v[4:5], v[0:3], off
2209; GFX11-NEXT:    s_endpgm
2210  %vec = load <8 x i16>, ptr addrspace(4) %ptr
2211  %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx
2212  store <8 x i16> %insert, ptr addrspace(1) null
2213  ret void
2214}
2215
; insertelement_s_v8i16_s_v
; Dynamic-index insert of an i16 into an <8 x i16> vector.
;   %ptr - addrspace(4) pointer, inreg; the vector is loaded through it.
;   %val - i16, inreg; the expected code below reads it from s4.
;   %idx - i32, not inreg; the expected code below reads it from v0.
; The updated vector is stored to a null addrspace(1) pointer (presumably only
; to keep the insert live - confirm before relying on it).
; The assertion lines are tool-generated (see the note at the top of the file
; about utils/update_llc_test_checks.py); regenerate them instead of editing
; them by hand.
2216define amdgpu_ps void @insertelement_s_v8i16_s_v(ptr addrspace(4) inreg %ptr, i16 inreg %val, i32 %idx) {
2217; GFX9-LABEL: insertelement_s_v8i16_s_v:
2218; GFX9:       ; %bb.0:
2219; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[2:3], 0x0
2220; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 1, v0
2221; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
2222; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v4
2223; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
2224; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2225; GFX9-NEXT:    v_mov_b32_e32 v1, s8
2226; GFX9-NEXT:    v_mov_b32_e32 v2, s9
2227; GFX9-NEXT:    v_mov_b32_e32 v3, s10
2228; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
2229; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
2230; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
2231; GFX9-NEXT:    s_and_b32 s4, s4, 0xffff
2232; GFX9-NEXT:    v_mov_b32_e32 v3, 0xffff
2233; GFX9-NEXT:    v_mov_b32_e32 v5, s11
2234; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v4
2235; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v0, s4
2236; GFX9-NEXT:    v_lshlrev_b32_e32 v0, v0, v3
2237; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[2:3]
2238; GFX9-NEXT:    v_not_b32_e32 v0, v0
2239; GFX9-NEXT:    v_and_or_b32 v6, v1, v0, v2
2240; GFX9-NEXT:    v_mov_b32_e32 v0, s8
2241; GFX9-NEXT:    v_mov_b32_e32 v1, s9
2242; GFX9-NEXT:    v_mov_b32_e32 v2, s10
2243; GFX9-NEXT:    v_mov_b32_e32 v3, s11
2244; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
2245; GFX9-NEXT:    v_mov_b32_e32 v4, 0
2246; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[4:5]
2247; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
2248; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
2249; GFX9-NEXT:    v_mov_b32_e32 v5, 0
2250; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[2:3]
2251; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
2252; GFX9-NEXT:    s_endpgm
2253;
2254; GFX8-LABEL: insertelement_s_v8i16_s_v:
2255; GFX8:       ; %bb.0:
2256; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[2:3], 0x0
2257; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 1, v0
2258; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
2259; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v4
2260; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
2261; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2262; GFX8-NEXT:    v_mov_b32_e32 v1, s8
2263; GFX8-NEXT:    v_mov_b32_e32 v2, s9
2264; GFX8-NEXT:    v_mov_b32_e32 v3, s10
2265; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
2266; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
2267; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
2268; GFX8-NEXT:    s_and_b32 s4, s4, 0xffff
2269; GFX8-NEXT:    v_mov_b32_e32 v3, 0xffff
2270; GFX8-NEXT:    v_mov_b32_e32 v5, s11
2271; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v4
2272; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v0, s4
2273; GFX8-NEXT:    v_lshlrev_b32_e32 v0, v0, v3
2274; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[2:3]
2275; GFX8-NEXT:    v_not_b32_e32 v0, v0
2276; GFX8-NEXT:    v_and_b32_e32 v0, v1, v0
2277; GFX8-NEXT:    v_or_b32_e32 v6, v0, v2
2278; GFX8-NEXT:    v_mov_b32_e32 v0, s8
2279; GFX8-NEXT:    v_mov_b32_e32 v1, s9
2280; GFX8-NEXT:    v_mov_b32_e32 v2, s10
2281; GFX8-NEXT:    v_mov_b32_e32 v3, s11
2282; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
2283; GFX8-NEXT:    v_mov_b32_e32 v4, 0
2284; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[4:5]
2285; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
2286; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
2287; GFX8-NEXT:    v_mov_b32_e32 v5, 0
2288; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[2:3]
2289; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
2290; GFX8-NEXT:    s_endpgm
2291;
2292; GFX7-LABEL: insertelement_s_v8i16_s_v:
2293; GFX7:       ; %bb.0:
2294; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[2:3], 0x0
2295; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 1, v0
2296; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
2297; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
2298; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v4
2299; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2300; GFX7-NEXT:    v_mov_b32_e32 v1, s8
2301; GFX7-NEXT:    v_mov_b32_e32 v2, s9
2302; GFX7-NEXT:    v_mov_b32_e32 v3, s10
2303; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
2304; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
2305; GFX7-NEXT:    s_and_b32 s4, s4, 0xffff
2306; GFX7-NEXT:    v_mov_b32_e32 v5, s11
2307; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
2308; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v4
2309; GFX7-NEXT:    v_lshl_b32_e32 v2, s4, v0
2310; GFX7-NEXT:    v_lshl_b32_e32 v0, 0xffff, v0
2311; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[2:3]
2312; GFX7-NEXT:    v_not_b32_e32 v0, v0
2313; GFX7-NEXT:    v_and_b32_e32 v0, v1, v0
2314; GFX7-NEXT:    v_or_b32_e32 v5, v0, v2
2315; GFX7-NEXT:    v_mov_b32_e32 v0, s8
2316; GFX7-NEXT:    v_mov_b32_e32 v1, s9
2317; GFX7-NEXT:    v_mov_b32_e32 v2, s10
2318; GFX7-NEXT:    v_mov_b32_e32 v3, s11
2319; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
2320; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s[4:5]
2321; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
2322; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
2323; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[2:3]
2324; GFX7-NEXT:    s_mov_b64 s[0:1], 0
2325; GFX7-NEXT:    s_mov_b32 s2, -1
2326; GFX7-NEXT:    s_mov_b32 s3, 0xf000
2327; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2328; GFX7-NEXT:    s_endpgm
2329;
2330; GFX10-LABEL: insertelement_s_v8i16_s_v:
2331; GFX10:       ; %bb.0:
2332; GFX10-NEXT:    s_load_dwordx4 s[8:11], s[2:3], 0x0
2333; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 1, v0
2334; GFX10-NEXT:    v_and_b32_e32 v1, 1, v0
2335; GFX10-NEXT:    s_and_b32 s1, s4, 0xffff
2336; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
2337; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
2338; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 2, v6
2339; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, 0, v6
2340; GFX10-NEXT:    v_lshlrev_b32_e64 v2, v1, 0xffff
2341; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v1, s1
2342; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, 3, v6
2343; GFX10-NEXT:    v_not_b32_e32 v5, v2
2344; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2345; GFX10-NEXT:    v_mov_b32_e32 v0, s9
2346; GFX10-NEXT:    v_cndmask_b32_e32 v0, s8, v0, vcc_lo
2347; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s10, s0
2348; GFX10-NEXT:    v_cndmask_b32_e64 v7, v0, s11, s1
2349; GFX10-NEXT:    v_mov_b32_e32 v0, s8
2350; GFX10-NEXT:    v_mov_b32_e32 v1, s9
2351; GFX10-NEXT:    v_mov_b32_e32 v2, s10
2352; GFX10-NEXT:    v_mov_b32_e32 v3, s11
2353; GFX10-NEXT:    v_and_or_b32 v7, v7, v5, v4
2354; GFX10-NEXT:    v_mov_b32_e32 v4, 0
2355; GFX10-NEXT:    v_mov_b32_e32 v5, 0
2356; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v7, s2
2357; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc_lo
2358; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v7, s0
2359; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s1
2360; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
2361; GFX10-NEXT:    s_endpgm
2362;
2363; GFX11-LABEL: insertelement_s_v8i16_s_v:
2364; GFX11:       ; %bb.0:
2365; GFX11-NEXT:    s_load_b128 s[8:11], s[2:3], 0x0
2366; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 1, v0
2367; GFX11-NEXT:    v_and_b32_e32 v1, 1, v0
2368; GFX11-NEXT:    s_and_b32 s1, s4, 0xffff
2369; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1)
2370; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
2371; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 2, v6
2372; GFX11-NEXT:    v_cmp_eq_u32_e64 s2, 0, v6
2373; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2374; GFX11-NEXT:    v_dual_mov_b32 v0, s9 :: v_dual_lshlrev_b32 v1, 4, v1
2375; GFX11-NEXT:    v_cndmask_b32_e32 v0, s8, v0, vcc_lo
2376; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
2377; GFX11-NEXT:    v_lshlrev_b32_e64 v2, v1, 0xffff
2378; GFX11-NEXT:    v_lshlrev_b32_e64 v4, v1, s1
2379; GFX11-NEXT:    v_cmp_eq_u32_e64 s1, 3, v6
2380; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, s10, s0
2381; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
2382; GFX11-NEXT:    v_not_b32_e32 v5, v2
2383; GFX11-NEXT:    v_cndmask_b32_e64 v7, v0, s11, s1
2384; GFX11-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
2385; GFX11-NEXT:    v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11
2386; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
2387; GFX11-NEXT:    v_and_or_b32 v7, v7, v5, v4
2388; GFX11-NEXT:    v_mov_b32_e32 v4, 0
2389; GFX11-NEXT:    v_mov_b32_e32 v5, 0
2390; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc_lo
2391; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v7, s2
2392; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, v7, s0
2393; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s1
2394; GFX11-NEXT:    global_store_b128 v[4:5], v[0:3], off
2395; GFX11-NEXT:    s_endpgm
2396  %vec = load <8 x i16>, ptr addrspace(4) %ptr
2397  %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx
2398  store <8 x i16> %insert, ptr addrspace(1) null
2399  ret void
2400}
2401
; insertelement_s_v8i16_v_v
; Dynamic-index insert of an i16 into an <8 x i16> vector.
;   %ptr - addrspace(4) pointer, inreg; the vector is loaded through it.
;   %val - i16, not inreg; the expected code below reads it from v0.
;   %idx - i32, not inreg; the expected code below reads it from v1.
; The updated vector is stored to a null addrspace(1) pointer (presumably only
; to keep the insert live - confirm before relying on it).
; The assertion lines are tool-generated (see the note at the top of the file
; about utils/update_llc_test_checks.py); regenerate them instead of editing
; them by hand.
2402define amdgpu_ps void @insertelement_s_v8i16_v_v(ptr addrspace(4) inreg %ptr, i16 %val, i32 %idx) {
2403; GFX9-LABEL: insertelement_s_v8i16_v_v:
2404; GFX9:       ; %bb.0:
2405; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
2406; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 1, v1
2407; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
2408; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
2409; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v4
2410; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2411; GFX9-NEXT:    v_mov_b32_e32 v2, s4
2412; GFX9-NEXT:    v_mov_b32_e32 v3, s5
2413; GFX9-NEXT:    v_mov_b32_e32 v5, s6
2414; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
2415; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
2416; GFX9-NEXT:    v_mov_b32_e32 v3, 0xffff
2417; GFX9-NEXT:    v_mov_b32_e32 v6, s7
2418; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
2419; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v4
2420; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
2421; GFX9-NEXT:    v_lshlrev_b32_e32 v1, v1, v3
2422; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[2:3]
2423; GFX9-NEXT:    v_not_b32_e32 v1, v1
2424; GFX9-NEXT:    v_and_or_b32 v6, v2, v1, v0
2425; GFX9-NEXT:    v_mov_b32_e32 v0, s4
2426; GFX9-NEXT:    v_mov_b32_e32 v1, s5
2427; GFX9-NEXT:    v_mov_b32_e32 v2, s6
2428; GFX9-NEXT:    v_mov_b32_e32 v3, s7
2429; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
2430; GFX9-NEXT:    v_mov_b32_e32 v4, 0
2431; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[4:5]
2432; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
2433; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
2434; GFX9-NEXT:    v_mov_b32_e32 v5, 0
2435; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[2:3]
2436; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
2437; GFX9-NEXT:    s_endpgm
2438;
2439; GFX8-LABEL: insertelement_s_v8i16_v_v:
2440; GFX8:       ; %bb.0:
2441; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
2442; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 1, v1
2443; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
2444; GFX8-NEXT:    v_and_b32_e32 v1, 1, v1
2445; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v4
2446; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2447; GFX8-NEXT:    v_mov_b32_e32 v2, s4
2448; GFX8-NEXT:    v_mov_b32_e32 v3, s5
2449; GFX8-NEXT:    v_mov_b32_e32 v5, s6
2450; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
2451; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
2452; GFX8-NEXT:    v_mov_b32_e32 v3, 0xffff
2453; GFX8-NEXT:    v_mov_b32_e32 v6, s7
2454; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
2455; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v4
2456; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
2457; GFX8-NEXT:    v_lshlrev_b32_e32 v1, v1, v3
2458; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[2:3]
2459; GFX8-NEXT:    v_not_b32_e32 v1, v1
2460; GFX8-NEXT:    v_and_b32_e32 v1, v2, v1
2461; GFX8-NEXT:    v_or_b32_e32 v6, v1, v0
2462; GFX8-NEXT:    v_mov_b32_e32 v0, s4
2463; GFX8-NEXT:    v_mov_b32_e32 v1, s5
2464; GFX8-NEXT:    v_mov_b32_e32 v2, s6
2465; GFX8-NEXT:    v_mov_b32_e32 v3, s7
2466; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
2467; GFX8-NEXT:    v_mov_b32_e32 v4, 0
2468; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[4:5]
2469; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
2470; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
2471; GFX8-NEXT:    v_mov_b32_e32 v5, 0
2472; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[2:3]
2473; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
2474; GFX8-NEXT:    s_endpgm
2475;
2476; GFX7-LABEL: insertelement_s_v8i16_v_v:
2477; GFX7:       ; %bb.0:
2478; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
2479; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 1, v1
2480; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
2481; GFX7-NEXT:    v_and_b32_e32 v1, 1, v1
2482; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v4
2483; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2484; GFX7-NEXT:    v_mov_b32_e32 v2, s4
2485; GFX7-NEXT:    v_mov_b32_e32 v3, s5
2486; GFX7-NEXT:    v_mov_b32_e32 v5, s6
2487; GFX7-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
2488; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
2489; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
2490; GFX7-NEXT:    v_mov_b32_e32 v6, s7
2491; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
2492; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v4
2493; GFX7-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
2494; GFX7-NEXT:    v_lshl_b32_e32 v1, 0xffff, v1
2495; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[2:3]
2496; GFX7-NEXT:    v_not_b32_e32 v1, v1
2497; GFX7-NEXT:    v_and_b32_e32 v1, v2, v1
2498; GFX7-NEXT:    v_or_b32_e32 v5, v1, v0
2499; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2500; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2501; GFX7-NEXT:    v_mov_b32_e32 v2, s6
2502; GFX7-NEXT:    v_mov_b32_e32 v3, s7
2503; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
2504; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s[4:5]
2505; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
2506; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
2507; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[2:3]
2508; GFX7-NEXT:    s_mov_b64 s[0:1], 0
2509; GFX7-NEXT:    s_mov_b32 s2, -1
2510; GFX7-NEXT:    s_mov_b32 s3, 0xf000
2511; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2512; GFX7-NEXT:    s_endpgm
2513;
2514; GFX10-LABEL: insertelement_s_v8i16_v_v:
2515; GFX10:       ; %bb.0:
2516; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
2517; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 1, v1
2518; GFX10-NEXT:    v_and_b32_e32 v2, 1, v1
2519; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
2520; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
2521; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 2, v6
2522; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, 3, v6
2523; GFX10-NEXT:    s_mov_b32 null, 0
2524; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, 0, v6
2525; GFX10-NEXT:    v_lshlrev_b32_e64 v3, v2, 0xffff
2526; GFX10-NEXT:    v_lshlrev_b32_sdwa v4, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
2527; GFX10-NEXT:    v_not_b32_e32 v5, v3
2528; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2529; GFX10-NEXT:    v_mov_b32_e32 v1, s5
2530; GFX10-NEXT:    v_cndmask_b32_e32 v1, s4, v1, vcc_lo
2531; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s6, s0
2532; GFX10-NEXT:    v_cndmask_b32_e64 v7, v1, s7, s1
2533; GFX10-NEXT:    v_mov_b32_e32 v0, s4
2534; GFX10-NEXT:    v_mov_b32_e32 v1, s5
2535; GFX10-NEXT:    v_mov_b32_e32 v2, s6
2536; GFX10-NEXT:    v_mov_b32_e32 v3, s7
2537; GFX10-NEXT:    v_and_or_b32 v7, v7, v5, v4
2538; GFX10-NEXT:    v_mov_b32_e32 v4, 0
2539; GFX10-NEXT:    v_mov_b32_e32 v5, 0
2540; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v7, s2
2541; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc_lo
2542; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v7, s0
2543; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s1
2544; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
2545; GFX10-NEXT:    s_endpgm
2546;
2547; GFX11-LABEL: insertelement_s_v8i16_v_v:
2548; GFX11:       ; %bb.0:
2549; GFX11-NEXT:    s_load_b128 s[4:7], s[2:3], 0x0
2550; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 1, v1
2551; GFX11-NEXT:    v_and_b32_e32 v2, 1, v1
2552; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
2553; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
2554; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
2555; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 2, v6
2556; GFX11-NEXT:    v_cmp_eq_u32_e64 s1, 3, v6
2557; GFX11-NEXT:    v_cmp_eq_u32_e64 s2, 0, v6
2558; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2559; GFX11-NEXT:    v_dual_mov_b32 v1, s5 :: v_dual_lshlrev_b32 v2, 4, v2
2560; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
2561; GFX11-NEXT:    v_cndmask_b32_e32 v1, s4, v1, vcc_lo
2562; GFX11-NEXT:    v_lshlrev_b32_e64 v3, v2, 0xffff
2563; GFX11-NEXT:    v_lshlrev_b32_e32 v4, v2, v0
2564; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
2565; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s6, s0
2566; GFX11-NEXT:    v_not_b32_e32 v5, v3
2567; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
2568; GFX11-NEXT:    v_cndmask_b32_e64 v7, v1, s7, s1
2569; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
2570; GFX11-NEXT:    v_mov_b32_e32 v3, s7
2571; GFX11-NEXT:    v_and_or_b32 v7, v7, v5, v4
2572; GFX11-NEXT:    v_mov_b32_e32 v2, s6
2573; GFX11-NEXT:    v_mov_b32_e32 v4, 0
2574; GFX11-NEXT:    v_mov_b32_e32 v5, 0
2575; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
2576; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc_lo
2577; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v7, s2
2578; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, v7, s0
2579; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s1
2580; GFX11-NEXT:    global_store_b128 v[4:5], v[0:3], off
2581; GFX11-NEXT:    s_endpgm
2582  %vec = load <8 x i16>, ptr addrspace(4) %ptr
2583  %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx
2584  store <8 x i16> %insert, ptr addrspace(1) null
2585  ret void
2586}
2587
; insertelement_v_v8i16_s_v
; Dynamic-index insert of an i16 into an <8 x i16> vector.
;   %ptr - addrspace(1) pointer, not inreg; the expected code below loads
;          through v[0:1].
;   %val - i16, inreg; the expected code below reads it from s2.
;   %idx - i32, not inreg; the expected code below reads it from v2.
; The updated vector is stored to a null addrspace(1) pointer (presumably only
; to keep the insert live - confirm before relying on it).
; The assertion lines are tool-generated (see the note at the top of the file
; about utils/update_llc_test_checks.py); regenerate them instead of editing
; them by hand.
2588define amdgpu_ps void @insertelement_v_v8i16_s_v(ptr addrspace(1) %ptr, i16 inreg %val, i32 %idx) {
2589; GFX9-LABEL: insertelement_v_v8i16_s_v:
2590; GFX9:       ; %bb.0:
2591; GFX9-NEXT:    global_load_dwordx4 v[3:6], v[0:1], off
2592; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 1, v2
2593; GFX9-NEXT:    v_and_b32_e32 v2, 1, v2
2594; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff
2595; GFX9-NEXT:    s_and_b32 s0, s2, 0xffff
2596; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
2597; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
2598; GFX9-NEXT:    v_lshlrev_b32_e64 v9, v2, s0
2599; GFX9-NEXT:    v_lshlrev_b32_e32 v0, v2, v0
2600; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v1
2601; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v1
2602; GFX9-NEXT:    v_not_b32_e32 v0, v0
2603; GFX9-NEXT:    v_mov_b32_e32 v7, 0
2604; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v1
2605; GFX9-NEXT:    v_mov_b32_e32 v8, 0
2606; GFX9-NEXT:    s_waitcnt vmcnt(0)
2607; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
2608; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
2609; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[2:3]
2610; GFX9-NEXT:    v_and_or_b32 v9, v2, v0, v9
2611; GFX9-NEXT:    v_cndmask_b32_e64 v0, v3, v9, s[4:5]
2612; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v9, vcc
2613; GFX9-NEXT:    v_cndmask_b32_e64 v2, v5, v9, s[0:1]
2614; GFX9-NEXT:    v_cndmask_b32_e64 v3, v6, v9, s[2:3]
2615; GFX9-NEXT:    global_store_dwordx4 v[7:8], v[0:3], off
2616; GFX9-NEXT:    s_endpgm
2617;
2618; GFX8-LABEL: insertelement_v_v8i16_s_v:
2619; GFX8:       ; %bb.0:
2620; GFX8-NEXT:    flat_load_dwordx4 v[3:6], v[0:1]
2621; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 1, v2
2622; GFX8-NEXT:    v_and_b32_e32 v2, 1, v2
2623; GFX8-NEXT:    v_mov_b32_e32 v0, 0xffff
2624; GFX8-NEXT:    s_and_b32 s0, s2, 0xffff
2625; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
2626; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
2627; GFX8-NEXT:    v_lshlrev_b32_e64 v9, v2, s0
2628; GFX8-NEXT:    v_lshlrev_b32_e32 v0, v2, v0
2629; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v1
2630; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v1
2631; GFX8-NEXT:    v_not_b32_e32 v0, v0
2632; GFX8-NEXT:    v_mov_b32_e32 v7, 0
2633; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v1
2634; GFX8-NEXT:    v_mov_b32_e32 v8, 0
2635; GFX8-NEXT:    s_waitcnt vmcnt(0)
2636; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
2637; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
2638; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[2:3]
2639; GFX8-NEXT:    v_and_b32_e32 v0, v2, v0
2640; GFX8-NEXT:    v_or_b32_e32 v9, v0, v9
2641; GFX8-NEXT:    v_cndmask_b32_e64 v0, v3, v9, s[4:5]
2642; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v9, vcc
2643; GFX8-NEXT:    v_cndmask_b32_e64 v2, v5, v9, s[0:1]
2644; GFX8-NEXT:    v_cndmask_b32_e64 v3, v6, v9, s[2:3]
2645; GFX8-NEXT:    flat_store_dwordx4 v[7:8], v[0:3]
2646; GFX8-NEXT:    s_endpgm
2647;
2648; GFX7-LABEL: insertelement_v_v8i16_s_v:
2649; GFX7:       ; %bb.0:
2650; GFX7-NEXT:    s_mov_b32 s10, 0
2651; GFX7-NEXT:    s_mov_b32 s11, 0xf000
2652; GFX7-NEXT:    s_mov_b64 s[8:9], 0
2653; GFX7-NEXT:    buffer_load_dwordx4 v[3:6], v[0:1], s[8:11], 0 addr64
2654; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 1, v2
2655; GFX7-NEXT:    v_and_b32_e32 v1, 1, v2
2656; GFX7-NEXT:    s_and_b32 s0, s2, 0xffff
2657; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
2658; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
2659; GFX7-NEXT:    v_lshl_b32_e32 v2, s0, v1
2660; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v0
2661; GFX7-NEXT:    v_lshl_b32_e32 v1, 0xffff, v1
2662; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v0
2663; GFX7-NEXT:    v_not_b32_e32 v1, v1
2664; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
2665; GFX7-NEXT:    s_mov_b64 s[8:9], 0
2666; GFX7-NEXT:    s_mov_b32 s10, -1
2667; GFX7-NEXT:    s_waitcnt vmcnt(0)
2668; GFX7-NEXT:    v_cndmask_b32_e32 v7, v3, v4, vcc
2669; GFX7-NEXT:    v_cndmask_b32_e64 v7, v7, v5, s[0:1]
2670; GFX7-NEXT:    v_cndmask_b32_e64 v7, v7, v6, s[2:3]
2671; GFX7-NEXT:    v_and_b32_e32 v1, v7, v1
2672; GFX7-NEXT:    v_or_b32_e32 v7, v1, v2
2673; GFX7-NEXT:    v_cndmask_b32_e64 v0, v3, v7, s[4:5]
2674; GFX7-NEXT:    v_cndmask_b32_e32 v1, v4, v7, vcc
2675; GFX7-NEXT:    v_cndmask_b32_e64 v2, v5, v7, s[0:1]
2676; GFX7-NEXT:    v_cndmask_b32_e64 v3, v6, v7, s[2:3]
2677; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
2678; GFX7-NEXT:    s_endpgm
2679;
2680; GFX10-LABEL: insertelement_v_v8i16_s_v:
2681; GFX10:       ; %bb.0:
2682; GFX10-NEXT:    global_load_dwordx4 v[3:6], v[0:1], off
2683; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 1, v2
2684; GFX10-NEXT:    v_and_b32_e32 v0, 1, v2
2685; GFX10-NEXT:    s_and_b32 s1, s2, 0xffff
2686; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
2687; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
2688; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 2, v1
2689; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, 0, v1
2690; GFX10-NEXT:    v_lshlrev_b32_e64 v7, v0, 0xffff
2691; GFX10-NEXT:    v_lshlrev_b32_e64 v0, v0, s1
2692; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, 3, v1
2693; GFX10-NEXT:    v_not_b32_e32 v7, v7
2694; GFX10-NEXT:    s_waitcnt vmcnt(0)
2695; GFX10-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc_lo
2696; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s0
2697; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s1
2698; GFX10-NEXT:    v_and_or_b32 v9, v2, v7, v0
2699; GFX10-NEXT:    v_mov_b32_e32 v7, 0
2700; GFX10-NEXT:    v_mov_b32_e32 v8, 0
2701; GFX10-NEXT:    v_cndmask_b32_e64 v0, v3, v9, s2
2702; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v9, vcc_lo
2703; GFX10-NEXT:    v_cndmask_b32_e64 v2, v5, v9, s0
2704; GFX10-NEXT:    v_cndmask_b32_e64 v3, v6, v9, s1
2705; GFX10-NEXT:    global_store_dwordx4 v[7:8], v[0:3], off
2706; GFX10-NEXT:    s_endpgm
2707;
2708; GFX11-LABEL: insertelement_v_v8i16_s_v:
2709; GFX11:       ; %bb.0:
2710; GFX11-NEXT:    global_load_b128 v[3:6], v[0:1], off
2711; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 1, v2
2712; GFX11-NEXT:    v_and_b32_e32 v0, 1, v2
2713; GFX11-NEXT:    s_and_b32 s1, s2, 0xffff
2714; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2715; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
2716; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
2717; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 2, v1
2718; GFX11-NEXT:    v_cmp_eq_u32_e64 s2, 0, v1
2719; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
2720; GFX11-NEXT:    v_lshlrev_b32_e64 v7, v0, 0xffff
2721; GFX11-NEXT:    v_lshlrev_b32_e64 v0, v0, s1
2722; GFX11-NEXT:    v_cmp_eq_u32_e64 s1, 3, v1
2723; GFX11-NEXT:    v_not_b32_e32 v7, v7
2724; GFX11-NEXT:    s_waitcnt vmcnt(0)
2725; GFX11-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc_lo
2726; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2727; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s0
2728; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s1
2729; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2730; GFX11-NEXT:    v_and_or_b32 v9, v2, v7, v0
2731; GFX11-NEXT:    v_mov_b32_e32 v7, 0
2732; GFX11-NEXT:    v_dual_mov_b32 v8, 0 :: v_dual_cndmask_b32 v1, v4, v9
2733; GFX11-NEXT:    v_cndmask_b32_e64 v0, v3, v9, s2
2734; GFX11-NEXT:    v_cndmask_b32_e64 v2, v5, v9, s0
2735; GFX11-NEXT:    v_cndmask_b32_e64 v3, v6, v9, s1
2736; GFX11-NEXT:    global_store_b128 v[7:8], v[0:3], off
2737; GFX11-NEXT:    s_endpgm
2738  %vec = load <8 x i16>, ptr addrspace(1) %ptr
2739  %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx
2740  store <8 x i16> %insert, ptr addrspace(1) null
2741  ret void
2742}
2743
2744define amdgpu_ps void @insertelement_v_v8i16_v_s(ptr addrspace(1) %ptr, i16 %val, i32 inreg %idx) {
2745; GFX9-LABEL: insertelement_v_v8i16_v_s:
2746; GFX9:       ; %bb.0:
2747; GFX9-NEXT:    global_load_dwordx4 v[3:6], v[0:1], off
2748; GFX9-NEXT:    s_and_b32 s0, s2, 1
2749; GFX9-NEXT:    s_lshr_b32 s4, s2, 1
2750; GFX9-NEXT:    s_lshl_b32 s0, s0, 4
2751; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
2752; GFX9-NEXT:    s_lshl_b32 s0, 0xffff, s0
2753; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
2754; GFX9-NEXT:    s_not_b32 s5, s0
2755; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s4, 2
2756; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], s4, 3
2757; GFX9-NEXT:    v_mov_b32_e32 v7, 0
2758; GFX9-NEXT:    v_mov_b32_e32 v8, 0
2759; GFX9-NEXT:    s_waitcnt vmcnt(0)
2760; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
2761; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[0:1]
2762; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[2:3]
2763; GFX9-NEXT:    v_and_or_b32 v9, v1, s5, v0
2764; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, 0
2765; GFX9-NEXT:    v_cndmask_b32_e64 v0, v3, v9, s[4:5]
2766; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v9, vcc
2767; GFX9-NEXT:    v_cndmask_b32_e64 v2, v5, v9, s[0:1]
2768; GFX9-NEXT:    v_cndmask_b32_e64 v3, v6, v9, s[2:3]
2769; GFX9-NEXT:    global_store_dwordx4 v[7:8], v[0:3], off
2770; GFX9-NEXT:    s_endpgm
2771;
2772; GFX8-LABEL: insertelement_v_v8i16_v_s:
2773; GFX8:       ; %bb.0:
2774; GFX8-NEXT:    flat_load_dwordx4 v[3:6], v[0:1]
2775; GFX8-NEXT:    s_and_b32 s0, s2, 1
2776; GFX8-NEXT:    s_lshr_b32 s4, s2, 1
2777; GFX8-NEXT:    s_lshl_b32 s0, s0, 4
2778; GFX8-NEXT:    v_mov_b32_e32 v0, s0
2779; GFX8-NEXT:    s_lshl_b32 s0, 0xffff, s0
2780; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
2781; GFX8-NEXT:    s_not_b32 s5, s0
2782; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s4, 2
2783; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], s4, 3
2784; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
2785; GFX8-NEXT:    v_mov_b32_e32 v7, 0
2786; GFX8-NEXT:    v_mov_b32_e32 v8, 0
2787; GFX8-NEXT:    s_waitcnt vmcnt(0)
2788; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
2789; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[0:1]
2790; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[2:3]
2791; GFX8-NEXT:    v_and_b32_e32 v1, s5, v1
2792; GFX8-NEXT:    v_or_b32_e32 v9, v1, v0
2793; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, 0
2794; GFX8-NEXT:    v_cndmask_b32_e64 v0, v3, v9, s[4:5]
2795; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v9, vcc
2796; GFX8-NEXT:    v_cndmask_b32_e64 v2, v5, v9, s[0:1]
2797; GFX8-NEXT:    v_cndmask_b32_e64 v3, v6, v9, s[2:3]
2798; GFX8-NEXT:    flat_store_dwordx4 v[7:8], v[0:3]
2799; GFX8-NEXT:    s_endpgm
2800;
2801; GFX7-LABEL: insertelement_v_v8i16_v_s:
2802; GFX7:       ; %bb.0:
2803; GFX7-NEXT:    s_mov_b32 s10, 0
2804; GFX7-NEXT:    s_mov_b32 s11, 0xf000
2805; GFX7-NEXT:    s_mov_b64 s[8:9], 0
2806; GFX7-NEXT:    buffer_load_dwordx4 v[3:6], v[0:1], s[8:11], 0 addr64
2807; GFX7-NEXT:    s_and_b32 s0, s2, 1
2808; GFX7-NEXT:    s_lshr_b32 s4, s2, 1
2809; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v2
2810; GFX7-NEXT:    s_lshl_b32 s0, s0, 4
2811; GFX7-NEXT:    v_lshlrev_b32_e32 v0, s0, v0
2812; GFX7-NEXT:    s_lshl_b32 s0, 0xffff, s0
2813; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
2814; GFX7-NEXT:    s_not_b32 s5, s0
2815; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], s4, 2
2816; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], s4, 3
2817; GFX7-NEXT:    s_mov_b64 s[8:9], 0
2818; GFX7-NEXT:    s_mov_b32 s10, -1
2819; GFX7-NEXT:    s_waitcnt vmcnt(0)
2820; GFX7-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
2821; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[0:1]
2822; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[2:3]
2823; GFX7-NEXT:    v_and_b32_e32 v1, s5, v1
2824; GFX7-NEXT:    v_or_b32_e32 v7, v1, v0
2825; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, 0
2826; GFX7-NEXT:    v_cndmask_b32_e64 v0, v3, v7, s[4:5]
2827; GFX7-NEXT:    v_cndmask_b32_e32 v1, v4, v7, vcc
2828; GFX7-NEXT:    v_cndmask_b32_e64 v2, v5, v7, s[0:1]
2829; GFX7-NEXT:    v_cndmask_b32_e64 v3, v6, v7, s[2:3]
2830; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
2831; GFX7-NEXT:    s_endpgm
2832;
2833; GFX10-LABEL: insertelement_v_v8i16_v_s:
2834; GFX10:       ; %bb.0:
2835; GFX10-NEXT:    global_load_dwordx4 v[3:6], v[0:1], off
2836; GFX10-NEXT:    s_lshr_b32 s3, s2, 1
2837; GFX10-NEXT:    s_and_b32 s1, s2, 1
2838; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s3, 1
2839; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s3, 2
2840; GFX10-NEXT:    s_lshl_b32 s2, s1, 4
2841; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, s3, 3
2842; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
2843; GFX10-NEXT:    s_lshl_b32 s2, 0xffff, s2
2844; GFX10-NEXT:    v_mov_b32_e32 v7, 0
2845; GFX10-NEXT:    s_not_b32 s2, s2
2846; GFX10-NEXT:    v_mov_b32_e32 v8, 0
2847; GFX10-NEXT:    s_waitcnt vmcnt(0)
2848; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc_lo
2849; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s0
2850; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s1
2851; GFX10-NEXT:    v_and_or_b32 v9, v0, s2, v1
2852; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, s3, 0
2853; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v9, vcc_lo
2854; GFX10-NEXT:    v_cndmask_b32_e64 v0, v3, v9, s2
2855; GFX10-NEXT:    v_cndmask_b32_e64 v2, v5, v9, s0
2856; GFX10-NEXT:    v_cndmask_b32_e64 v3, v6, v9, s1
2857; GFX10-NEXT:    global_store_dwordx4 v[7:8], v[0:3], off
2858; GFX10-NEXT:    s_endpgm
2859;
2860; GFX11-LABEL: insertelement_v_v8i16_v_s:
2861; GFX11:       ; %bb.0:
2862; GFX11-NEXT:    global_load_b128 v[3:6], v[0:1], off
2863; GFX11-NEXT:    s_lshr_b32 s3, s2, 1
2864; GFX11-NEXT:    s_and_b32 s1, s2, 1
2865; GFX11-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s3, 1
2866; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, s3, 2
2867; GFX11-NEXT:    s_lshl_b32 s2, s1, 4
2868; GFX11-NEXT:    v_cmp_eq_u32_e64 s1, s3, 3
2869; GFX11-NEXT:    v_mov_b32_e32 v7, 0
2870; GFX11-NEXT:    v_dual_mov_b32 v8, 0 :: v_dual_and_b32 v1, 0xffff, v2
2871; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
2872; GFX11-NEXT:    v_lshlrev_b32_e32 v1, s2, v1
2873; GFX11-NEXT:    s_lshl_b32 s2, 0xffff, s2
2874; GFX11-NEXT:    s_not_b32 s2, s2
2875; GFX11-NEXT:    s_waitcnt vmcnt(0)
2876; GFX11-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc_lo
2877; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2878; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s0
2879; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s1
2880; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2881; GFX11-NEXT:    v_and_or_b32 v9, v0, s2, v1
2882; GFX11-NEXT:    v_cmp_eq_u32_e64 s2, s3, 0
2883; GFX11-NEXT:    v_cndmask_b32_e32 v1, v4, v9, vcc_lo
2884; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
2885; GFX11-NEXT:    v_cndmask_b32_e64 v0, v3, v9, s2
2886; GFX11-NEXT:    v_cndmask_b32_e64 v2, v5, v9, s0
2887; GFX11-NEXT:    v_cndmask_b32_e64 v3, v6, v9, s1
2888; GFX11-NEXT:    global_store_b128 v[7:8], v[0:3], off
2889; GFX11-NEXT:    s_endpgm
2890  %vec = load <8 x i16>, ptr addrspace(1) %ptr
2891  %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx
2892  store <8 x i16> %insert, ptr addrspace(1) null
2893  ret void
2894}
2895
; Dynamic-index insertelement into an <8 x i16> vector loaded from
; addrspace(1). None of %ptr, %val or %idx is marked inreg. %val is written at
; element %idx and the resulting vector is stored to the null addrspace(1)
; pointer. In the expected output the containing dword is picked by comparing
; (%idx >> 1) against 0..3, and the 16-bit value is merged into it with a
; 0xffff mask shifted left by (%idx & 1) * 16.
2896define amdgpu_ps void @insertelement_v_v8i16_v_v(ptr addrspace(1) %ptr, i16 %val, i32 %idx) {
2897; GFX9-LABEL: insertelement_v_v8i16_v_v:
2898; GFX9:       ; %bb.0:
2899; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
2900; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 1, v3
2901; GFX9-NEXT:    v_and_b32_e32 v3, 1, v3
2902; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff
2903; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 4, v3
2904; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
2905; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
2906; GFX9-NEXT:    v_lshlrev_b32_e32 v0, v3, v0
2907; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v1
2908; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v1
2909; GFX9-NEXT:    v_not_b32_e32 v0, v0
2910; GFX9-NEXT:    v_mov_b32_e32 v8, 0
2911; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v1
2912; GFX9-NEXT:    v_mov_b32_e32 v9, 0
2913; GFX9-NEXT:    s_waitcnt vmcnt(0)
2914; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
2915; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[0:1]
2916; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s[2:3]
2917; GFX9-NEXT:    v_and_or_b32 v3, v3, v0, v2
2918; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, v3, s[4:5]
2919; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
2920; GFX9-NEXT:    v_cndmask_b32_e64 v2, v6, v3, s[0:1]
2921; GFX9-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[2:3]
2922; GFX9-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
2923; GFX9-NEXT:    s_endpgm
2924;
2925; GFX8-LABEL: insertelement_v_v8i16_v_v:
2926; GFX8:       ; %bb.0:
2927; GFX8-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
2928; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 1, v3
2929; GFX8-NEXT:    v_and_b32_e32 v3, 1, v3
2930; GFX8-NEXT:    v_mov_b32_e32 v0, 0xffff
2931; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 4, v3
2932; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
2933; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
2934; GFX8-NEXT:    v_lshlrev_b32_e32 v0, v3, v0
2935; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v1
2936; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v1
2937; GFX8-NEXT:    v_not_b32_e32 v0, v0
2938; GFX8-NEXT:    v_mov_b32_e32 v8, 0
2939; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v1
2940; GFX8-NEXT:    v_mov_b32_e32 v9, 0
2941; GFX8-NEXT:    s_waitcnt vmcnt(0)
2942; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
2943; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[0:1]
2944; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s[2:3]
2945; GFX8-NEXT:    v_and_b32_e32 v0, v3, v0
2946; GFX8-NEXT:    v_or_b32_e32 v3, v0, v2
2947; GFX8-NEXT:    v_cndmask_b32_e64 v0, v4, v3, s[4:5]
2948; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
2949; GFX8-NEXT:    v_cndmask_b32_e64 v2, v6, v3, s[0:1]
2950; GFX8-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[2:3]
2951; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
2952; GFX8-NEXT:    s_endpgm
2953;
2954; GFX7-LABEL: insertelement_v_v8i16_v_v:
2955; GFX7:       ; %bb.0:
2956; GFX7-NEXT:    s_mov_b32 s10, 0
2957; GFX7-NEXT:    s_mov_b32 s11, 0xf000
2958; GFX7-NEXT:    s_mov_b64 s[8:9], 0
2959; GFX7-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64
2960; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 1, v3
2961; GFX7-NEXT:    v_and_b32_e32 v1, 1, v3
2962; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
2963; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
2964; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
2965; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v0
2966; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v1, v2
2967; GFX7-NEXT:    v_lshl_b32_e32 v1, 0xffff, v1
2968; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v0
2969; GFX7-NEXT:    v_not_b32_e32 v1, v1
2970; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
2971; GFX7-NEXT:    s_mov_b64 s[8:9], 0
2972; GFX7-NEXT:    s_mov_b32 s10, -1
2973; GFX7-NEXT:    s_waitcnt vmcnt(0)
2974; GFX7-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
2975; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[0:1]
2976; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s[2:3]
2977; GFX7-NEXT:    v_and_b32_e32 v1, v3, v1
2978; GFX7-NEXT:    v_or_b32_e32 v3, v1, v2
2979; GFX7-NEXT:    v_cndmask_b32_e64 v0, v4, v3, s[4:5]
2980; GFX7-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
2981; GFX7-NEXT:    v_cndmask_b32_e64 v2, v6, v3, s[0:1]
2982; GFX7-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[2:3]
2983; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
2984; GFX7-NEXT:    s_endpgm
2985;
2986; GFX10-LABEL: insertelement_v_v8i16_v_v:
2987; GFX10:       ; %bb.0:
2988; GFX10-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
2989; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 1, v3
2990; GFX10-NEXT:    v_and_b32_e32 v0, 1, v3
2991; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
2992; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
2993; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 2, v1
2994; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, 3, v1
2995; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, 0, v1
2996; GFX10-NEXT:    v_lshlrev_b32_e64 v8, v0, 0xffff
2997; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
2998; GFX10-NEXT:    v_not_b32_e32 v2, v8
2999; GFX10-NEXT:    v_mov_b32_e32 v8, 0
3000; GFX10-NEXT:    v_mov_b32_e32 v9, 0
3001; GFX10-NEXT:    s_waitcnt vmcnt(0)
3002; GFX10-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc_lo
3003; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s0
3004; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s1
3005; GFX10-NEXT:    v_and_or_b32 v3, v3, v2, v0
3006; GFX10-NEXT:    v_cndmask_b32_e64 v0, v4, v3, s2
3007; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc_lo
3008; GFX10-NEXT:    v_cndmask_b32_e64 v2, v6, v3, s0
3009; GFX10-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s1
3010; GFX10-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
3011; GFX10-NEXT:    s_endpgm
3012;
3013; GFX11-LABEL: insertelement_v_v8i16_v_v:
3014; GFX11:       ; %bb.0:
3015; GFX11-NEXT:    global_load_b128 v[4:7], v[0:1], off
3016; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 1, v3
3017; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3018; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
3019; GFX11-NEXT:    v_and_b32_e32 v0, 1, v3
3020; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 2, v1
3021; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
3022; GFX11-NEXT:    v_cmp_eq_u32_e64 s1, 3, v1
3023; GFX11-NEXT:    v_cmp_eq_u32_e64 s2, 0, v1
3024; GFX11-NEXT:    s_waitcnt vmcnt(0)
3025; GFX11-NEXT:    v_dual_cndmask_b32 v3, v4, v5 :: v_dual_lshlrev_b32 v0, 4, v0
3026; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
3027; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s0
3028; GFX11-NEXT:    v_lshlrev_b32_e64 v8, v0, 0xffff
3029; GFX11-NEXT:    v_lshlrev_b32_e32 v0, v0, v2
3030; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
3031; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s1
3032; GFX11-NEXT:    v_not_b32_e32 v2, v8
3033; GFX11-NEXT:    v_mov_b32_e32 v8, 0
3034; GFX11-NEXT:    v_mov_b32_e32 v9, 0
3035; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
3036; GFX11-NEXT:    v_and_or_b32 v3, v3, v2, v0
3037; GFX11-NEXT:    v_cndmask_b32_e64 v0, v4, v3, s2
3038; GFX11-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc_lo
3039; GFX11-NEXT:    v_cndmask_b32_e64 v2, v6, v3, s0
3040; GFX11-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s1
3041; GFX11-NEXT:    global_store_b128 v[8:9], v[0:3], off
3042; GFX11-NEXT:    s_endpgm
3043  %vec = load <8 x i16>, ptr addrspace(1) %ptr
3044  %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx
3045  store <8 x i16> %insert, ptr addrspace(1) null
3046  ret void
3047}
3048
; Dynamic-index insertelement into a <16 x i16> vector loaded from
; addrspace(4). %ptr, %val and %idx are all marked inreg. %val is written at
; element %idx and the 32-byte result is stored to the null addrspace(1)
; pointer, which the expected output emits as two 16-byte stores at addresses
; 0 and 16. The gfx900 checks select the containing dword with an
; s_cmp_eq_u32 / s_cselect_b32 chain; the checks for the other targets put
; (%idx >> 1) in m0 and use s_movrels_b32 / s_movreld_b32.
3049define amdgpu_ps void @insertelement_s_v16i16_s_s(ptr addrspace(4) inreg %ptr, i16 inreg %val, i32 inreg %idx) {
3050; GFX9-LABEL: insertelement_s_v16i16_s_s:
3051; GFX9:       ; %bb.0:
3052; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[2:3], 0x0
3053; GFX9-NEXT:    s_lshr_b32 s7, s5, 1
3054; GFX9-NEXT:    s_cmp_eq_u32 s7, 1
3055; GFX9-NEXT:    v_mov_b32_e32 v4, 0
3056; GFX9-NEXT:    v_mov_b32_e32 v5, 0
3057; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3058; GFX9-NEXT:    s_cselect_b32 s0, s9, s8
3059; GFX9-NEXT:    s_cmp_eq_u32 s7, 2
3060; GFX9-NEXT:    s_cselect_b32 s0, s10, s0
3061; GFX9-NEXT:    s_cmp_eq_u32 s7, 3
3062; GFX9-NEXT:    s_cselect_b32 s0, s11, s0
3063; GFX9-NEXT:    s_cmp_eq_u32 s7, 4
3064; GFX9-NEXT:    s_cselect_b32 s0, s12, s0
3065; GFX9-NEXT:    s_cmp_eq_u32 s7, 5
3066; GFX9-NEXT:    s_cselect_b32 s0, s13, s0
3067; GFX9-NEXT:    s_cmp_eq_u32 s7, 6
3068; GFX9-NEXT:    s_cselect_b32 s0, s14, s0
3069; GFX9-NEXT:    s_cmp_eq_u32 s7, 7
3070; GFX9-NEXT:    s_cselect_b32 s0, s15, s0
3071; GFX9-NEXT:    s_and_b32 s1, s5, 1
3072; GFX9-NEXT:    s_lshl_b32 s1, s1, 4
3073; GFX9-NEXT:    s_and_b32 s2, s4, 0xffff
3074; GFX9-NEXT:    s_lshl_b32 s2, s2, s1
3075; GFX9-NEXT:    s_lshl_b32 s1, 0xffff, s1
3076; GFX9-NEXT:    s_andn2_b32 s0, s0, s1
3077; GFX9-NEXT:    s_or_b32 s16, s0, s2
3078; GFX9-NEXT:    s_cmp_eq_u32 s7, 0
3079; GFX9-NEXT:    s_cselect_b32 s0, s16, s8
3080; GFX9-NEXT:    s_cmp_eq_u32 s7, 1
3081; GFX9-NEXT:    s_cselect_b32 s1, s16, s9
3082; GFX9-NEXT:    s_cmp_eq_u32 s7, 2
3083; GFX9-NEXT:    s_cselect_b32 s2, s16, s10
3084; GFX9-NEXT:    s_cmp_eq_u32 s7, 3
3085; GFX9-NEXT:    s_cselect_b32 s3, s16, s11
3086; GFX9-NEXT:    s_cmp_eq_u32 s7, 4
3087; GFX9-NEXT:    s_cselect_b32 s4, s16, s12
3088; GFX9-NEXT:    s_cmp_eq_u32 s7, 5
3089; GFX9-NEXT:    s_cselect_b32 s5, s16, s13
3090; GFX9-NEXT:    s_cmp_eq_u32 s7, 6
3091; GFX9-NEXT:    v_mov_b32_e32 v0, s0
3092; GFX9-NEXT:    s_cselect_b32 s6, s16, s14
3093; GFX9-NEXT:    s_cmp_eq_u32 s7, 7
3094; GFX9-NEXT:    v_mov_b32_e32 v1, s1
3095; GFX9-NEXT:    v_mov_b32_e32 v2, s2
3096; GFX9-NEXT:    v_mov_b32_e32 v3, s3
3097; GFX9-NEXT:    s_cselect_b32 s7, s16, s15
3098; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
3099; GFX9-NEXT:    v_mov_b32_e32 v4, 16
3100; GFX9-NEXT:    v_mov_b32_e32 v0, s4
3101; GFX9-NEXT:    v_mov_b32_e32 v5, 0
3102; GFX9-NEXT:    v_mov_b32_e32 v1, s5
3103; GFX9-NEXT:    v_mov_b32_e32 v2, s6
3104; GFX9-NEXT:    v_mov_b32_e32 v3, s7
3105; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
3106; GFX9-NEXT:    s_endpgm
3107;
3108; GFX8-LABEL: insertelement_s_v16i16_s_s:
3109; GFX8:       ; %bb.0:
3110; GFX8-NEXT:    s_load_dwordx8 s[8:15], s[2:3], 0x0
3111; GFX8-NEXT:    s_and_b32 s1, s5, 1
3112; GFX8-NEXT:    s_lshr_b32 m0, s5, 1
3113; GFX8-NEXT:    s_lshl_b32 s1, s1, 4
3114; GFX8-NEXT:    s_and_b32 s2, s4, 0xffff
3115; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3116; GFX8-NEXT:    s_movrels_b32 s0, s8
3117; GFX8-NEXT:    s_lshl_b32 s2, s2, s1
3118; GFX8-NEXT:    s_lshl_b32 s1, 0xffff, s1
3119; GFX8-NEXT:    s_andn2_b32 s0, s0, s1
3120; GFX8-NEXT:    s_or_b32 s0, s0, s2
3121; GFX8-NEXT:    s_movreld_b32 s8, s0
3122; GFX8-NEXT:    v_mov_b32_e32 v4, 0
3123; GFX8-NEXT:    v_mov_b32_e32 v0, s8
3124; GFX8-NEXT:    v_mov_b32_e32 v5, 0
3125; GFX8-NEXT:    v_mov_b32_e32 v1, s9
3126; GFX8-NEXT:    v_mov_b32_e32 v2, s10
3127; GFX8-NEXT:    v_mov_b32_e32 v3, s11
3128; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
3129; GFX8-NEXT:    v_mov_b32_e32 v4, 16
3130; GFX8-NEXT:    v_mov_b32_e32 v0, s12
3131; GFX8-NEXT:    v_mov_b32_e32 v5, 0
3132; GFX8-NEXT:    v_mov_b32_e32 v1, s13
3133; GFX8-NEXT:    v_mov_b32_e32 v2, s14
3134; GFX8-NEXT:    v_mov_b32_e32 v3, s15
3135; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
3136; GFX8-NEXT:    s_endpgm
3137;
3138; GFX7-LABEL: insertelement_s_v16i16_s_s:
3139; GFX7:       ; %bb.0:
3140; GFX7-NEXT:    s_load_dwordx8 s[8:15], s[2:3], 0x0
3141; GFX7-NEXT:    s_and_b32 s1, s5, 1
3142; GFX7-NEXT:    s_lshr_b32 m0, s5, 1
3143; GFX7-NEXT:    s_lshl_b32 s1, s1, 4
3144; GFX7-NEXT:    s_and_b32 s2, s4, 0xffff
3145; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3146; GFX7-NEXT:    s_movrels_b32 s0, s8
3147; GFX7-NEXT:    s_lshl_b32 s2, s2, s1
3148; GFX7-NEXT:    s_lshl_b32 s1, 0xffff, s1
3149; GFX7-NEXT:    s_andn2_b32 s0, s0, s1
3150; GFX7-NEXT:    s_or_b32 s0, s0, s2
3151; GFX7-NEXT:    s_movreld_b32 s8, s0
3152; GFX7-NEXT:    v_mov_b32_e32 v0, s8
3153; GFX7-NEXT:    s_mov_b64 s[0:1], 0
3154; GFX7-NEXT:    v_mov_b32_e32 v1, s9
3155; GFX7-NEXT:    v_mov_b32_e32 v2, s10
3156; GFX7-NEXT:    v_mov_b32_e32 v3, s11
3157; GFX7-NEXT:    s_mov_b32 s2, -1
3158; GFX7-NEXT:    s_mov_b32 s3, 0xf000
3159; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
3160; GFX7-NEXT:    s_mov_b64 s[0:1], 16
3161; GFX7-NEXT:    v_mov_b32_e32 v0, s12
3162; GFX7-NEXT:    v_mov_b32_e32 v1, s13
3163; GFX7-NEXT:    v_mov_b32_e32 v2, s14
3164; GFX7-NEXT:    v_mov_b32_e32 v3, s15
3165; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
3166; GFX7-NEXT:    s_endpgm
3167;
3168; GFX10-LABEL: insertelement_s_v16i16_s_s:
3169; GFX10:       ; %bb.0:
3170; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[2:3], 0x0
3171; GFX10-NEXT:    s_and_b32 s0, s5, 1
3172; GFX10-NEXT:    s_lshr_b32 m0, s5, 1
3173; GFX10-NEXT:    s_lshl_b32 s0, s0, 4
3174; GFX10-NEXT:    s_and_b32 s1, s4, 0xffff
3175; GFX10-NEXT:    s_lshl_b32 s2, 0xffff, s0
3176; GFX10-NEXT:    s_lshl_b32 s0, s1, s0
3177; GFX10-NEXT:    v_mov_b32_e32 v8, 0
3178; GFX10-NEXT:    v_mov_b32_e32 v9, 0
3179; GFX10-NEXT:    v_mov_b32_e32 v10, 16
3180; GFX10-NEXT:    v_mov_b32_e32 v11, 0
3181; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
3182; GFX10-NEXT:    s_movrels_b32 s3, s8
3183; GFX10-NEXT:    s_andn2_b32 s1, s3, s2
3184; GFX10-NEXT:    s_or_b32 s0, s1, s0
3185; GFX10-NEXT:    s_movreld_b32 s8, s0
3186; GFX10-NEXT:    v_mov_b32_e32 v0, s8
3187; GFX10-NEXT:    v_mov_b32_e32 v1, s9
3188; GFX10-NEXT:    v_mov_b32_e32 v2, s10
3189; GFX10-NEXT:    v_mov_b32_e32 v3, s11
3190; GFX10-NEXT:    v_mov_b32_e32 v4, s12
3191; GFX10-NEXT:    v_mov_b32_e32 v5, s13
3192; GFX10-NEXT:    v_mov_b32_e32 v6, s14
3193; GFX10-NEXT:    v_mov_b32_e32 v7, s15
3194; GFX10-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
3195; GFX10-NEXT:    global_store_dwordx4 v[10:11], v[4:7], off
3196; GFX10-NEXT:    s_endpgm
3197;
3198; GFX11-LABEL: insertelement_s_v16i16_s_s:
3199; GFX11:       ; %bb.0:
3200; GFX11-NEXT:    s_load_b256 s[8:15], s[2:3], 0x0
3201; GFX11-NEXT:    s_and_b32 s0, s5, 1
3202; GFX11-NEXT:    s_lshr_b32 m0, s5, 1
3203; GFX11-NEXT:    s_lshl_b32 s0, s0, 4
3204; GFX11-NEXT:    s_and_b32 s1, s4, 0xffff
3205; GFX11-NEXT:    s_lshl_b32 s2, 0xffff, s0
3206; GFX11-NEXT:    s_lshl_b32 s0, s1, s0
3207; GFX11-NEXT:    v_mov_b32_e32 v8, 0
3208; GFX11-NEXT:    v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v10, 16
3209; GFX11-NEXT:    v_mov_b32_e32 v11, 0
3210; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
3211; GFX11-NEXT:    s_movrels_b32 s3, s8
3212; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
3213; GFX11-NEXT:    s_and_not1_b32 s1, s3, s2
3214; GFX11-NEXT:    s_or_b32 s0, s1, s0
3215; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
3216; GFX11-NEXT:    s_movreld_b32 s8, s0
3217; GFX11-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
3218; GFX11-NEXT:    v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11
3219; GFX11-NEXT:    v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s13
3220; GFX11-NEXT:    v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s15
3221; GFX11-NEXT:    s_clause 0x1
3222; GFX11-NEXT:    global_store_b128 v[8:9], v[0:3], off
3223; GFX11-NEXT:    global_store_b128 v[10:11], v[4:7], off
3224; GFX11-NEXT:    s_endpgm
3225  %vec = load <16 x i16>, ptr addrspace(4) %ptr
3226  %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx
3227  store <16 x i16> %insert, ptr addrspace(1) null
3228  ret void
3229}
3230
; Dynamic-index insertelement into a <16 x i16> vector loaded from
; addrspace(1). %ptr is not marked inreg; %val and %idx are. %val is written
; at element %idx and the 32-byte result is stored to the null addrspace(1)
; pointer, which the expected output emits as two 16-byte stores at addresses
; 0 and 16. The gfx900 checks select the containing dword with a
; v_cmp_eq_u32 / v_cndmask_b32 chain; the checks for the other targets put
; (%idx >> 1) in m0 and use v_movrels_b32 / v_movreld_b32.
3231define amdgpu_ps void @insertelement_v_v16i16_s_s(ptr addrspace(1) %ptr, i16 inreg %val, i32 inreg %idx) {
3232; GFX9-LABEL: insertelement_v_v16i16_s_s:
3233; GFX9:       ; %bb.0:
3234; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off
3235; GFX9-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:16
3236; GFX9-NEXT:    s_and_b32 s0, s3, 1
3237; GFX9-NEXT:    s_lshr_b32 s12, s3, 1
3238; GFX9-NEXT:    s_and_b32 s1, s2, 0xffff
3239; GFX9-NEXT:    s_lshl_b32 s0, s0, 4
3240; GFX9-NEXT:    s_lshl_b32 s1, s1, s0
3241; GFX9-NEXT:    s_lshl_b32 s0, 0xffff, s0
3242; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s12, 1
3243; GFX9-NEXT:    s_not_b32 s13, s0
3244; GFX9-NEXT:    v_mov_b32_e32 v0, s1
3245; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s12, 2
3246; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], s12, 3
3247; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], s12, 4
3248; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], s12, 5
3249; GFX9-NEXT:    v_cmp_eq_u32_e64 s[8:9], s12, 6
3250; GFX9-NEXT:    v_cmp_eq_u32_e64 s[10:11], s12, 7
3251; GFX9-NEXT:    v_mov_b32_e32 v10, 0
3252; GFX9-NEXT:    v_mov_b32_e32 v11, 0
3253; GFX9-NEXT:    v_mov_b32_e32 v12, 16
3254; GFX9-NEXT:    v_mov_b32_e32 v13, 0
3255; GFX9-NEXT:    s_waitcnt vmcnt(1)
3256; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
3257; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[0:1]
3258; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[2:3]
3259; GFX9-NEXT:    s_waitcnt vmcnt(0)
3260; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[4:5]
3261; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[6:7]
3262; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v8, s[8:9]
3263; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[10:11]
3264; GFX9-NEXT:    v_and_or_b32 v14, v1, s13, v0
3265; GFX9-NEXT:    v_cmp_eq_u32_e64 s[12:13], s12, 0
3266; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, v14, s[12:13]
3267; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v14, vcc
3268; GFX9-NEXT:    v_cndmask_b32_e64 v2, v4, v14, s[0:1]
3269; GFX9-NEXT:    v_cndmask_b32_e64 v3, v5, v14, s[2:3]
3270; GFX9-NEXT:    v_cndmask_b32_e64 v4, v6, v14, s[4:5]
3271; GFX9-NEXT:    v_cndmask_b32_e64 v5, v7, v14, s[6:7]
3272; GFX9-NEXT:    v_cndmask_b32_e64 v6, v8, v14, s[8:9]
3273; GFX9-NEXT:    v_cndmask_b32_e64 v7, v9, v14, s[10:11]
3274; GFX9-NEXT:    global_store_dwordx4 v[10:11], v[0:3], off
3275; GFX9-NEXT:    global_store_dwordx4 v[12:13], v[4:7], off
3276; GFX9-NEXT:    s_endpgm
3277;
3278; GFX8-LABEL: insertelement_v_v16i16_s_s:
3279; GFX8:       ; %bb.0:
3280; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 16, v0
3281; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
3282; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
3283; GFX8-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
3284; GFX8-NEXT:    s_and_b32 s0, s3, 1
3285; GFX8-NEXT:    s_and_b32 s1, s2, 0xffff
3286; GFX8-NEXT:    s_lshl_b32 s0, s0, 4
3287; GFX8-NEXT:    s_lshr_b32 m0, s3, 1
3288; GFX8-NEXT:    s_lshl_b32 s1, s1, s0
3289; GFX8-NEXT:    s_lshl_b32 s0, 0xffff, s0
3290; GFX8-NEXT:    s_not_b32 s0, s0
3291; GFX8-NEXT:    v_mov_b32_e32 v8, 0
3292; GFX8-NEXT:    v_mov_b32_e32 v9, 0
3293; GFX8-NEXT:    v_mov_b32_e32 v10, 16
3294; GFX8-NEXT:    v_mov_b32_e32 v11, 0
3295; GFX8-NEXT:    s_waitcnt vmcnt(0)
3296; GFX8-NEXT:    v_movrels_b32_e32 v12, v0
3297; GFX8-NEXT:    v_and_b32_e32 v12, s0, v12
3298; GFX8-NEXT:    v_or_b32_e32 v12, s1, v12
3299; GFX8-NEXT:    v_movreld_b32_e32 v0, v12
3300; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
3301; GFX8-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
3302; GFX8-NEXT:    s_endpgm
3303;
3304; GFX7-LABEL: insertelement_v_v16i16_s_s:
3305; GFX7:       ; %bb.0:
3306; GFX7-NEXT:    s_mov_b32 s6, 0
3307; GFX7-NEXT:    s_mov_b32 s7, 0xf000
3308; GFX7-NEXT:    s_mov_b64 s[4:5], 0
3309; GFX7-NEXT:    buffer_load_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64
3310; GFX7-NEXT:    buffer_load_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:16
3311; GFX7-NEXT:    s_and_b32 s0, s3, 1
3312; GFX7-NEXT:    s_and_b32 s1, s2, 0xffff
3313; GFX7-NEXT:    s_lshl_b32 s0, s0, 4
3314; GFX7-NEXT:    s_lshr_b32 m0, s3, 1
3315; GFX7-NEXT:    s_lshl_b32 s1, s1, s0
3316; GFX7-NEXT:    s_lshl_b32 s0, 0xffff, s0
3317; GFX7-NEXT:    s_not_b32 s0, s0
3318; GFX7-NEXT:    s_mov_b64 s[4:5], 0
3319; GFX7-NEXT:    s_mov_b32 s6, -1
3320; GFX7-NEXT:    s_waitcnt vmcnt(0)
3321; GFX7-NEXT:    v_movrels_b32_e32 v0, v2
3322; GFX7-NEXT:    v_and_b32_e32 v0, s0, v0
3323; GFX7-NEXT:    v_or_b32_e32 v0, s1, v0
3324; GFX7-NEXT:    v_movreld_b32_e32 v2, v0
3325; GFX7-NEXT:    buffer_store_dwordx4 v[2:5], off, s[4:7], 0
3326; GFX7-NEXT:    s_mov_b64 s[4:5], 16
3327; GFX7-NEXT:    buffer_store_dwordx4 v[6:9], off, s[4:7], 0
3328; GFX7-NEXT:    s_endpgm
3329;
3330; GFX10-LABEL: insertelement_v_v16i16_s_s:
3331; GFX10:       ; %bb.0:
3332; GFX10-NEXT:    s_clause 0x1
3333; GFX10-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off
3334; GFX10-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:16
3335; GFX10-NEXT:    s_and_b32 s0, s3, 1
3336; GFX10-NEXT:    s_lshr_b32 m0, s3, 1
3337; GFX10-NEXT:    s_lshl_b32 s0, s0, 4
3338; GFX10-NEXT:    s_and_b32 s1, s2, 0xffff
3339; GFX10-NEXT:    s_lshl_b32 s2, 0xffff, s0
3340; GFX10-NEXT:    s_lshl_b32 s0, s1, s0
3341; GFX10-NEXT:    s_not_b32 s1, s2
3342; GFX10-NEXT:    v_mov_b32_e32 v10, 16
3343; GFX10-NEXT:    v_mov_b32_e32 v11, 0
3344; GFX10-NEXT:    s_waitcnt vmcnt(0)
3345; GFX10-NEXT:    v_movrels_b32_e32 v0, v2
3346; GFX10-NEXT:    v_and_or_b32 v12, v0, s1, s0
3347; GFX10-NEXT:    v_mov_b32_e32 v0, 0
3348; GFX10-NEXT:    v_mov_b32_e32 v1, 0
3349; GFX10-NEXT:    v_movreld_b32_e32 v2, v12
3350; GFX10-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
3351; GFX10-NEXT:    global_store_dwordx4 v[10:11], v[6:9], off
3352; GFX10-NEXT:    s_endpgm
3353;
3354; GFX11-LABEL: insertelement_v_v16i16_s_s:
3355; GFX11:       ; %bb.0:
3356; GFX11-NEXT:    s_clause 0x1
3357; GFX11-NEXT:    global_load_b128 v[2:5], v[0:1], off
3358; GFX11-NEXT:    global_load_b128 v[6:9], v[0:1], off offset:16
3359; GFX11-NEXT:    s_and_b32 s0, s3, 1
3360; GFX11-NEXT:    s_lshr_b32 m0, s3, 1
3361; GFX11-NEXT:    s_lshl_b32 s0, s0, 4
3362; GFX11-NEXT:    s_and_b32 s1, s2, 0xffff
3363; GFX11-NEXT:    s_lshl_b32 s2, 0xffff, s0
3364; GFX11-NEXT:    s_lshl_b32 s0, s1, s0
3365; GFX11-NEXT:    s_not_b32 s1, s2
3366; GFX11-NEXT:    v_mov_b32_e32 v10, 16
3367; GFX11-NEXT:    v_mov_b32_e32 v11, 0
3368; GFX11-NEXT:    s_waitcnt vmcnt(0)
3369; GFX11-NEXT:    v_movrels_b32_e32 v0, v2
3370; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
3371; GFX11-NEXT:    v_and_or_b32 v12, v0, s1, s0
3372; GFX11-NEXT:    v_mov_b32_e32 v0, 0
3373; GFX11-NEXT:    v_mov_b32_e32 v1, 0
3374; GFX11-NEXT:    v_movreld_b32_e32 v2, v12
3375; GFX11-NEXT:    s_clause 0x1
3376; GFX11-NEXT:    global_store_b128 v[0:1], v[2:5], off
3377; GFX11-NEXT:    global_store_b128 v[10:11], v[6:9], off
3378; GFX11-NEXT:    s_endpgm
3379  %vec = load <16 x i16>, ptr addrspace(1) %ptr
3380  %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx
3381  store <16 x i16> %insert, ptr addrspace(1) null
3382  ret void
3383}
3384
; Dynamic-index insertelement into a <16 x i16> vector loaded from
; addrspace(4). %ptr and %idx are marked inreg; %val is not. %val is written
; at element %idx and the 32-byte result is stored to the null addrspace(1)
; pointer, which the expected output emits as two 16-byte stores at addresses
; 0 and 16. The gfx900 checks read the containing dword with an
; s_cmp_eq_u32 / s_cselect_b32 chain and write it back with one
; v_cmp_eq_u32 / v_cndmask_b32 pair per dword; the checks for the other
; targets put (%idx >> 1) in m0, read with s_movrels_b32 and write with
; v_movreld_b32.
3385define amdgpu_ps void @insertelement_s_v16i16_v_s(ptr addrspace(4) inreg %ptr, i16 %val, i32 inreg %idx) {
3386; GFX9-LABEL: insertelement_s_v16i16_v_s:
3387; GFX9:       ; %bb.0:
3388; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[2:3], 0x0
3389; GFX9-NEXT:    s_lshr_b32 s0, s4, 1
3390; GFX9-NEXT:    s_cmp_eq_u32 s0, 1
3391; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
3392; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 0
3393; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3394; GFX9-NEXT:    s_cselect_b32 s1, s9, s8
3395; GFX9-NEXT:    s_cmp_eq_u32 s0, 2
3396; GFX9-NEXT:    s_cselect_b32 s1, s10, s1
3397; GFX9-NEXT:    s_cmp_eq_u32 s0, 3
3398; GFX9-NEXT:    s_cselect_b32 s1, s11, s1
3399; GFX9-NEXT:    s_cmp_eq_u32 s0, 4
3400; GFX9-NEXT:    s_cselect_b32 s1, s12, s1
3401; GFX9-NEXT:    s_cmp_eq_u32 s0, 5
3402; GFX9-NEXT:    s_cselect_b32 s1, s13, s1
3403; GFX9-NEXT:    s_cmp_eq_u32 s0, 6
3404; GFX9-NEXT:    s_cselect_b32 s1, s14, s1
3405; GFX9-NEXT:    s_cmp_eq_u32 s0, 7
3406; GFX9-NEXT:    s_cselect_b32 s1, s15, s1
3407; GFX9-NEXT:    s_and_b32 s2, s4, 1
3408; GFX9-NEXT:    s_lshl_b32 s2, s2, 4
3409; GFX9-NEXT:    s_lshl_b32 s3, 0xffff, s2
3410; GFX9-NEXT:    s_andn2_b32 s1, s1, s3
3411; GFX9-NEXT:    v_mov_b32_e32 v1, s1
3412; GFX9-NEXT:    v_lshl_or_b32 v8, v0, s2, v1
3413; GFX9-NEXT:    v_mov_b32_e32 v0, s8
3414; GFX9-NEXT:    v_mov_b32_e32 v1, s9
3415; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
3416; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 1
3417; GFX9-NEXT:    v_mov_b32_e32 v2, s10
3418; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v8, vcc
3419; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 2
3420; GFX9-NEXT:    v_mov_b32_e32 v3, s11
3421; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v8, vcc
3422; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 3
3423; GFX9-NEXT:    v_mov_b32_e32 v4, s12
3424; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v8, vcc
3425; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 4
3426; GFX9-NEXT:    v_mov_b32_e32 v5, s13
3427; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc
3428; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 5
3429; GFX9-NEXT:    v_mov_b32_e32 v6, s14
3430; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v8, vcc
3431; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 6
3432; GFX9-NEXT:    v_mov_b32_e32 v7, s15
3433; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc
3434; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 7
3435; GFX9-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
3436; GFX9-NEXT:    v_mov_b32_e32 v8, 0
3437; GFX9-NEXT:    v_mov_b32_e32 v9, 0
3438; GFX9-NEXT:    v_mov_b32_e32 v10, 16
3439; GFX9-NEXT:    v_mov_b32_e32 v11, 0
3440; GFX9-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
3441; GFX9-NEXT:    global_store_dwordx4 v[10:11], v[4:7], off
3442; GFX9-NEXT:    s_endpgm
3443;
3444; GFX8-LABEL: insertelement_s_v16i16_v_s:
3445; GFX8:       ; %bb.0:
3446; GFX8-NEXT:    s_load_dwordx8 s[8:15], s[2:3], 0x0
3447; GFX8-NEXT:    s_and_b32 s1, s4, 1
3448; GFX8-NEXT:    s_lshr_b32 m0, s4, 1
3449; GFX8-NEXT:    s_lshl_b32 s1, s1, 4
3450; GFX8-NEXT:    v_mov_b32_e32 v1, s1
3451; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3452; GFX8-NEXT:    s_movrels_b32 s0, s8
3453; GFX8-NEXT:    s_lshl_b32 s1, 0xffff, s1
3454; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
3455; GFX8-NEXT:    s_andn2_b32 s0, s0, s1
3456; GFX8-NEXT:    v_or_b32_e32 v8, s0, v0
3457; GFX8-NEXT:    v_mov_b32_e32 v0, s8
3458; GFX8-NEXT:    v_mov_b32_e32 v1, s9
3459; GFX8-NEXT:    v_mov_b32_e32 v2, s10
3460; GFX8-NEXT:    v_mov_b32_e32 v3, s11
3461; GFX8-NEXT:    v_mov_b32_e32 v4, s12
3462; GFX8-NEXT:    v_mov_b32_e32 v5, s13
3463; GFX8-NEXT:    v_mov_b32_e32 v6, s14
3464; GFX8-NEXT:    v_mov_b32_e32 v7, s15
3465; GFX8-NEXT:    v_movreld_b32_e32 v0, v8
3466; GFX8-NEXT:    v_mov_b32_e32 v8, 0
3467; GFX8-NEXT:    v_mov_b32_e32 v9, 0
3468; GFX8-NEXT:    v_mov_b32_e32 v10, 16
3469; GFX8-NEXT:    v_mov_b32_e32 v11, 0
3470; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
3471; GFX8-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
3472; GFX8-NEXT:    s_endpgm
3473;
3474; GFX7-LABEL: insertelement_s_v16i16_v_s:
3475; GFX7:       ; %bb.0:
3476; GFX7-NEXT:    s_load_dwordx8 s[8:15], s[2:3], 0x0
3477; GFX7-NEXT:    s_and_b32 s1, s4, 1
3478; GFX7-NEXT:    s_lshr_b32 m0, s4, 1
3479; GFX7-NEXT:    s_lshl_b32 s1, s1, 4
3480; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
3481; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3482; GFX7-NEXT:    s_movrels_b32 s0, s8
3483; GFX7-NEXT:    v_lshlrev_b32_e32 v0, s1, v0
3484; GFX7-NEXT:    s_lshl_b32 s1, 0xffff, s1
3485; GFX7-NEXT:    s_andn2_b32 s0, s0, s1
3486; GFX7-NEXT:    v_or_b32_e32 v8, s0, v0
3487; GFX7-NEXT:    v_mov_b32_e32 v0, s8
3488; GFX7-NEXT:    v_mov_b32_e32 v1, s9
3489; GFX7-NEXT:    v_mov_b32_e32 v2, s10
3490; GFX7-NEXT:    v_mov_b32_e32 v3, s11
3491; GFX7-NEXT:    v_mov_b32_e32 v4, s12
3492; GFX7-NEXT:    v_mov_b32_e32 v5, s13
3493; GFX7-NEXT:    v_mov_b32_e32 v6, s14
3494; GFX7-NEXT:    v_mov_b32_e32 v7, s15
3495; GFX7-NEXT:    s_mov_b64 s[0:1], 0
3496; GFX7-NEXT:    s_mov_b32 s2, -1
3497; GFX7-NEXT:    s_mov_b32 s3, 0xf000
3498; GFX7-NEXT:    v_movreld_b32_e32 v0, v8
3499; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
3500; GFX7-NEXT:    s_mov_b64 s[0:1], 16
3501; GFX7-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0
3502; GFX7-NEXT:    s_endpgm
3503;
3504; GFX10-LABEL: insertelement_s_v16i16_v_s:
3505; GFX10:       ; %bb.0:
3506; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[2:3], 0x0
3507; GFX10-NEXT:    s_and_b32 s0, s4, 1
3508; GFX10-NEXT:    s_lshr_b32 m0, s4, 1
3509; GFX10-NEXT:    s_lshl_b32 s0, s0, 4
3510; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff, v0
3511; GFX10-NEXT:    s_lshl_b32 s1, 0xffff, s0
3512; GFX10-NEXT:    v_mov_b32_e32 v10, 16
3513; GFX10-NEXT:    v_mov_b32_e32 v11, 0
3514; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
3515; GFX10-NEXT:    s_movrels_b32 s2, s8
3516; GFX10-NEXT:    v_mov_b32_e32 v0, s8
3517; GFX10-NEXT:    s_andn2_b32 s1, s2, s1
3518; GFX10-NEXT:    v_mov_b32_e32 v1, s9
3519; GFX10-NEXT:    v_lshl_or_b32 v12, v8, s0, s1
3520; GFX10-NEXT:    v_mov_b32_e32 v2, s10
3521; GFX10-NEXT:    v_mov_b32_e32 v3, s11
3522; GFX10-NEXT:    v_mov_b32_e32 v4, s12
3523; GFX10-NEXT:    v_mov_b32_e32 v5, s13
3524; GFX10-NEXT:    v_mov_b32_e32 v6, s14
3525; GFX10-NEXT:    v_mov_b32_e32 v7, s15
3526; GFX10-NEXT:    v_mov_b32_e32 v8, 0
3527; GFX10-NEXT:    v_mov_b32_e32 v9, 0
3528; GFX10-NEXT:    v_movreld_b32_e32 v0, v12
3529; GFX10-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
3530; GFX10-NEXT:    global_store_dwordx4 v[10:11], v[4:7], off
3531; GFX10-NEXT:    s_endpgm
3532;
3533; GFX11-LABEL: insertelement_s_v16i16_v_s:
3534; GFX11:       ; %bb.0:
3535; GFX11-NEXT:    s_load_b256 s[8:15], s[2:3], 0x0
3536; GFX11-NEXT:    s_and_b32 s0, s4, 1
3537; GFX11-NEXT:    s_lshr_b32 m0, s4, 1
3538; GFX11-NEXT:    s_lshl_b32 s0, s0, 4
3539; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff, v0
3540; GFX11-NEXT:    s_lshl_b32 s1, 0xffff, s0
3541; GFX11-NEXT:    v_mov_b32_e32 v10, 16
3542; GFX11-NEXT:    v_mov_b32_e32 v11, 0
3543; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
3544; GFX11-NEXT:    s_movrels_b32 s2, s8
3545; GFX11-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s11
3546; GFX11-NEXT:    s_and_not1_b32 s1, s2, s1
3547; GFX11-NEXT:    v_mov_b32_e32 v1, s9
3548; GFX11-NEXT:    v_lshl_or_b32 v12, v8, s0, s1
3549; GFX11-NEXT:    v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v5, s13
3550; GFX11-NEXT:    v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v7, s15
3551; GFX11-NEXT:    v_mov_b32_e32 v6, s14
3552; GFX11-NEXT:    v_mov_b32_e32 v8, 0
3553; GFX11-NEXT:    v_mov_b32_e32 v9, 0
3554; GFX11-NEXT:    v_movreld_b32_e32 v0, v12
3555; GFX11-NEXT:    s_clause 0x1
3556; GFX11-NEXT:    global_store_b128 v[8:9], v[0:3], off
3557; GFX11-NEXT:    global_store_b128 v[10:11], v[4:7], off
3558; GFX11-NEXT:    s_endpgm
3559  %vec = load <16 x i16>, ptr addrspace(4) %ptr
3560  %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx
3561  store <16 x i16> %insert, ptr addrspace(1) null
3562  ret void
3563}
3564
; Test case: insertelement with a dynamic index into a <16 x i16> vector that
; is loaded through an inreg addrspace(4) pointer. The inserted i16 %val is
; passed inreg, the i32 %idx is a plain (non-inreg) argument, and the result is
; stored through a null addrspace(1) pointer.
;
; As the assertions below show, %ptr arrives in s[2:3], %val in s4 and %idx in
; v0. On every tested target the expected code:
;   * computes the dword index (idx >> 1) and the bit offset ((idx & 1) << 4),
;   * picks the containing dword out of the eight loaded dwords with a
;     v_cmp_eq_u32 / v_cndmask_b32 chain,
;   * clears the 16-bit lane with a shifted 0xffff mask and ORs in the shifted
;     value (a single v_and_or_b32 on the gfx900, gfx1010 and gfx1100 runs,
;     separate and/or on fiji and hawaii),
;   * writes the merged dword back into the matching position with a second
;     v_cndmask_b32 chain before the two 128-bit stores.
;
; The assertion lines are autogenerated by utils/update_llc_test_checks.py (see
; the NOTE on the first line of the file); regenerate them instead of editing
; them by hand.
3565define amdgpu_ps void @insertelement_s_v16i16_s_v(ptr addrspace(4) inreg %ptr, i16 inreg %val, i32 %idx) {
3566; GFX9-LABEL: insertelement_s_v16i16_s_v:
3567; GFX9:       ; %bb.0:
3568; GFX9-NEXT:    s_load_dwordx8 s[16:23], s[2:3], 0x0
3569; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 1, v0
3570; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
3571; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v8
3572; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v8
3573; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3574; GFX9-NEXT:    v_mov_b32_e32 v1, s16
3575; GFX9-NEXT:    v_mov_b32_e32 v2, s17
3576; GFX9-NEXT:    v_mov_b32_e32 v3, s18
3577; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
3578; GFX9-NEXT:    v_mov_b32_e32 v4, s19
3579; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
3580; GFX9-NEXT:    v_mov_b32_e32 v5, s20
3581; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[2:3]
3582; GFX9-NEXT:    v_cmp_eq_u32_e64 s[14:15], 4, v8
3583; GFX9-NEXT:    v_mov_b32_e32 v6, s21
3584; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[14:15]
3585; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], 5, v8
3586; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
3587; GFX9-NEXT:    v_mov_b32_e32 v7, s22
3588; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[6:7]
3589; GFX9-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v8
3590; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
3591; GFX9-NEXT:    s_and_b32 s4, s4, 0xffff
3592; GFX9-NEXT:    v_mov_b32_e32 v3, 0xffff
3593; GFX9-NEXT:    v_mov_b32_e32 v9, s23
3594; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[8:9]
3595; GFX9-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v8
3596; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v0, s4
3597; GFX9-NEXT:    v_lshlrev_b32_e32 v0, v0, v3
3598; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[10:11]
3599; GFX9-NEXT:    v_not_b32_e32 v0, v0
3600; GFX9-NEXT:    v_and_or_b32 v9, v1, v0, v2
3601; GFX9-NEXT:    v_mov_b32_e32 v0, s16
3602; GFX9-NEXT:    v_mov_b32_e32 v1, s17
3603; GFX9-NEXT:    v_mov_b32_e32 v2, s18
3604; GFX9-NEXT:    v_mov_b32_e32 v3, s19
3605; GFX9-NEXT:    v_mov_b32_e32 v4, s20
3606; GFX9-NEXT:    v_mov_b32_e32 v5, s21
3607; GFX9-NEXT:    v_mov_b32_e32 v6, s22
3608; GFX9-NEXT:    v_mov_b32_e32 v7, s23
3609; GFX9-NEXT:    v_cmp_eq_u32_e64 s[12:13], 0, v8
3610; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v9, s[12:13]
3611; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
3612; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[0:1]
3613; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s[2:3]
3614; GFX9-NEXT:    v_cndmask_b32_e64 v4, v4, v9, s[14:15]
3615; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s[6:7]
3616; GFX9-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[8:9]
3617; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v9, s[10:11]
3618; GFX9-NEXT:    v_mov_b32_e32 v8, 0
3619; GFX9-NEXT:    v_mov_b32_e32 v9, 0
3620; GFX9-NEXT:    v_mov_b32_e32 v10, 16
3621; GFX9-NEXT:    v_mov_b32_e32 v11, 0
3622; GFX9-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
3623; GFX9-NEXT:    global_store_dwordx4 v[10:11], v[4:7], off
3624; GFX9-NEXT:    s_endpgm
3625;
3626; GFX8-LABEL: insertelement_s_v16i16_s_v:
3627; GFX8:       ; %bb.0:
3628; GFX8-NEXT:    s_load_dwordx8 s[16:23], s[2:3], 0x0
3629; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 1, v0
3630; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
3631; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v8
3632; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v8
3633; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3634; GFX8-NEXT:    v_mov_b32_e32 v1, s16
3635; GFX8-NEXT:    v_mov_b32_e32 v2, s17
3636; GFX8-NEXT:    v_mov_b32_e32 v3, s18
3637; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
3638; GFX8-NEXT:    v_mov_b32_e32 v4, s19
3639; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
3640; GFX8-NEXT:    v_mov_b32_e32 v5, s20
3641; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[2:3]
3642; GFX8-NEXT:    v_cmp_eq_u32_e64 s[14:15], 4, v8
3643; GFX8-NEXT:    v_mov_b32_e32 v6, s21
3644; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[14:15]
3645; GFX8-NEXT:    v_cmp_eq_u32_e64 s[6:7], 5, v8
3646; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
3647; GFX8-NEXT:    v_mov_b32_e32 v7, s22
3648; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[6:7]
3649; GFX8-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v8
3650; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
3651; GFX8-NEXT:    s_and_b32 s4, s4, 0xffff
3652; GFX8-NEXT:    v_mov_b32_e32 v3, 0xffff
3653; GFX8-NEXT:    v_mov_b32_e32 v9, s23
3654; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[8:9]
3655; GFX8-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v8
3656; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v0, s4
3657; GFX8-NEXT:    v_lshlrev_b32_e32 v0, v0, v3
3658; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[10:11]
3659; GFX8-NEXT:    v_not_b32_e32 v0, v0
3660; GFX8-NEXT:    v_and_b32_e32 v0, v1, v0
3661; GFX8-NEXT:    v_or_b32_e32 v9, v0, v2
3662; GFX8-NEXT:    v_mov_b32_e32 v0, s16
3663; GFX8-NEXT:    v_mov_b32_e32 v1, s17
3664; GFX8-NEXT:    v_mov_b32_e32 v2, s18
3665; GFX8-NEXT:    v_mov_b32_e32 v3, s19
3666; GFX8-NEXT:    v_mov_b32_e32 v4, s20
3667; GFX8-NEXT:    v_mov_b32_e32 v5, s21
3668; GFX8-NEXT:    v_mov_b32_e32 v6, s22
3669; GFX8-NEXT:    v_mov_b32_e32 v7, s23
3670; GFX8-NEXT:    v_cmp_eq_u32_e64 s[12:13], 0, v8
3671; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v9, s[12:13]
3672; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
3673; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[0:1]
3674; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s[2:3]
3675; GFX8-NEXT:    v_cndmask_b32_e64 v4, v4, v9, s[14:15]
3676; GFX8-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s[6:7]
3677; GFX8-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[8:9]
3678; GFX8-NEXT:    v_cndmask_b32_e64 v7, v7, v9, s[10:11]
3679; GFX8-NEXT:    v_mov_b32_e32 v8, 0
3680; GFX8-NEXT:    v_mov_b32_e32 v9, 0
3681; GFX8-NEXT:    v_mov_b32_e32 v10, 16
3682; GFX8-NEXT:    v_mov_b32_e32 v11, 0
3683; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
3684; GFX8-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
3685; GFX8-NEXT:    s_endpgm
3686;
3687; GFX7-LABEL: insertelement_s_v16i16_s_v:
3688; GFX7:       ; %bb.0:
3689; GFX7-NEXT:    s_load_dwordx8 s[16:23], s[2:3], 0x0
3690; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 1, v0
3691; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
3692; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v8
3693; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v8
3694; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3695; GFX7-NEXT:    v_mov_b32_e32 v1, s16
3696; GFX7-NEXT:    v_mov_b32_e32 v2, s17
3697; GFX7-NEXT:    v_mov_b32_e32 v3, s18
3698; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
3699; GFX7-NEXT:    v_mov_b32_e32 v4, s19
3700; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
3701; GFX7-NEXT:    v_mov_b32_e32 v5, s20
3702; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[2:3]
3703; GFX7-NEXT:    v_cmp_eq_u32_e64 s[14:15], 4, v8
3704; GFX7-NEXT:    v_mov_b32_e32 v6, s21
3705; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[14:15]
3706; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], 5, v8
3707; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
3708; GFX7-NEXT:    v_mov_b32_e32 v7, s22
3709; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[6:7]
3710; GFX7-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v8
3711; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
3712; GFX7-NEXT:    s_and_b32 s4, s4, 0xffff
3713; GFX7-NEXT:    v_mov_b32_e32 v9, s23
3714; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[8:9]
3715; GFX7-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v8
3716; GFX7-NEXT:    v_lshl_b32_e32 v2, s4, v0
3717; GFX7-NEXT:    v_lshl_b32_e32 v0, 0xffff, v0
3718; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[10:11]
3719; GFX7-NEXT:    v_not_b32_e32 v0, v0
3720; GFX7-NEXT:    v_and_b32_e32 v0, v1, v0
3721; GFX7-NEXT:    v_or_b32_e32 v9, v0, v2
3722; GFX7-NEXT:    v_mov_b32_e32 v0, s16
3723; GFX7-NEXT:    v_mov_b32_e32 v1, s17
3724; GFX7-NEXT:    v_mov_b32_e32 v2, s18
3725; GFX7-NEXT:    v_mov_b32_e32 v3, s19
3726; GFX7-NEXT:    v_cmp_eq_u32_e64 s[12:13], 0, v8
3727; GFX7-NEXT:    v_mov_b32_e32 v4, s20
3728; GFX7-NEXT:    v_mov_b32_e32 v5, s21
3729; GFX7-NEXT:    v_mov_b32_e32 v6, s22
3730; GFX7-NEXT:    v_mov_b32_e32 v7, s23
3731; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v9, s[12:13]
3732; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
3733; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[0:1]
3734; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s[2:3]
3735; GFX7-NEXT:    s_mov_b64 s[0:1], 0
3736; GFX7-NEXT:    s_mov_b32 s2, -1
3737; GFX7-NEXT:    s_mov_b32 s3, 0xf000
3738; GFX7-NEXT:    v_cndmask_b32_e64 v4, v4, v9, s[14:15]
3739; GFX7-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s[6:7]
3740; GFX7-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[8:9]
3741; GFX7-NEXT:    v_cndmask_b32_e64 v7, v7, v9, s[10:11]
3742; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
3743; GFX7-NEXT:    s_mov_b64 s[0:1], 16
3744; GFX7-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0
3745; GFX7-NEXT:    s_endpgm
3746;
3747; GFX10-LABEL: insertelement_s_v16i16_s_v:
3748; GFX10:       ; %bb.0:
3749; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[2:3], 0x0
3750; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 1, v0
3751; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
3752; GFX10-NEXT:    s_and_b32 s5, s4, 0xffff
3753; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v12
3754; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 2, v12
3755; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, 3, v12
3756; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, 4, v12
3757; GFX10-NEXT:    v_cmp_eq_u32_e64 s3, 5, v12
3758; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
3759; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 6, v12
3760; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, 0, v12
3761; GFX10-NEXT:    v_lshlrev_b32_e64 v2, v0, 0xffff
3762; GFX10-NEXT:    v_lshlrev_b32_e64 v8, v0, s5
3763; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 7, v12
3764; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
3765; GFX10-NEXT:    v_mov_b32_e32 v1, s9
3766; GFX10-NEXT:    v_not_b32_e32 v9, v2
3767; GFX10-NEXT:    v_cndmask_b32_e32 v1, s8, v1, vcc_lo
3768; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s10, s0
3769; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s11, s1
3770; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s12, s2
3771; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s13, s3
3772; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s14, s4
3773; GFX10-NEXT:    v_cndmask_b32_e64 v10, v1, s15, s5
3774; GFX10-NEXT:    v_mov_b32_e32 v0, s8
3775; GFX10-NEXT:    v_mov_b32_e32 v1, s9
3776; GFX10-NEXT:    v_mov_b32_e32 v2, s10
3777; GFX10-NEXT:    v_mov_b32_e32 v3, s11
3778; GFX10-NEXT:    v_and_or_b32 v13, v10, v9, v8
3779; GFX10-NEXT:    v_mov_b32_e32 v4, s12
3780; GFX10-NEXT:    v_mov_b32_e32 v5, s13
3781; GFX10-NEXT:    v_mov_b32_e32 v6, s14
3782; GFX10-NEXT:    v_mov_b32_e32 v7, s15
3783; GFX10-NEXT:    v_mov_b32_e32 v8, 0
3784; GFX10-NEXT:    v_mov_b32_e32 v9, 0
3785; GFX10-NEXT:    v_mov_b32_e32 v10, 16
3786; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v13, s6
3787; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v13, vcc_lo
3788; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v13, s0
3789; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v13, s1
3790; GFX10-NEXT:    v_mov_b32_e32 v11, 0
3791; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v13, s2
3792; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s3
3793; GFX10-NEXT:    v_cndmask_b32_e64 v6, v6, v13, s4
3794; GFX10-NEXT:    v_cndmask_b32_e64 v7, v7, v13, s5
3795; GFX10-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
3796; GFX10-NEXT:    global_store_dwordx4 v[10:11], v[4:7], off
3797; GFX10-NEXT:    s_endpgm
3798;
3799; GFX11-LABEL: insertelement_s_v16i16_s_v:
3800; GFX11:       ; %bb.0:
3801; GFX11-NEXT:    s_load_b256 s[8:15], s[2:3], 0x0
3802; GFX11-NEXT:    v_lshrrev_b32_e32 v12, 1, v0
3803; GFX11-NEXT:    s_and_b32 s5, s4, 0xffff
3804; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
3805; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
3806; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v12
3807; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 2, v12
3808; GFX11-NEXT:    v_cmp_eq_u32_e64 s1, 3, v12
3809; GFX11-NEXT:    v_cmp_eq_u32_e64 s2, 4, v12
3810; GFX11-NEXT:    v_cmp_eq_u32_e64 s3, 5, v12
3811; GFX11-NEXT:    v_cmp_eq_u32_e64 s4, 6, v12
3812; GFX11-NEXT:    v_cmp_eq_u32_e64 s6, 0, v12
3813; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
3814; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3815; GFX11-NEXT:    v_lshlrev_b32_e64 v2, v0, 0xffff
3816; GFX11-NEXT:    v_lshlrev_b32_e64 v8, v0, s5
3817; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
3818; GFX11-NEXT:    v_mov_b32_e32 v1, s9
3819; GFX11-NEXT:    v_cmp_eq_u32_e64 s5, 7, v12
3820; GFX11-NEXT:    v_not_b32_e32 v9, v2
3821; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
3822; GFX11-NEXT:    v_cndmask_b32_e32 v1, s8, v1, vcc_lo
3823; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s10, s0
3824; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3825; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s11, s1
3826; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s12, s2
3827; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3828; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s13, s3
3829; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s14, s4
3830; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
3831; GFX11-NEXT:    v_cndmask_b32_e64 v10, v1, s15, s5
3832; GFX11-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
3833; GFX11-NEXT:    v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11
3834; GFX11-NEXT:    v_and_or_b32 v13, v10, v9, v8
3835; GFX11-NEXT:    v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s13
3836; GFX11-NEXT:    v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s15
3837; GFX11-NEXT:    v_mov_b32_e32 v8, 0
3838; GFX11-NEXT:    v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v10, 16
3839; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v13, s6
3840; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v13, vcc_lo
3841; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, v13, s0
3842; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, v13, s1
3843; GFX11-NEXT:    v_mov_b32_e32 v11, 0
3844; GFX11-NEXT:    v_cndmask_b32_e64 v4, v4, v13, s2
3845; GFX11-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s3
3846; GFX11-NEXT:    v_cndmask_b32_e64 v6, v6, v13, s4
3847; GFX11-NEXT:    v_cndmask_b32_e64 v7, v7, v13, s5
3848; GFX11-NEXT:    s_clause 0x1
3849; GFX11-NEXT:    global_store_b128 v[8:9], v[0:3], off
3850; GFX11-NEXT:    global_store_b128 v[10:11], v[4:7], off
3851; GFX11-NEXT:    s_endpgm
3852  %vec = load <16 x i16>, ptr addrspace(4) %ptr
3853  %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx
3854  store <16 x i16> %insert, ptr addrspace(1) null
3855  ret void
3856}
3857
; Test case: insertelement with a dynamic index into a <16 x i16> vector that
; is loaded through an inreg addrspace(4) pointer. Both the inserted i16 %val
; and the i32 %idx are plain (non-inreg) arguments, and the result is stored
; through a null addrspace(1) pointer.
;
; As the assertions below show, %ptr arrives in s[2:3], %val in v0 and %idx in
; v1. The expected code has the same shape on every tested target: a
; v_cmp_eq_u32 / v_cndmask_b32 chain on (idx >> 1) selects the containing
; dword, the 16-bit lane at bit offset ((idx & 1) << 4) is cleared with a
; shifted 0xffff mask and replaced by the shifted value, and a second
; v_cndmask_b32 chain writes the merged dword back before the two 128-bit
; stores.
;
; The low 16 bits of %val are isolated differently per run line: the gfx900,
; fiji and gfx1010 output folds the masking into an SDWA shift
; (src1_sel:WORD_0), while the hawaii and gfx1100 output uses an explicit
; v_and_b32 with 0xffff followed by a plain shift.
;
; The assertion lines are autogenerated by utils/update_llc_test_checks.py (see
; the NOTE on the first line of the file); regenerate them instead of editing
; them by hand.
3858define amdgpu_ps void @insertelement_s_v16i16_v_v(ptr addrspace(4) inreg %ptr, i16 %val, i32 %idx) {
3859; GFX9-LABEL: insertelement_s_v16i16_v_v:
3860; GFX9:       ; %bb.0:
3861; GFX9-NEXT:    s_load_dwordx8 s[12:19], s[2:3], 0x0
3862; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 1, v1
3863; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
3864; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v8
3865; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v8
3866; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3867; GFX9-NEXT:    v_mov_b32_e32 v2, s12
3868; GFX9-NEXT:    v_mov_b32_e32 v3, s13
3869; GFX9-NEXT:    v_mov_b32_e32 v4, s14
3870; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
3871; GFX9-NEXT:    v_mov_b32_e32 v5, s15
3872; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
3873; GFX9-NEXT:    v_mov_b32_e32 v6, s16
3874; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[2:3]
3875; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 4, v8
3876; GFX9-NEXT:    v_mov_b32_e32 v7, s17
3877; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[4:5]
3878; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], 5, v8
3879; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
3880; GFX9-NEXT:    v_mov_b32_e32 v9, s18
3881; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v7, s[6:7]
3882; GFX9-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v8
3883; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
3884; GFX9-NEXT:    v_mov_b32_e32 v3, 0xffff
3885; GFX9-NEXT:    v_mov_b32_e32 v10, s19
3886; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[8:9]
3887; GFX9-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v8
3888; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
3889; GFX9-NEXT:    v_lshlrev_b32_e32 v1, v1, v3
3890; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s[10:11]
3891; GFX9-NEXT:    v_not_b32_e32 v1, v1
3892; GFX9-NEXT:    v_and_or_b32 v9, v2, v1, v0
3893; GFX9-NEXT:    v_mov_b32_e32 v0, s12
3894; GFX9-NEXT:    v_mov_b32_e32 v1, s13
3895; GFX9-NEXT:    v_mov_b32_e32 v2, s14
3896; GFX9-NEXT:    v_mov_b32_e32 v3, s15
3897; GFX9-NEXT:    v_mov_b32_e32 v4, s16
3898; GFX9-NEXT:    v_mov_b32_e32 v5, s17
3899; GFX9-NEXT:    v_mov_b32_e32 v6, s18
3900; GFX9-NEXT:    v_mov_b32_e32 v7, s19
3901; GFX9-NEXT:    v_cmp_eq_u32_e64 s[12:13], 0, v8
3902; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v9, s[12:13]
3903; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
3904; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[0:1]
3905; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s[2:3]
3906; GFX9-NEXT:    v_cndmask_b32_e64 v4, v4, v9, s[4:5]
3907; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s[6:7]
3908; GFX9-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[8:9]
3909; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v9, s[10:11]
3910; GFX9-NEXT:    v_mov_b32_e32 v8, 0
3911; GFX9-NEXT:    v_mov_b32_e32 v9, 0
3912; GFX9-NEXT:    v_mov_b32_e32 v10, 16
3913; GFX9-NEXT:    v_mov_b32_e32 v11, 0
3914; GFX9-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
3915; GFX9-NEXT:    global_store_dwordx4 v[10:11], v[4:7], off
3916; GFX9-NEXT:    s_endpgm
3917;
3918; GFX8-LABEL: insertelement_s_v16i16_v_v:
3919; GFX8:       ; %bb.0:
3920; GFX8-NEXT:    s_load_dwordx8 s[12:19], s[2:3], 0x0
3921; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 1, v1
3922; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
3923; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v8
3924; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v8
3925; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3926; GFX8-NEXT:    v_mov_b32_e32 v2, s12
3927; GFX8-NEXT:    v_mov_b32_e32 v3, s13
3928; GFX8-NEXT:    v_mov_b32_e32 v4, s14
3929; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
3930; GFX8-NEXT:    v_mov_b32_e32 v5, s15
3931; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
3932; GFX8-NEXT:    v_mov_b32_e32 v6, s16
3933; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[2:3]
3934; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 4, v8
3935; GFX8-NEXT:    v_mov_b32_e32 v7, s17
3936; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[4:5]
3937; GFX8-NEXT:    v_cmp_eq_u32_e64 s[6:7], 5, v8
3938; GFX8-NEXT:    v_and_b32_e32 v1, 1, v1
3939; GFX8-NEXT:    v_mov_b32_e32 v9, s18
3940; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v7, s[6:7]
3941; GFX8-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v8
3942; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
3943; GFX8-NEXT:    v_mov_b32_e32 v3, 0xffff
3944; GFX8-NEXT:    v_mov_b32_e32 v10, s19
3945; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[8:9]
3946; GFX8-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v8
3947; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
3948; GFX8-NEXT:    v_lshlrev_b32_e32 v1, v1, v3
3949; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s[10:11]
3950; GFX8-NEXT:    v_not_b32_e32 v1, v1
3951; GFX8-NEXT:    v_and_b32_e32 v1, v2, v1
3952; GFX8-NEXT:    v_or_b32_e32 v9, v1, v0
3953; GFX8-NEXT:    v_mov_b32_e32 v0, s12
3954; GFX8-NEXT:    v_mov_b32_e32 v1, s13
3955; GFX8-NEXT:    v_mov_b32_e32 v2, s14
3956; GFX8-NEXT:    v_mov_b32_e32 v3, s15
3957; GFX8-NEXT:    v_mov_b32_e32 v4, s16
3958; GFX8-NEXT:    v_mov_b32_e32 v5, s17
3959; GFX8-NEXT:    v_mov_b32_e32 v6, s18
3960; GFX8-NEXT:    v_mov_b32_e32 v7, s19
3961; GFX8-NEXT:    v_cmp_eq_u32_e64 s[12:13], 0, v8
3962; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v9, s[12:13]
3963; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
3964; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[0:1]
3965; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s[2:3]
3966; GFX8-NEXT:    v_cndmask_b32_e64 v4, v4, v9, s[4:5]
3967; GFX8-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s[6:7]
3968; GFX8-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[8:9]
3969; GFX8-NEXT:    v_cndmask_b32_e64 v7, v7, v9, s[10:11]
3970; GFX8-NEXT:    v_mov_b32_e32 v8, 0
3971; GFX8-NEXT:    v_mov_b32_e32 v9, 0
3972; GFX8-NEXT:    v_mov_b32_e32 v10, 16
3973; GFX8-NEXT:    v_mov_b32_e32 v11, 0
3974; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
3975; GFX8-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
3976; GFX8-NEXT:    s_endpgm
3977;
3978; GFX7-LABEL: insertelement_s_v16i16_v_v:
3979; GFX7:       ; %bb.0:
3980; GFX7-NEXT:    s_load_dwordx8 s[12:19], s[2:3], 0x0
3981; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 1, v1
3982; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
3983; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v8
3984; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v8
3985; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3986; GFX7-NEXT:    v_mov_b32_e32 v2, s12
3987; GFX7-NEXT:    v_mov_b32_e32 v3, s13
3988; GFX7-NEXT:    v_mov_b32_e32 v4, s14
3989; GFX7-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
3990; GFX7-NEXT:    v_mov_b32_e32 v5, s15
3991; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
3992; GFX7-NEXT:    v_mov_b32_e32 v6, s16
3993; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[2:3]
3994; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], 4, v8
3995; GFX7-NEXT:    v_mov_b32_e32 v7, s17
3996; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[4:5]
3997; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], 5, v8
3998; GFX7-NEXT:    v_and_b32_e32 v1, 1, v1
3999; GFX7-NEXT:    v_mov_b32_e32 v9, s18
4000; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v7, s[6:7]
4001; GFX7-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v8
4002; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
4003; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
4004; GFX7-NEXT:    v_mov_b32_e32 v10, s19
4005; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[8:9]
4006; GFX7-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v8
4007; GFX7-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
4008; GFX7-NEXT:    v_lshl_b32_e32 v1, 0xffff, v1
4009; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s[10:11]
4010; GFX7-NEXT:    v_not_b32_e32 v1, v1
4011; GFX7-NEXT:    v_and_b32_e32 v1, v2, v1
4012; GFX7-NEXT:    v_or_b32_e32 v9, v1, v0
4013; GFX7-NEXT:    v_mov_b32_e32 v0, s12
4014; GFX7-NEXT:    v_mov_b32_e32 v1, s13
4015; GFX7-NEXT:    v_mov_b32_e32 v2, s14
4016; GFX7-NEXT:    v_mov_b32_e32 v3, s15
4017; GFX7-NEXT:    v_mov_b32_e32 v4, s16
4018; GFX7-NEXT:    v_mov_b32_e32 v5, s17
4019; GFX7-NEXT:    v_mov_b32_e32 v6, s18
4020; GFX7-NEXT:    v_mov_b32_e32 v7, s19
4021; GFX7-NEXT:    v_cmp_eq_u32_e64 s[12:13], 0, v8
4022; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v9, s[12:13]
4023; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
4024; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[0:1]
4025; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s[2:3]
4026; GFX7-NEXT:    s_mov_b64 s[0:1], 0
4027; GFX7-NEXT:    s_mov_b32 s2, -1
4028; GFX7-NEXT:    s_mov_b32 s3, 0xf000
4029; GFX7-NEXT:    v_cndmask_b32_e64 v4, v4, v9, s[4:5]
4030; GFX7-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s[6:7]
4031; GFX7-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[8:9]
4032; GFX7-NEXT:    v_cndmask_b32_e64 v7, v7, v9, s[10:11]
4033; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
4034; GFX7-NEXT:    s_mov_b64 s[0:1], 16
4035; GFX7-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0
4036; GFX7-NEXT:    s_endpgm
4037;
4038; GFX10-LABEL: insertelement_s_v16i16_v_v:
4039; GFX10:       ; %bb.0:
4040; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[2:3], 0x0
4041; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 1, v1
4042; GFX10-NEXT:    v_and_b32_e32 v1, 1, v1
4043; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v12
4044; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 2, v12
4045; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, 3, v12
4046; GFX10-NEXT:    s_mov_b32 null, 0
4047; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, 4, v12
4048; GFX10-NEXT:    v_cmp_eq_u32_e64 s3, 5, v12
4049; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
4050; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 6, v12
4051; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 7, v12
4052; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, 0, v12
4053; GFX10-NEXT:    v_lshlrev_b32_e64 v3, v1, 0xffff
4054; GFX10-NEXT:    v_lshlrev_b32_sdwa v8, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
4055; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
4056; GFX10-NEXT:    v_mov_b32_e32 v2, s9
4057; GFX10-NEXT:    v_not_b32_e32 v9, v3
4058; GFX10-NEXT:    v_cndmask_b32_e32 v2, s8, v2, vcc_lo
4059; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s10, s0
4060; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s11, s1
4061; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s12, s2
4062; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s13, s3
4063; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s14, s4
4064; GFX10-NEXT:    v_cndmask_b32_e64 v10, v2, s15, s5
4065; GFX10-NEXT:    v_mov_b32_e32 v0, s8
4066; GFX10-NEXT:    v_mov_b32_e32 v1, s9
4067; GFX10-NEXT:    v_mov_b32_e32 v2, s10
4068; GFX10-NEXT:    v_mov_b32_e32 v3, s11
4069; GFX10-NEXT:    v_and_or_b32 v13, v10, v9, v8
4070; GFX10-NEXT:    v_mov_b32_e32 v4, s12
4071; GFX10-NEXT:    v_mov_b32_e32 v5, s13
4072; GFX10-NEXT:    v_mov_b32_e32 v6, s14
4073; GFX10-NEXT:    v_mov_b32_e32 v7, s15
4074; GFX10-NEXT:    v_mov_b32_e32 v8, 0
4075; GFX10-NEXT:    v_mov_b32_e32 v9, 0
4076; GFX10-NEXT:    v_mov_b32_e32 v10, 16
4077; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v13, s6
4078; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v13, vcc_lo
4079; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v13, s0
4080; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v13, s1
4081; GFX10-NEXT:    v_mov_b32_e32 v11, 0
4082; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v13, s2
4083; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s3
4084; GFX10-NEXT:    v_cndmask_b32_e64 v6, v6, v13, s4
4085; GFX10-NEXT:    v_cndmask_b32_e64 v7, v7, v13, s5
4086; GFX10-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
4087; GFX10-NEXT:    global_store_dwordx4 v[10:11], v[4:7], off
4088; GFX10-NEXT:    s_endpgm
4089;
4090; GFX11-LABEL: insertelement_s_v16i16_v_v:
4091; GFX11:       ; %bb.0:
4092; GFX11-NEXT:    s_load_b256 s[8:15], s[2:3], 0x0
4093; GFX11-NEXT:    v_lshrrev_b32_e32 v12, 1, v1
4094; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
4095; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
4096; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
4097; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v12
4098; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 2, v12
4099; GFX11-NEXT:    v_cmp_eq_u32_e64 s1, 3, v12
4100; GFX11-NEXT:    v_cmp_eq_u32_e64 s2, 4, v12
4101; GFX11-NEXT:    v_cmp_eq_u32_e64 s3, 5, v12
4102; GFX11-NEXT:    v_cmp_eq_u32_e64 s4, 6, v12
4103; GFX11-NEXT:    v_cmp_eq_u32_e64 s5, 7, v12
4104; GFX11-NEXT:    v_cmp_eq_u32_e64 s6, 0, v12
4105; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
4106; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
4107; GFX11-NEXT:    v_lshlrev_b32_e64 v3, v1, 0xffff
4108; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
4109; GFX11-NEXT:    v_mov_b32_e32 v2, s9
4110; GFX11-NEXT:    v_lshlrev_b32_e32 v8, v1, v0
4111; GFX11-NEXT:    v_not_b32_e32 v9, v3
4112; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
4113; GFX11-NEXT:    v_cndmask_b32_e32 v2, s8, v2, vcc_lo
4114; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, s10, s0
4115; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4116; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, s11, s1
4117; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, s12, s2
4118; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4119; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, s13, s3
4120; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, s14, s4
4121; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4122; GFX11-NEXT:    v_cndmask_b32_e64 v10, v2, s15, s5
4123; GFX11-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v5, s13
4124; GFX11-NEXT:    v_dual_mov_b32 v1, s9 :: v_dual_mov_b32 v2, s10
4125; GFX11-NEXT:    v_mov_b32_e32 v7, s15
4126; GFX11-NEXT:    v_mov_b32_e32 v3, s11
4127; GFX11-NEXT:    v_and_or_b32 v13, v10, v9, v8
4128; GFX11-NEXT:    v_mov_b32_e32 v4, s12
4129; GFX11-NEXT:    v_mov_b32_e32 v6, s14
4130; GFX11-NEXT:    v_mov_b32_e32 v8, 0
4131; GFX11-NEXT:    v_mov_b32_e32 v9, 0
4132; GFX11-NEXT:    v_dual_cndmask_b32 v1, v1, v13 :: v_dual_mov_b32 v10, 16
4133; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v13, s6
4134; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, v13, s0
4135; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, v13, s1
4136; GFX11-NEXT:    v_mov_b32_e32 v11, 0
4137; GFX11-NEXT:    v_cndmask_b32_e64 v4, v4, v13, s2
4138; GFX11-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s3
4139; GFX11-NEXT:    v_cndmask_b32_e64 v6, v6, v13, s4
4140; GFX11-NEXT:    v_cndmask_b32_e64 v7, v7, v13, s5
4141; GFX11-NEXT:    s_clause 0x1
4142; GFX11-NEXT:    global_store_b128 v[8:9], v[0:3], off
4143; GFX11-NEXT:    global_store_b128 v[10:11], v[4:7], off
4144; GFX11-NEXT:    s_endpgm
4145  %vec = load <16 x i16>, ptr addrspace(4) %ptr
4146  %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx
4147  store <16 x i16> %insert, ptr addrspace(1) null
4148  ret void
4149}
4150
; Dynamic-index insertelement into a <16 x i16> that is loaded through a
; non-inreg addrspace(1) pointer. Here %val is passed inreg and %idx is not;
; the updated vector is stored to a null addrspace(1) pointer.
;
; Every check prefix in this function expects the same overall shape: the
; 32-bit dword holding the element is chosen by (idx >> 1) using a
; v_cmp_eq / v_cndmask chain, the 0xffff-masked value is shifted left by
; (idx & 1) * 16 and merged into that dword under an inverted shifted 0xffff
; mask, and a second v_cndmask chain writes the merged dword back before two
; 128-bit stores.
;
; The check lines are autogenerated (see the file header); regenerate them
; with utils/update_llc_test_checks.py rather than editing them by hand.
4151define amdgpu_ps void @insertelement_v_v16i16_s_v(ptr addrspace(1) %ptr, i16 inreg %val, i32 %idx) {
4152; GFX9-LABEL: insertelement_v_v16i16_s_v:
4153; GFX9:       ; %bb.0:
4154; GFX9-NEXT:    global_load_dwordx4 v[3:6], v[0:1], off
4155; GFX9-NEXT:    global_load_dwordx4 v[7:10], v[0:1], off offset:16
4156; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 1, v2
4157; GFX9-NEXT:    v_and_b32_e32 v2, 1, v2
4158; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff
4159; GFX9-NEXT:    s_and_b32 s0, s2, 0xffff
4160; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
4161; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
4162; GFX9-NEXT:    v_lshlrev_b32_e64 v15, v2, s0
4163; GFX9-NEXT:    v_lshlrev_b32_e32 v0, v2, v0
4164; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v1
4165; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v1
4166; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 4, v1
4167; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], 5, v1
4168; GFX9-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v1
4169; GFX9-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v1
4170; GFX9-NEXT:    v_not_b32_e32 v0, v0
4171; GFX9-NEXT:    v_mov_b32_e32 v11, 0
4172; GFX9-NEXT:    v_cmp_eq_u32_e64 s[12:13], 0, v1
4173; GFX9-NEXT:    v_mov_b32_e32 v12, 0
4174; GFX9-NEXT:    v_mov_b32_e32 v13, 16
4175; GFX9-NEXT:    v_mov_b32_e32 v14, 0
4176; GFX9-NEXT:    s_waitcnt vmcnt(1)
4177; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
4178; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
4179; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[2:3]
4180; GFX9-NEXT:    s_waitcnt vmcnt(0)
4181; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v7, s[4:5]
4182; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v8, s[6:7]
4183; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[8:9]
4184; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s[10:11]
4185; GFX9-NEXT:    v_and_or_b32 v15, v2, v0, v15
4186; GFX9-NEXT:    v_cndmask_b32_e64 v0, v3, v15, s[12:13]
4187; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v15, vcc
4188; GFX9-NEXT:    v_cndmask_b32_e64 v2, v5, v15, s[0:1]
4189; GFX9-NEXT:    v_cndmask_b32_e64 v3, v6, v15, s[2:3]
4190; GFX9-NEXT:    v_cndmask_b32_e64 v4, v7, v15, s[4:5]
4191; GFX9-NEXT:    v_cndmask_b32_e64 v5, v8, v15, s[6:7]
4192; GFX9-NEXT:    v_cndmask_b32_e64 v6, v9, v15, s[8:9]
4193; GFX9-NEXT:    v_cndmask_b32_e64 v7, v10, v15, s[10:11]
4194; GFX9-NEXT:    global_store_dwordx4 v[11:12], v[0:3], off
4195; GFX9-NEXT:    global_store_dwordx4 v[13:14], v[4:7], off
4196; GFX9-NEXT:    s_endpgm
4197;
4198; GFX8-LABEL: insertelement_v_v16i16_s_v:
4199; GFX8:       ; %bb.0:
4200; GFX8-NEXT:    flat_load_dwordx4 v[3:6], v[0:1]
4201; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
4202; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
4203; GFX8-NEXT:    flat_load_dwordx4 v[7:10], v[0:1]
4204; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 1, v2
4205; GFX8-NEXT:    v_and_b32_e32 v2, 1, v2
4206; GFX8-NEXT:    v_mov_b32_e32 v0, 0xffff
4207; GFX8-NEXT:    s_and_b32 s0, s2, 0xffff
4208; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
4209; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
4210; GFX8-NEXT:    v_lshlrev_b32_e64 v15, v2, s0
4211; GFX8-NEXT:    v_lshlrev_b32_e32 v0, v2, v0
4212; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v1
4213; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v1
4214; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 4, v1
4215; GFX8-NEXT:    v_cmp_eq_u32_e64 s[6:7], 5, v1
4216; GFX8-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v1
4217; GFX8-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v1
4218; GFX8-NEXT:    v_not_b32_e32 v0, v0
4219; GFX8-NEXT:    v_mov_b32_e32 v11, 0
4220; GFX8-NEXT:    v_cmp_eq_u32_e64 s[12:13], 0, v1
4221; GFX8-NEXT:    v_mov_b32_e32 v12, 0
4222; GFX8-NEXT:    v_mov_b32_e32 v13, 16
4223; GFX8-NEXT:    v_mov_b32_e32 v14, 0
4224; GFX8-NEXT:    s_waitcnt vmcnt(1)
4225; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
4226; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
4227; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[2:3]
4228; GFX8-NEXT:    s_waitcnt vmcnt(0)
4229; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v7, s[4:5]
4230; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v8, s[6:7]
4231; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[8:9]
4232; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s[10:11]
4233; GFX8-NEXT:    v_and_b32_e32 v0, v2, v0
4234; GFX8-NEXT:    v_or_b32_e32 v15, v0, v15
4235; GFX8-NEXT:    v_cndmask_b32_e64 v0, v3, v15, s[12:13]
4236; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v15, vcc
4237; GFX8-NEXT:    v_cndmask_b32_e64 v2, v5, v15, s[0:1]
4238; GFX8-NEXT:    v_cndmask_b32_e64 v3, v6, v15, s[2:3]
4239; GFX8-NEXT:    v_cndmask_b32_e64 v4, v7, v15, s[4:5]
4240; GFX8-NEXT:    v_cndmask_b32_e64 v5, v8, v15, s[6:7]
4241; GFX8-NEXT:    v_cndmask_b32_e64 v6, v9, v15, s[8:9]
4242; GFX8-NEXT:    v_cndmask_b32_e64 v7, v10, v15, s[10:11]
4243; GFX8-NEXT:    flat_store_dwordx4 v[11:12], v[0:3]
4244; GFX8-NEXT:    flat_store_dwordx4 v[13:14], v[4:7]
4245; GFX8-NEXT:    s_endpgm
4246;
4247; GFX7-LABEL: insertelement_v_v16i16_s_v:
4248; GFX7:       ; %bb.0:
4249; GFX7-NEXT:    s_mov_b32 s18, 0
4250; GFX7-NEXT:    s_mov_b32 s19, 0xf000
4251; GFX7-NEXT:    s_mov_b64 s[16:17], 0
4252; GFX7-NEXT:    buffer_load_dwordx4 v[3:6], v[0:1], s[16:19], 0 addr64
4253; GFX7-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[16:19], 0 addr64 offset:16
4254; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 1, v2
4255; GFX7-NEXT:    v_and_b32_e32 v1, 1, v2
4256; GFX7-NEXT:    s_and_b32 s0, s2, 0xffff
4257; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
4258; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
4259; GFX7-NEXT:    v_lshl_b32_e32 v2, s0, v1
4260; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v0
4261; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v0
4262; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], 4, v0
4263; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], 5, v0
4264; GFX7-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v0
4265; GFX7-NEXT:    v_lshl_b32_e32 v1, 0xffff, v1
4266; GFX7-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v0
4267; GFX7-NEXT:    v_not_b32_e32 v1, v1
4268; GFX7-NEXT:    v_cmp_eq_u32_e64 s[12:13], 0, v0
4269; GFX7-NEXT:    s_mov_b64 s[16:17], 0
4270; GFX7-NEXT:    s_mov_b32 s18, -1
4271; GFX7-NEXT:    s_waitcnt vmcnt(1)
4272; GFX7-NEXT:    v_cndmask_b32_e32 v11, v3, v4, vcc
4273; GFX7-NEXT:    v_cndmask_b32_e64 v11, v11, v5, s[0:1]
4274; GFX7-NEXT:    v_cndmask_b32_e64 v11, v11, v6, s[2:3]
4275; GFX7-NEXT:    s_waitcnt vmcnt(0)
4276; GFX7-NEXT:    v_cndmask_b32_e64 v11, v11, v7, s[4:5]
4277; GFX7-NEXT:    v_cndmask_b32_e64 v11, v11, v8, s[6:7]
4278; GFX7-NEXT:    v_cndmask_b32_e64 v11, v11, v9, s[8:9]
4279; GFX7-NEXT:    v_cndmask_b32_e64 v11, v11, v10, s[10:11]
4280; GFX7-NEXT:    v_and_b32_e32 v1, v11, v1
4281; GFX7-NEXT:    v_or_b32_e32 v11, v1, v2
4282; GFX7-NEXT:    v_cndmask_b32_e64 v0, v3, v11, s[12:13]
4283; GFX7-NEXT:    v_cndmask_b32_e32 v1, v4, v11, vcc
4284; GFX7-NEXT:    v_cndmask_b32_e64 v2, v5, v11, s[0:1]
4285; GFX7-NEXT:    v_cndmask_b32_e64 v3, v6, v11, s[2:3]
4286; GFX7-NEXT:    v_cndmask_b32_e64 v4, v7, v11, s[4:5]
4287; GFX7-NEXT:    v_cndmask_b32_e64 v5, v8, v11, s[6:7]
4288; GFX7-NEXT:    v_cndmask_b32_e64 v6, v9, v11, s[8:9]
4289; GFX7-NEXT:    v_cndmask_b32_e64 v7, v10, v11, s[10:11]
4290; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
4291; GFX7-NEXT:    s_mov_b64 s[16:17], 16
4292; GFX7-NEXT:    buffer_store_dwordx4 v[4:7], off, s[16:19], 0
4293; GFX7-NEXT:    s_endpgm
4294;
4295; GFX10-LABEL: insertelement_v_v16i16_s_v:
4296; GFX10:       ; %bb.0:
4297; GFX10-NEXT:    s_clause 0x1
4298; GFX10-NEXT:    global_load_dwordx4 v[3:6], v[0:1], off
4299; GFX10-NEXT:    global_load_dwordx4 v[7:10], v[0:1], off offset:16
4300; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 1, v2
4301; GFX10-NEXT:    v_and_b32_e32 v2, 1, v2
4302; GFX10-NEXT:    s_and_b32 s5, s2, 0xffff
4303; GFX10-NEXT:    v_mov_b32_e32 v13, 16
4304; GFX10-NEXT:    v_mov_b32_e32 v14, 0
4305; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
4306; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 2, v0
4307; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, 3, v0
4308; GFX10-NEXT:    v_cmp_eq_u32_e64 s3, 4, v0
4309; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 5, v0
4310; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
4311; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, 6, v0
4312; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, 0, v0
4313; GFX10-NEXT:    v_lshlrev_b32_e64 v11, v2, 0xffff
4314; GFX10-NEXT:    v_lshlrev_b32_e64 v2, v2, s5
4315; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 7, v0
4316; GFX10-NEXT:    v_not_b32_e32 v11, v11
4317; GFX10-NEXT:    s_waitcnt vmcnt(1)
4318; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc_lo
4319; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s0
4320; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s1
4321; GFX10-NEXT:    s_waitcnt vmcnt(0)
4322; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s3
4323; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v8, s4
4324; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s2
4325; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v10, s5
4326; GFX10-NEXT:    v_and_or_b32 v15, v1, v11, v2
4327; GFX10-NEXT:    v_mov_b32_e32 v11, 0
4328; GFX10-NEXT:    v_mov_b32_e32 v12, 0
4329; GFX10-NEXT:    v_cndmask_b32_e64 v0, v3, v15, s6
4330; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v15, vcc_lo
4331; GFX10-NEXT:    v_cndmask_b32_e64 v2, v5, v15, s0
4332; GFX10-NEXT:    v_cndmask_b32_e64 v3, v6, v15, s1
4333; GFX10-NEXT:    v_cndmask_b32_e64 v4, v7, v15, s3
4334; GFX10-NEXT:    v_cndmask_b32_e64 v5, v8, v15, s4
4335; GFX10-NEXT:    v_cndmask_b32_e64 v6, v9, v15, s2
4336; GFX10-NEXT:    v_cndmask_b32_e64 v7, v10, v15, s5
4337; GFX10-NEXT:    global_store_dwordx4 v[11:12], v[0:3], off
4338; GFX10-NEXT:    global_store_dwordx4 v[13:14], v[4:7], off
4339; GFX10-NEXT:    s_endpgm
4340;
4341; GFX11-LABEL: insertelement_v_v16i16_s_v:
4342; GFX11:       ; %bb.0:
4343; GFX11-NEXT:    s_clause 0x1
4344; GFX11-NEXT:    global_load_b128 v[3:6], v[0:1], off
4345; GFX11-NEXT:    global_load_b128 v[7:10], v[0:1], off offset:16
4346; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 1, v2
4347; GFX11-NEXT:    s_and_b32 s5, s2, 0xffff
4348; GFX11-NEXT:    v_dual_mov_b32 v13, 16 :: v_dual_and_b32 v2, 1, v2
4349; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
4350; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
4351; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 2, v0
4352; GFX11-NEXT:    v_cmp_eq_u32_e64 s1, 3, v0
4353; GFX11-NEXT:    v_cmp_eq_u32_e64 s3, 4, v0
4354; GFX11-NEXT:    v_cmp_eq_u32_e64 s4, 5, v0
4355; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
4356; GFX11-NEXT:    v_cmp_eq_u32_e64 s2, 6, v0
4357; GFX11-NEXT:    v_cmp_eq_u32_e64 s6, 0, v0
4358; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
4359; GFX11-NEXT:    v_lshlrev_b32_e64 v11, v2, 0xffff
4360; GFX11-NEXT:    v_lshlrev_b32_e64 v2, v2, s5
4361; GFX11-NEXT:    v_cmp_eq_u32_e64 s5, 7, v0
4362; GFX11-NEXT:    v_not_b32_e32 v11, v11
4363; GFX11-NEXT:    s_waitcnt vmcnt(1)
4364; GFX11-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc_lo
4365; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4366; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s0
4367; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s1
4368; GFX11-NEXT:    s_waitcnt vmcnt(0)
4369; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4370; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s3
4371; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, v8, s4
4372; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4373; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s2
4374; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, v10, s5
4375; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
4376; GFX11-NEXT:    v_and_or_b32 v15, v1, v11, v2
4377; GFX11-NEXT:    v_mov_b32_e32 v11, 0
4378; GFX11-NEXT:    v_mov_b32_e32 v12, 0
4379; GFX11-NEXT:    v_dual_mov_b32 v14, 0 :: v_dual_cndmask_b32 v1, v4, v15
4380; GFX11-NEXT:    v_cndmask_b32_e64 v0, v3, v15, s6
4381; GFX11-NEXT:    v_cndmask_b32_e64 v2, v5, v15, s0
4382; GFX11-NEXT:    v_cndmask_b32_e64 v3, v6, v15, s1
4383; GFX11-NEXT:    v_cndmask_b32_e64 v4, v7, v15, s3
4384; GFX11-NEXT:    v_cndmask_b32_e64 v5, v8, v15, s4
4385; GFX11-NEXT:    v_cndmask_b32_e64 v6, v9, v15, s2
4386; GFX11-NEXT:    v_cndmask_b32_e64 v7, v10, v15, s5
4387; GFX11-NEXT:    s_clause 0x1
4388; GFX11-NEXT:    global_store_b128 v[11:12], v[0:3], off
4389; GFX11-NEXT:    global_store_b128 v[13:14], v[4:7], off
4390; GFX11-NEXT:    s_endpgm
4391  %vec = load <16 x i16>, ptr addrspace(1) %ptr
4392  %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx
4393  store <16 x i16> %insert, ptr addrspace(1) null
4394  ret void
4395}
4396
; Dynamic-index insertelement into a <16 x i16> that is loaded through a
; non-inreg addrspace(1) pointer. Here %idx is passed inreg and %val is not;
; the updated vector is stored to a null addrspace(1) pointer.
;
; The expected lowering differs by prefix. The GFX9 checks select the dword
; at (idx >> 1) with a v_cmp_eq / v_cndmask chain and write the merged dword
; back with a second chain. The GFX8, GFX7, GFX10 and GFX11 checks instead
; place (idx >> 1) in m0 and use v_movrels_b32 to read the dword and
; v_movreld_b32 to write it back in place. In all cases the value is shifted
; by (idx & 1) * 16 and merged under an inverted shifted 0xffff mask.
;
; The check lines are autogenerated (see the file header); regenerate them
; with utils/update_llc_test_checks.py rather than editing them by hand.
4397define amdgpu_ps void @insertelement_v_v16i16_v_s(ptr addrspace(1) %ptr, i16 %val, i32 inreg %idx) {
4398; GFX9-LABEL: insertelement_v_v16i16_v_s:
4399; GFX9:       ; %bb.0:
4400; GFX9-NEXT:    global_load_dwordx4 v[3:6], v[0:1], off
4401; GFX9-NEXT:    global_load_dwordx4 v[7:10], v[0:1], off offset:16
4402; GFX9-NEXT:    s_and_b32 s0, s2, 1
4403; GFX9-NEXT:    s_lshr_b32 s12, s2, 1
4404; GFX9-NEXT:    s_lshl_b32 s0, s0, 4
4405; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
4406; GFX9-NEXT:    s_lshl_b32 s0, 0xffff, s0
4407; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s12, 1
4408; GFX9-NEXT:    s_not_b32 s13, s0
4409; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s12, 2
4410; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], s12, 3
4411; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], s12, 4
4412; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], s12, 5
4413; GFX9-NEXT:    v_cmp_eq_u32_e64 s[8:9], s12, 6
4414; GFX9-NEXT:    v_cmp_eq_u32_e64 s[10:11], s12, 7
4415; GFX9-NEXT:    v_mov_b32_e32 v11, 0
4416; GFX9-NEXT:    v_mov_b32_e32 v12, 0
4417; GFX9-NEXT:    v_mov_b32_e32 v13, 16
4418; GFX9-NEXT:    v_mov_b32_e32 v14, 0
4419; GFX9-NEXT:    s_waitcnt vmcnt(1)
4420; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
4421; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[0:1]
4422; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[2:3]
4423; GFX9-NEXT:    s_waitcnt vmcnt(0)
4424; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[4:5]
4425; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v8, s[6:7]
4426; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[8:9]
4427; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v10, s[10:11]
4428; GFX9-NEXT:    v_and_or_b32 v15, v1, s13, v0
4429; GFX9-NEXT:    v_cmp_eq_u32_e64 s[12:13], s12, 0
4430; GFX9-NEXT:    v_cndmask_b32_e64 v0, v3, v15, s[12:13]
4431; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v15, vcc
4432; GFX9-NEXT:    v_cndmask_b32_e64 v2, v5, v15, s[0:1]
4433; GFX9-NEXT:    v_cndmask_b32_e64 v3, v6, v15, s[2:3]
4434; GFX9-NEXT:    v_cndmask_b32_e64 v4, v7, v15, s[4:5]
4435; GFX9-NEXT:    v_cndmask_b32_e64 v5, v8, v15, s[6:7]
4436; GFX9-NEXT:    v_cndmask_b32_e64 v6, v9, v15, s[8:9]
4437; GFX9-NEXT:    v_cndmask_b32_e64 v7, v10, v15, s[10:11]
4438; GFX9-NEXT:    global_store_dwordx4 v[11:12], v[0:3], off
4439; GFX9-NEXT:    global_store_dwordx4 v[13:14], v[4:7], off
4440; GFX9-NEXT:    s_endpgm
4441;
4442; GFX8-LABEL: insertelement_v_v16i16_v_s:
4443; GFX8:       ; %bb.0:
4444; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 16, v0
4445; GFX8-NEXT:    v_addc_u32_e32 v8, vcc, 0, v1, vcc
4446; GFX8-NEXT:    flat_load_dwordx4 v[3:6], v[0:1]
4447; GFX8-NEXT:    flat_load_dwordx4 v[7:10], v[7:8]
4448; GFX8-NEXT:    s_and_b32 s0, s2, 1
4449; GFX8-NEXT:    s_lshl_b32 s0, s0, 4
4450; GFX8-NEXT:    s_lshr_b32 m0, s2, 1
4451; GFX8-NEXT:    v_mov_b32_e32 v13, s0
4452; GFX8-NEXT:    s_lshl_b32 s0, 0xffff, s0
4453; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
4454; GFX8-NEXT:    s_not_b32 s0, s0
4455; GFX8-NEXT:    v_mov_b32_e32 v0, 0
4456; GFX8-NEXT:    v_mov_b32_e32 v1, 0
4457; GFX8-NEXT:    v_mov_b32_e32 v11, 16
4458; GFX8-NEXT:    v_mov_b32_e32 v12, 0
4459; GFX8-NEXT:    s_waitcnt vmcnt(0)
4460; GFX8-NEXT:    v_movrels_b32_e32 v13, v3
4461; GFX8-NEXT:    v_and_b32_e32 v13, s0, v13
4462; GFX8-NEXT:    v_or_b32_e32 v2, v13, v2
4463; GFX8-NEXT:    v_movreld_b32_e32 v3, v2
4464; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
4465; GFX8-NEXT:    flat_store_dwordx4 v[11:12], v[7:10]
4466; GFX8-NEXT:    s_endpgm
4467;
4468; GFX7-LABEL: insertelement_v_v16i16_v_s:
4469; GFX7:       ; %bb.0:
4470; GFX7-NEXT:    s_mov_b32 s6, 0
4471; GFX7-NEXT:    s_mov_b32 s7, 0xf000
4472; GFX7-NEXT:    s_mov_b64 s[4:5], 0
4473; GFX7-NEXT:    buffer_load_dwordx4 v[3:6], v[0:1], s[4:7], 0 addr64
4474; GFX7-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[4:7], 0 addr64 offset:16
4475; GFX7-NEXT:    s_and_b32 s0, s2, 1
4476; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v2
4477; GFX7-NEXT:    s_lshl_b32 s0, s0, 4
4478; GFX7-NEXT:    s_lshr_b32 m0, s2, 1
4479; GFX7-NEXT:    v_lshlrev_b32_e32 v0, s0, v0
4480; GFX7-NEXT:    s_lshl_b32 s0, 0xffff, s0
4481; GFX7-NEXT:    s_not_b32 s0, s0
4482; GFX7-NEXT:    s_mov_b64 s[4:5], 0
4483; GFX7-NEXT:    s_mov_b32 s6, -1
4484; GFX7-NEXT:    s_waitcnt vmcnt(0)
4485; GFX7-NEXT:    v_movrels_b32_e32 v1, v3
4486; GFX7-NEXT:    v_and_b32_e32 v1, s0, v1
4487; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
4488; GFX7-NEXT:    v_movreld_b32_e32 v3, v0
4489; GFX7-NEXT:    buffer_store_dwordx4 v[3:6], off, s[4:7], 0
4490; GFX7-NEXT:    s_mov_b64 s[4:5], 16
4491; GFX7-NEXT:    buffer_store_dwordx4 v[7:10], off, s[4:7], 0
4492; GFX7-NEXT:    s_endpgm
4493;
4494; GFX10-LABEL: insertelement_v_v16i16_v_s:
4495; GFX10:       ; %bb.0:
4496; GFX10-NEXT:    s_clause 0x1
4497; GFX10-NEXT:    global_load_dwordx4 v[3:6], v[0:1], off
4498; GFX10-NEXT:    global_load_dwordx4 v[7:10], v[0:1], off offset:16
4499; GFX10-NEXT:    s_and_b32 s0, s2, 1
4500; GFX10-NEXT:    s_lshr_b32 m0, s2, 1
4501; GFX10-NEXT:    s_lshl_b32 s0, s0, 4
4502; GFX10-NEXT:    v_mov_b32_e32 v11, 16
4503; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
4504; GFX10-NEXT:    s_lshl_b32 s0, 0xffff, s0
4505; GFX10-NEXT:    v_mov_b32_e32 v12, 0
4506; GFX10-NEXT:    s_not_b32 s0, s0
4507; GFX10-NEXT:    s_waitcnt vmcnt(0)
4508; GFX10-NEXT:    v_movrels_b32_e32 v1, v3
4509; GFX10-NEXT:    v_and_or_b32 v2, v1, s0, v0
4510; GFX10-NEXT:    v_mov_b32_e32 v0, 0
4511; GFX10-NEXT:    v_mov_b32_e32 v1, 0
4512; GFX10-NEXT:    v_movreld_b32_e32 v3, v2
4513; GFX10-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off
4514; GFX10-NEXT:    global_store_dwordx4 v[11:12], v[7:10], off
4515; GFX10-NEXT:    s_endpgm
4516;
4517; GFX11-LABEL: insertelement_v_v16i16_v_s:
4518; GFX11:       ; %bb.0:
4519; GFX11-NEXT:    s_clause 0x1
4520; GFX11-NEXT:    global_load_b128 v[3:6], v[0:1], off
4521; GFX11-NEXT:    global_load_b128 v[7:10], v[0:1], off offset:16
4522; GFX11-NEXT:    v_dual_mov_b32 v11, 16 :: v_dual_and_b32 v0, 0xffff, v2
4523; GFX11-NEXT:    s_and_b32 s0, s2, 1
4524; GFX11-NEXT:    s_lshr_b32 m0, s2, 1
4525; GFX11-NEXT:    s_lshl_b32 s0, s0, 4
4526; GFX11-NEXT:    v_mov_b32_e32 v12, 0
4527; GFX11-NEXT:    v_lshlrev_b32_e32 v0, s0, v0
4528; GFX11-NEXT:    s_lshl_b32 s0, 0xffff, s0
4529; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
4530; GFX11-NEXT:    s_not_b32 s0, s0
4531; GFX11-NEXT:    s_waitcnt vmcnt(0)
4532; GFX11-NEXT:    v_movrels_b32_e32 v1, v3
4533; GFX11-NEXT:    v_and_or_b32 v2, v1, s0, v0
4534; GFX11-NEXT:    v_mov_b32_e32 v0, 0
4535; GFX11-NEXT:    v_mov_b32_e32 v1, 0
4536; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
4537; GFX11-NEXT:    v_movreld_b32_e32 v3, v2
4538; GFX11-NEXT:    s_clause 0x1
4539; GFX11-NEXT:    global_store_b128 v[0:1], v[3:6], off
4540; GFX11-NEXT:    global_store_b128 v[11:12], v[7:10], off
4541; GFX11-NEXT:    s_endpgm
4542  %vec = load <16 x i16>, ptr addrspace(1) %ptr
4543  %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx
4544  store <16 x i16> %insert, ptr addrspace(1) null
4545  ret void
4546}
4547
; Dynamic-index insertelement into a <16 x i16> that is loaded through a
; non-inreg addrspace(1) pointer, with neither %val nor %idx marked inreg.
; The updated vector is stored to a null addrspace(1) pointer.
;
; Every check prefix in this function expects the compare/select lowering:
; the dword at (idx >> 1) is chosen with a v_cmp_eq / v_cndmask chain, the
; value is shifted by (idx & 1) * 16 and merged under an inverted shifted
; 0xffff mask, and a second v_cndmask chain writes the merged dword back.
; The prefixes differ in how the low 16 bits of the value are taken: the
; GFX9, GFX8 and GFX10 checks use an SDWA shift with src1_sel:WORD_0, while
; the GFX7 and GFX11 checks use an explicit and with 0xffff before the shift.
;
; The check lines are autogenerated (see the file header); regenerate them
; with utils/update_llc_test_checks.py rather than editing them by hand.
4548define amdgpu_ps void @insertelement_v_v16i16_v_v(ptr addrspace(1) %ptr, i16 %val, i32 %idx) {
4549; GFX9-LABEL: insertelement_v_v16i16_v_v:
4550; GFX9:       ; %bb.0:
4551; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
4552; GFX9-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:16
4553; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 1, v3
4554; GFX9-NEXT:    v_and_b32_e32 v3, 1, v3
4555; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff
4556; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 4, v3
4557; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
4558; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
4559; GFX9-NEXT:    v_lshlrev_b32_e32 v0, v3, v0
4560; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v1
4561; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v1
4562; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 4, v1
4563; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], 5, v1
4564; GFX9-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v1
4565; GFX9-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v1
4566; GFX9-NEXT:    v_not_b32_e32 v0, v0
4567; GFX9-NEXT:    v_mov_b32_e32 v12, 0
4568; GFX9-NEXT:    v_cmp_eq_u32_e64 s[12:13], 0, v1
4569; GFX9-NEXT:    v_mov_b32_e32 v13, 0
4570; GFX9-NEXT:    v_mov_b32_e32 v14, 16
4571; GFX9-NEXT:    v_mov_b32_e32 v15, 0
4572; GFX9-NEXT:    s_waitcnt vmcnt(1)
4573; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
4574; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[0:1]
4575; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s[2:3]
4576; GFX9-NEXT:    s_waitcnt vmcnt(0)
4577; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v8, s[4:5]
4578; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s[6:7]
4579; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v10, s[8:9]
4580; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s[10:11]
4581; GFX9-NEXT:    v_and_or_b32 v16, v3, v0, v2
4582; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, v16, s[12:13]
4583; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v16, vcc
4584; GFX9-NEXT:    v_cndmask_b32_e64 v2, v6, v16, s[0:1]
4585; GFX9-NEXT:    v_cndmask_b32_e64 v3, v7, v16, s[2:3]
4586; GFX9-NEXT:    v_cndmask_b32_e64 v4, v8, v16, s[4:5]
4587; GFX9-NEXT:    v_cndmask_b32_e64 v5, v9, v16, s[6:7]
4588; GFX9-NEXT:    v_cndmask_b32_e64 v6, v10, v16, s[8:9]
4589; GFX9-NEXT:    v_cndmask_b32_e64 v7, v11, v16, s[10:11]
4590; GFX9-NEXT:    global_store_dwordx4 v[12:13], v[0:3], off
4591; GFX9-NEXT:    global_store_dwordx4 v[14:15], v[4:7], off
4592; GFX9-NEXT:    s_endpgm
4593;
4594; GFX8-LABEL: insertelement_v_v16i16_v_v:
4595; GFX8:       ; %bb.0:
4596; GFX8-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
4597; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
4598; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
4599; GFX8-NEXT:    flat_load_dwordx4 v[8:11], v[0:1]
4600; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 1, v3
4601; GFX8-NEXT:    v_and_b32_e32 v3, 1, v3
4602; GFX8-NEXT:    v_mov_b32_e32 v0, 0xffff
4603; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 4, v3
4604; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
4605; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
4606; GFX8-NEXT:    v_lshlrev_b32_e32 v0, v3, v0
4607; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v1
4608; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v1
4609; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 4, v1
4610; GFX8-NEXT:    v_cmp_eq_u32_e64 s[6:7], 5, v1
4611; GFX8-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v1
4612; GFX8-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v1
4613; GFX8-NEXT:    v_not_b32_e32 v0, v0
4614; GFX8-NEXT:    v_mov_b32_e32 v12, 0
4615; GFX8-NEXT:    v_cmp_eq_u32_e64 s[12:13], 0, v1
4616; GFX8-NEXT:    v_mov_b32_e32 v13, 0
4617; GFX8-NEXT:    v_mov_b32_e32 v14, 16
4618; GFX8-NEXT:    v_mov_b32_e32 v15, 0
4619; GFX8-NEXT:    s_waitcnt vmcnt(1)
4620; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
4621; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[0:1]
4622; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s[2:3]
4623; GFX8-NEXT:    s_waitcnt vmcnt(0)
4624; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v8, s[4:5]
4625; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s[6:7]
4626; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v10, s[8:9]
4627; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s[10:11]
4628; GFX8-NEXT:    v_and_b32_e32 v0, v3, v0
4629; GFX8-NEXT:    v_or_b32_e32 v16, v0, v2
4630; GFX8-NEXT:    v_cndmask_b32_e64 v0, v4, v16, s[12:13]
4631; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v16, vcc
4632; GFX8-NEXT:    v_cndmask_b32_e64 v2, v6, v16, s[0:1]
4633; GFX8-NEXT:    v_cndmask_b32_e64 v3, v7, v16, s[2:3]
4634; GFX8-NEXT:    v_cndmask_b32_e64 v4, v8, v16, s[4:5]
4635; GFX8-NEXT:    v_cndmask_b32_e64 v5, v9, v16, s[6:7]
4636; GFX8-NEXT:    v_cndmask_b32_e64 v6, v10, v16, s[8:9]
4637; GFX8-NEXT:    v_cndmask_b32_e64 v7, v11, v16, s[10:11]
4638; GFX8-NEXT:    flat_store_dwordx4 v[12:13], v[0:3]
4639; GFX8-NEXT:    flat_store_dwordx4 v[14:15], v[4:7]
4640; GFX8-NEXT:    s_endpgm
4641;
4642; GFX7-LABEL: insertelement_v_v16i16_v_v:
4643; GFX7:       ; %bb.0:
4644; GFX7-NEXT:    s_mov_b32 s18, 0
4645; GFX7-NEXT:    s_mov_b32 s19, 0xf000
4646; GFX7-NEXT:    s_mov_b64 s[16:17], 0
4647; GFX7-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[16:19], 0 addr64
4648; GFX7-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[16:19], 0 addr64 offset:16
4649; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 1, v3
4650; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
4651; GFX7-NEXT:    v_and_b32_e32 v1, 1, v3
4652; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v0
4653; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v0
4654; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], 4, v0
4655; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], 5, v0
4656; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
4657; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
4658; GFX7-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v0
4659; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v1, v2
4660; GFX7-NEXT:    v_lshl_b32_e32 v1, 0xffff, v1
4661; GFX7-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v0
4662; GFX7-NEXT:    v_not_b32_e32 v1, v1
4663; GFX7-NEXT:    v_cmp_eq_u32_e64 s[12:13], 0, v0
4664; GFX7-NEXT:    s_mov_b64 s[16:17], 0
4665; GFX7-NEXT:    s_mov_b32 s18, -1
4666; GFX7-NEXT:    s_waitcnt vmcnt(1)
4667; GFX7-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
4668; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[0:1]
4669; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s[2:3]
4670; GFX7-NEXT:    s_waitcnt vmcnt(0)
4671; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v8, s[4:5]
4672; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s[6:7]
4673; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v10, s[8:9]
4674; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s[10:11]
4675; GFX7-NEXT:    v_and_b32_e32 v1, v3, v1
4676; GFX7-NEXT:    v_or_b32_e32 v12, v1, v2
4677; GFX7-NEXT:    v_cndmask_b32_e64 v0, v4, v12, s[12:13]
4678; GFX7-NEXT:    v_cndmask_b32_e32 v1, v5, v12, vcc
4679; GFX7-NEXT:    v_cndmask_b32_e64 v2, v6, v12, s[0:1]
4680; GFX7-NEXT:    v_cndmask_b32_e64 v3, v7, v12, s[2:3]
4681; GFX7-NEXT:    v_cndmask_b32_e64 v4, v8, v12, s[4:5]
4682; GFX7-NEXT:    v_cndmask_b32_e64 v5, v9, v12, s[6:7]
4683; GFX7-NEXT:    v_cndmask_b32_e64 v6, v10, v12, s[8:9]
4684; GFX7-NEXT:    v_cndmask_b32_e64 v7, v11, v12, s[10:11]
4685; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
4686; GFX7-NEXT:    s_mov_b64 s[16:17], 16
4687; GFX7-NEXT:    buffer_store_dwordx4 v[4:7], off, s[16:19], 0
4688; GFX7-NEXT:    s_endpgm
4689;
4690; GFX10-LABEL: insertelement_v_v16i16_v_v:
4691; GFX10:       ; %bb.0:
4692; GFX10-NEXT:    s_clause 0x1
4693; GFX10-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
4694; GFX10-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:16
4695; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 1, v3
4696; GFX10-NEXT:    v_and_b32_e32 v3, 1, v3
4697; GFX10-NEXT:    v_mov_b32_e32 v14, 16
4698; GFX10-NEXT:    v_mov_b32_e32 v15, 0
4699; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
4700; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 2, v0
4701; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, 3, v0
4702; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, 4, v0
4703; GFX10-NEXT:    v_cmp_eq_u32_e64 s3, 5, v0
4704; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 4, v3
4705; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 6, v0
4706; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 7, v0
4707; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, 0, v0
4708; GFX10-NEXT:    v_lshlrev_b32_e64 v12, v3, 0xffff
4709; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
4710; GFX10-NEXT:    v_not_b32_e32 v3, v12
4711; GFX10-NEXT:    v_mov_b32_e32 v12, 0
4712; GFX10-NEXT:    v_mov_b32_e32 v13, 0
4713; GFX10-NEXT:    s_waitcnt vmcnt(1)
4714; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v5, vcc_lo
4715; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s0
4716; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s1
4717; GFX10-NEXT:    s_waitcnt vmcnt(0)
4718; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v8, s2
4719; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s3
4720; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v10, s4
4721; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v11, s5
4722; GFX10-NEXT:    v_and_or_b32 v16, v1, v3, v2
4723; GFX10-NEXT:    v_cndmask_b32_e64 v0, v4, v16, s6
4724; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v16, vcc_lo
4725; GFX10-NEXT:    v_cndmask_b32_e64 v2, v6, v16, s0
4726; GFX10-NEXT:    v_cndmask_b32_e64 v3, v7, v16, s1
4727; GFX10-NEXT:    v_cndmask_b32_e64 v4, v8, v16, s2
4728; GFX10-NEXT:    v_cndmask_b32_e64 v5, v9, v16, s3
4729; GFX10-NEXT:    v_cndmask_b32_e64 v6, v10, v16, s4
4730; GFX10-NEXT:    v_cndmask_b32_e64 v7, v11, v16, s5
4731; GFX10-NEXT:    global_store_dwordx4 v[12:13], v[0:3], off
4732; GFX10-NEXT:    global_store_dwordx4 v[14:15], v[4:7], off
4733; GFX10-NEXT:    s_endpgm
4734;
4735; GFX11-LABEL: insertelement_v_v16i16_v_v:
4736; GFX11:       ; %bb.0:
4737; GFX11-NEXT:    s_clause 0x1
4738; GFX11-NEXT:    global_load_b128 v[4:7], v[0:1], off
4739; GFX11-NEXT:    global_load_b128 v[8:11], v[0:1], off offset:16
4740; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 1, v3
4741; GFX11-NEXT:    v_dual_mov_b32 v14, 16 :: v_dual_and_b32 v3, 1, v3
4742; GFX11-NEXT:    v_dual_mov_b32 v15, 0 :: v_dual_and_b32 v2, 0xffff, v2
4743; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
4744; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
4745; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 2, v0
4746; GFX11-NEXT:    v_cmp_eq_u32_e64 s1, 3, v0
4747; GFX11-NEXT:    v_cmp_eq_u32_e64 s2, 4, v0
4748; GFX11-NEXT:    v_cmp_eq_u32_e64 s3, 5, v0
4749; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 4, v3
4750; GFX11-NEXT:    v_cmp_eq_u32_e64 s4, 6, v0
4751; GFX11-NEXT:    v_cmp_eq_u32_e64 s5, 7, v0
4752; GFX11-NEXT:    v_cmp_eq_u32_e64 s6, 0, v0
4753; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
4754; GFX11-NEXT:    v_lshlrev_b32_e64 v12, v3, 0xffff
4755; GFX11-NEXT:    v_lshlrev_b32_e32 v2, v3, v2
4756; GFX11-NEXT:    v_not_b32_e32 v3, v12
4757; GFX11-NEXT:    v_mov_b32_e32 v12, 0
4758; GFX11-NEXT:    v_mov_b32_e32 v13, 0
4759; GFX11-NEXT:    s_waitcnt vmcnt(1)
4760; GFX11-NEXT:    v_cndmask_b32_e32 v1, v4, v5, vcc_lo
4761; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4762; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s0
4763; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s1
4764; GFX11-NEXT:    s_waitcnt vmcnt(0)
4765; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4766; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, v8, s2
4767; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s3
4768; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4769; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, v10, s4
4770; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, v11, s5
4771; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4772; GFX11-NEXT:    v_and_or_b32 v16, v1, v3, v2
4773; GFX11-NEXT:    v_cndmask_b32_e64 v0, v4, v16, s6
4774; GFX11-NEXT:    v_cndmask_b32_e32 v1, v5, v16, vcc_lo
4775; GFX11-NEXT:    v_cndmask_b32_e64 v2, v6, v16, s0
4776; GFX11-NEXT:    v_cndmask_b32_e64 v3, v7, v16, s1
4777; GFX11-NEXT:    v_cndmask_b32_e64 v4, v8, v16, s2
4778; GFX11-NEXT:    v_cndmask_b32_e64 v5, v9, v16, s3
4779; GFX11-NEXT:    v_cndmask_b32_e64 v6, v10, v16, s4
4780; GFX11-NEXT:    v_cndmask_b32_e64 v7, v11, v16, s5
4781; GFX11-NEXT:    s_clause 0x1
4782; GFX11-NEXT:    global_store_b128 v[12:13], v[0:3], off
4783; GFX11-NEXT:    global_store_b128 v[14:15], v[4:7], off
4784; GFX11-NEXT:    s_endpgm
4785  %vec = load <16 x i16>, ptr addrspace(1) %ptr
4786  %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx
4787  store <16 x i16> %insert, ptr addrspace(1) null
4788  ret void
4789}
4790