xref: /llvm-project/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll (revision 6206f5444fc0732e6495703c75a67f1f90f5b418)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
2; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s
3; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=tonga < %s | FileCheck -check-prefix=VI %s
4; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX900 %s
5; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefix=GFX940 %s
6
; Insert the constant bfloat 5.0 into element 0 of a <2 x bfloat> that is
; loaded through an addrspace(4) pointer, then store the vector to %out.
; Every target is expected to fetch the vector with s_load_dword and to do
; the insert with s_-prefixed (scalar) instructions only.
;
; Expected insert sequence per target (5.0 shows up as 0x40a0 in the checks):
;   SI, VI           s_and_b32 with 0xffff0000 to drop the old low half, then
;                    s_or_b32 with 0x40a0.
;   GFX900, GFX940   s_lshr_b32 by 16 to bring the old high half down, then
;                    s_pack_ll_b32_b16 combining 0x40a0 with it.
7define amdgpu_kernel void @s_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 {
8; SI-LABEL: s_insertelement_v2bf16_0:
9; SI:       ; %bb.0:
10; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
11; SI-NEXT:    s_waitcnt lgkmcnt(0)
12; SI-NEXT:    s_load_dword s4, s[2:3], 0x0
13; SI-NEXT:    s_mov_b32 s3, 0x100f000
14; SI-NEXT:    s_mov_b32 s2, -1
15; SI-NEXT:    s_waitcnt lgkmcnt(0)
16; SI-NEXT:    s_and_b32 s4, s4, 0xffff0000
17; SI-NEXT:    s_or_b32 s4, s4, 0x40a0
18; SI-NEXT:    v_mov_b32_e32 v0, s4
19; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
20; SI-NEXT:    s_endpgm
21;
22; VI-LABEL: s_insertelement_v2bf16_0:
23; VI:       ; %bb.0:
24; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
25; VI-NEXT:    s_waitcnt lgkmcnt(0)
26; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
27; VI-NEXT:    v_mov_b32_e32 v0, s0
28; VI-NEXT:    v_mov_b32_e32 v1, s1
29; VI-NEXT:    s_waitcnt lgkmcnt(0)
30; VI-NEXT:    s_and_b32 s0, s2, 0xffff0000
31; VI-NEXT:    s_or_b32 s0, s0, 0x40a0
32; VI-NEXT:    v_mov_b32_e32 v2, s0
33; VI-NEXT:    flat_store_dword v[0:1], v2
34; VI-NEXT:    s_endpgm
35;
36; GFX900-LABEL: s_insertelement_v2bf16_0:
37; GFX900:       ; %bb.0:
38; GFX900-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
39; GFX900-NEXT:    v_mov_b32_e32 v0, 0
40; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
41; GFX900-NEXT:    s_load_dword s2, s[2:3], 0x0
42; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
43; GFX900-NEXT:    s_lshr_b32 s2, s2, 16
44; GFX900-NEXT:    s_pack_ll_b32_b16 s2, 0x40a0, s2
45; GFX900-NEXT:    v_mov_b32_e32 v1, s2
46; GFX900-NEXT:    global_store_dword v0, v1, s[0:1]
47; GFX900-NEXT:    s_endpgm
48;
49; GFX940-LABEL: s_insertelement_v2bf16_0:
50; GFX940:       ; %bb.0:
51; GFX940-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
52; GFX940-NEXT:    v_mov_b32_e32 v0, 0
53; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
54; GFX940-NEXT:    s_load_dword s2, s[2:3], 0x0
55; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
56; GFX940-NEXT:    s_lshr_b32 s2, s2, 16
57; GFX940-NEXT:    s_pack_ll_b32_b16 s2, 0x40a0, s2
58; GFX940-NEXT:    v_mov_b32_e32 v1, s2
59; GFX940-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
60; GFX940-NEXT:    s_endpgm
61  %vec = load <2 x bfloat>, ptr addrspace(4) %vec.ptr
  ; Element 0 is replaced; element 1 of the loaded vector must survive.
62  %vecins = insertelement <2 x bfloat> %vec, bfloat 5.000000e+00, i32 0
63  store <2 x bfloat> %vecins, ptr addrspace(1) %out
64  ret void
65}
66
; Same as the element-0 scalar test, but the constant bfloat 5.0 is inserted
; into element 1 of the <2 x bfloat> loaded through the addrspace(4) pointer.
;
; Expected insert sequence per target:
;   SI, VI           s_and_b32 with 0xffff to keep only the old low half, then
;                    s_or_b32 with 0x40a00000 (0x40a0 placed in the high half).
;   GFX900, GFX940   a single s_pack_ll_b32_b16 of the loaded value and 0x40a0;
;                    unlike the element-0 case no preceding shift is expected.
67define amdgpu_kernel void @s_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 {
68; SI-LABEL: s_insertelement_v2bf16_1:
69; SI:       ; %bb.0:
70; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
71; SI-NEXT:    s_waitcnt lgkmcnt(0)
72; SI-NEXT:    s_load_dword s4, s[2:3], 0x0
73; SI-NEXT:    s_mov_b32 s3, 0x100f000
74; SI-NEXT:    s_mov_b32 s2, -1
75; SI-NEXT:    s_waitcnt lgkmcnt(0)
76; SI-NEXT:    s_and_b32 s4, s4, 0xffff
77; SI-NEXT:    s_or_b32 s4, s4, 0x40a00000
78; SI-NEXT:    v_mov_b32_e32 v0, s4
79; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
80; SI-NEXT:    s_endpgm
81;
82; VI-LABEL: s_insertelement_v2bf16_1:
83; VI:       ; %bb.0:
84; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
85; VI-NEXT:    s_waitcnt lgkmcnt(0)
86; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
87; VI-NEXT:    v_mov_b32_e32 v0, s0
88; VI-NEXT:    v_mov_b32_e32 v1, s1
89; VI-NEXT:    s_waitcnt lgkmcnt(0)
90; VI-NEXT:    s_and_b32 s0, s2, 0xffff
91; VI-NEXT:    s_or_b32 s0, s0, 0x40a00000
92; VI-NEXT:    v_mov_b32_e32 v2, s0
93; VI-NEXT:    flat_store_dword v[0:1], v2
94; VI-NEXT:    s_endpgm
95;
96; GFX900-LABEL: s_insertelement_v2bf16_1:
97; GFX900:       ; %bb.0:
98; GFX900-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
99; GFX900-NEXT:    v_mov_b32_e32 v0, 0
100; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
101; GFX900-NEXT:    s_load_dword s2, s[2:3], 0x0
102; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
103; GFX900-NEXT:    s_pack_ll_b32_b16 s2, s2, 0x40a0
104; GFX900-NEXT:    v_mov_b32_e32 v1, s2
105; GFX900-NEXT:    global_store_dword v0, v1, s[0:1]
106; GFX900-NEXT:    s_endpgm
107;
108; GFX940-LABEL: s_insertelement_v2bf16_1:
109; GFX940:       ; %bb.0:
110; GFX940-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
111; GFX940-NEXT:    v_mov_b32_e32 v0, 0
112; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
113; GFX940-NEXT:    s_load_dword s2, s[2:3], 0x0
114; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
115; GFX940-NEXT:    s_pack_ll_b32_b16 s2, s2, 0x40a0
116; GFX940-NEXT:    v_mov_b32_e32 v1, s2
117; GFX940-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
118; GFX940-NEXT:    s_endpgm
119  %vec = load <2 x bfloat>, ptr addrspace(4) %vec.ptr
  ; Element 1 is replaced; element 0 of the loaded vector must survive.
120  %vecins = insertelement <2 x bfloat> %vec, bfloat 5.000000e+00, i32 1
121  store <2 x bfloat> %vecins, ptr addrspace(1) %out
122  ret void
123}
124
; Per-workitem variant: each workitem indexes %in and %out by
; llvm.amdgcn.workitem.id.x, loads a <2 x bfloat> from addrspace(1), inserts
; the constant bfloat 5.0 (0x40a0 in the checks) into element 0 and stores it.
;
; Expected insert sequence per target (all on vector registers):
;   SI, VI           v_and_b32 with 0xffff0000, then v_or_b32 with 0x40a0.
;   GFX900, GFX940   one v_bfi_b32 using the mask 0xffff (held in s2) with
;                    0x40a0 materialized in v2.
; Only the GFX940 checks expect the workitem id to be masked with 0x3ff
; before it is scaled by 4.
125define amdgpu_kernel void @v_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
126; SI-LABEL: v_insertelement_v2bf16_0:
127; SI:       ; %bb.0:
128; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
129; SI-NEXT:    s_mov_b32 s7, 0x100f000
130; SI-NEXT:    s_mov_b32 s6, 0
131; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
132; SI-NEXT:    v_mov_b32_e32 v1, 0
133; SI-NEXT:    s_waitcnt lgkmcnt(0)
134; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
135; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
136; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
137; SI-NEXT:    s_waitcnt vmcnt(0)
138; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
139; SI-NEXT:    v_or_b32_e32 v2, 0x40a0, v2
140; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
141; SI-NEXT:    s_endpgm
142;
143; VI-LABEL: v_insertelement_v2bf16_0:
144; VI:       ; %bb.0:
145; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
146; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
147; VI-NEXT:    s_waitcnt lgkmcnt(0)
148; VI-NEXT:    v_mov_b32_e32 v1, s3
149; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
150; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
151; VI-NEXT:    flat_load_dword v3, v[0:1]
152; VI-NEXT:    v_mov_b32_e32 v1, s1
153; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
154; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
155; VI-NEXT:    s_waitcnt vmcnt(0)
156; VI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
157; VI-NEXT:    v_or_b32_e32 v2, 0x40a0, v2
158; VI-NEXT:    flat_store_dword v[0:1], v2
159; VI-NEXT:    s_endpgm
160;
161; GFX900-LABEL: v_insertelement_v2bf16_0:
162; GFX900:       ; %bb.0:
163; GFX900-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
164; GFX900-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
165; GFX900-NEXT:    v_mov_b32_e32 v2, 0x40a0
166; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
167; GFX900-NEXT:    global_load_dword v1, v0, s[2:3]
168; GFX900-NEXT:    s_mov_b32 s2, 0xffff
169; GFX900-NEXT:    s_waitcnt vmcnt(0)
170; GFX900-NEXT:    v_bfi_b32 v1, s2, v2, v1
171; GFX900-NEXT:    global_store_dword v0, v1, s[0:1]
172; GFX900-NEXT:    s_endpgm
173;
174; GFX940-LABEL: v_insertelement_v2bf16_0:
175; GFX940:       ; %bb.0:
176; GFX940-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
177; GFX940-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
178; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
179; GFX940-NEXT:    v_mov_b32_e32 v2, 0x40a0
180; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
181; GFX940-NEXT:    global_load_dword v1, v0, s[2:3]
182; GFX940-NEXT:    s_mov_b32 s2, 0xffff
183; GFX940-NEXT:    s_waitcnt vmcnt(0)
184; GFX940-NEXT:    v_bfi_b32 v1, s2, v2, v1
185; GFX940-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
186; GFX940-NEXT:    s_endpgm
  ; The workitem id selects which <2 x bfloat> of %in / %out this lane uses.
187  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
188  %tid.ext = sext i32 %tid to i64
189  %in.gep = getelementptr inbounds <2 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext
190  %out.gep = getelementptr inbounds <2 x bfloat>, ptr addrspace(1) %out, i64 %tid.ext
191  %vec = load <2 x bfloat>, ptr addrspace(1) %in.gep
192  %vecins = insertelement <2 x bfloat> %vec, bfloat 5.000000e+00, i32 0
193  store <2 x bfloat> %vecins, ptr addrspace(1) %out.gep
194  ret void
195}
196
; Per-workitem insert into element 0, using the raw bfloat bit pattern 0xR0035
; (decimal 53) as the inserted value instead of 5.0.
;
; The point of interest is that the checks expect 53 to appear directly as an
; instruction operand rather than being moved into a register first:
;   SI, VI           v_and_b32 with 0xffff0000, then v_or_b32 with operand 53.
;   GFX900, GFX940   v_bfi_b32 with mask 0xffff in s2 and operand 53; there is
;                    no v_mov_b32 of the constant as in the 5.0 variant.
197define amdgpu_kernel void @v_insertelement_v2bf16_0_inlineimm(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
198; SI-LABEL: v_insertelement_v2bf16_0_inlineimm:
199; SI:       ; %bb.0:
200; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
201; SI-NEXT:    s_mov_b32 s7, 0x100f000
202; SI-NEXT:    s_mov_b32 s6, 0
203; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
204; SI-NEXT:    v_mov_b32_e32 v1, 0
205; SI-NEXT:    s_waitcnt lgkmcnt(0)
206; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
207; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
208; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
209; SI-NEXT:    s_waitcnt vmcnt(0)
210; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
211; SI-NEXT:    v_or_b32_e32 v2, 53, v2
212; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
213; SI-NEXT:    s_endpgm
214;
215; VI-LABEL: v_insertelement_v2bf16_0_inlineimm:
216; VI:       ; %bb.0:
217; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
218; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
219; VI-NEXT:    s_waitcnt lgkmcnt(0)
220; VI-NEXT:    v_mov_b32_e32 v1, s3
221; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
222; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
223; VI-NEXT:    flat_load_dword v3, v[0:1]
224; VI-NEXT:    v_mov_b32_e32 v1, s1
225; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
226; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
227; VI-NEXT:    s_waitcnt vmcnt(0)
228; VI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
229; VI-NEXT:    v_or_b32_e32 v2, 53, v2
230; VI-NEXT:    flat_store_dword v[0:1], v2
231; VI-NEXT:    s_endpgm
232;
233; GFX900-LABEL: v_insertelement_v2bf16_0_inlineimm:
234; GFX900:       ; %bb.0:
235; GFX900-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
236; GFX900-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
237; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
238; GFX900-NEXT:    global_load_dword v1, v0, s[2:3]
239; GFX900-NEXT:    s_mov_b32 s2, 0xffff
240; GFX900-NEXT:    s_waitcnt vmcnt(0)
241; GFX900-NEXT:    v_bfi_b32 v1, s2, 53, v1
242; GFX900-NEXT:    global_store_dword v0, v1, s[0:1]
243; GFX900-NEXT:    s_endpgm
244;
245; GFX940-LABEL: v_insertelement_v2bf16_0_inlineimm:
246; GFX940:       ; %bb.0:
247; GFX940-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
248; GFX940-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
249; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
250; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
251; GFX940-NEXT:    global_load_dword v1, v0, s[2:3]
252; GFX940-NEXT:    s_mov_b32 s2, 0xffff
253; GFX940-NEXT:    s_waitcnt vmcnt(0)
254; GFX940-NEXT:    v_bfi_b32 v1, s2, 53, v1
255; GFX940-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
256; GFX940-NEXT:    s_endpgm
257  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
258  %tid.ext = sext i32 %tid to i64
259  %in.gep = getelementptr inbounds <2 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext
260  %out.gep = getelementptr inbounds <2 x bfloat>, ptr addrspace(1) %out, i64 %tid.ext
261  %vec = load <2 x bfloat>, ptr addrspace(1) %in.gep
  ; 0xR0035 is a raw 16-bit bfloat encoding; the checks print it as decimal 53.
262  %vecins = insertelement <2 x bfloat> %vec, bfloat 0xR0035, i32 0
263  store <2 x bfloat> %vecins, ptr addrspace(1) %out.gep
264  ret void
265}
266
; Per-workitem insert of the constant bfloat 5.0 into element 1 of a
; <2 x bfloat> loaded from addrspace(1). Each of the three generations is
; expected to use a different instruction for the merge:
;   SI               v_and_b32 with 0xffff, then v_or_b32 with 0x40a00000.
;   VI               one v_or_b32_sdwa that reads only WORD_0 of the loaded
;                    value (src0_sel) and ORs it with 0x40a00000 held in v2.
;   GFX900, GFX940   v_perm_b32 of 0x40a0 (in s2) and the loaded value, with
;                    the selector 0x5040100 in v2.
267define amdgpu_kernel void @v_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
268; SI-LABEL: v_insertelement_v2bf16_1:
269; SI:       ; %bb.0:
270; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
271; SI-NEXT:    s_mov_b32 s7, 0x100f000
272; SI-NEXT:    s_mov_b32 s6, 0
273; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
274; SI-NEXT:    v_mov_b32_e32 v1, 0
275; SI-NEXT:    s_waitcnt lgkmcnt(0)
276; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
277; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
278; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
279; SI-NEXT:    s_waitcnt vmcnt(0)
280; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
281; SI-NEXT:    v_or_b32_e32 v2, 0x40a00000, v2
282; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
283; SI-NEXT:    s_endpgm
284;
285; VI-LABEL: v_insertelement_v2bf16_1:
286; VI:       ; %bb.0:
287; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
288; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
289; VI-NEXT:    s_waitcnt lgkmcnt(0)
290; VI-NEXT:    v_mov_b32_e32 v1, s3
291; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
292; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
293; VI-NEXT:    flat_load_dword v3, v[0:1]
294; VI-NEXT:    v_mov_b32_e32 v1, s1
295; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
296; VI-NEXT:    v_mov_b32_e32 v2, 0x40a00000
297; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
298; VI-NEXT:    s_waitcnt vmcnt(0)
299; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
300; VI-NEXT:    flat_store_dword v[0:1], v2
301; VI-NEXT:    s_endpgm
302;
303; GFX900-LABEL: v_insertelement_v2bf16_1:
304; GFX900:       ; %bb.0:
305; GFX900-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
306; GFX900-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
307; GFX900-NEXT:    v_mov_b32_e32 v2, 0x5040100
308; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
309; GFX900-NEXT:    global_load_dword v1, v0, s[2:3]
310; GFX900-NEXT:    s_movk_i32 s2, 0x40a0
311; GFX900-NEXT:    s_waitcnt vmcnt(0)
312; GFX900-NEXT:    v_perm_b32 v1, s2, v1, v2
313; GFX900-NEXT:    global_store_dword v0, v1, s[0:1]
314; GFX900-NEXT:    s_endpgm
315;
316; GFX940-LABEL: v_insertelement_v2bf16_1:
317; GFX940:       ; %bb.0:
318; GFX940-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
319; GFX940-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
320; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
321; GFX940-NEXT:    v_mov_b32_e32 v2, 0x5040100
322; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
323; GFX940-NEXT:    global_load_dword v1, v0, s[2:3]
324; GFX940-NEXT:    s_movk_i32 s2, 0x40a0
325; GFX940-NEXT:    s_waitcnt vmcnt(0)
326; GFX940-NEXT:    v_perm_b32 v1, s2, v1, v2
327; GFX940-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
328; GFX940-NEXT:    s_endpgm
329  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
330  %tid.ext = sext i32 %tid to i64
331  %in.gep = getelementptr inbounds <2 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext
332  %out.gep = getelementptr inbounds <2 x bfloat>, ptr addrspace(1) %out, i64 %tid.ext
333  %vec = load <2 x bfloat>, ptr addrspace(1) %in.gep
  ; Element 1 is replaced; element 0 of the loaded vector must survive.
334  %vecins = insertelement <2 x bfloat> %vec, bfloat 5.000000e+00, i32 1
335  store <2 x bfloat> %vecins, ptr addrspace(1) %out.gep
336  ret void
337}
338
; Per-workitem insert into element 1, using the raw bfloat bit pattern 0xR0023
; (decimal 35) as the inserted value.
;
; Expected handling of the constant per target:
;   SI               v_and_b32 with 0xffff, then v_or_b32 with the literal
;                    0x230000 (35 placed in the high half).
;   VI               0x230000 is first moved into v2, then merged with a
;                    v_or_b32_sdwa that reads only WORD_0 of the loaded value.
;   GFX900, GFX940   v_perm_b32 takes 35 directly as its first source operand
;                    (no s_movk_i32 as in the 5.0 variant); selector 0x5040100.
339define amdgpu_kernel void @v_insertelement_v2bf16_1_inlineimm(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
340; SI-LABEL: v_insertelement_v2bf16_1_inlineimm:
341; SI:       ; %bb.0:
342; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
343; SI-NEXT:    s_mov_b32 s7, 0x100f000
344; SI-NEXT:    s_mov_b32 s6, 0
345; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
346; SI-NEXT:    v_mov_b32_e32 v1, 0
347; SI-NEXT:    s_waitcnt lgkmcnt(0)
348; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
349; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
350; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
351; SI-NEXT:    s_waitcnt vmcnt(0)
352; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
353; SI-NEXT:    v_or_b32_e32 v2, 0x230000, v2
354; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
355; SI-NEXT:    s_endpgm
356;
357; VI-LABEL: v_insertelement_v2bf16_1_inlineimm:
358; VI:       ; %bb.0:
359; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
360; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
361; VI-NEXT:    s_waitcnt lgkmcnt(0)
362; VI-NEXT:    v_mov_b32_e32 v1, s3
363; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
364; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
365; VI-NEXT:    flat_load_dword v3, v[0:1]
366; VI-NEXT:    v_mov_b32_e32 v1, s1
367; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
368; VI-NEXT:    v_mov_b32_e32 v2, 0x230000
369; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
370; VI-NEXT:    s_waitcnt vmcnt(0)
371; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
372; VI-NEXT:    flat_store_dword v[0:1], v2
373; VI-NEXT:    s_endpgm
374;
375; GFX900-LABEL: v_insertelement_v2bf16_1_inlineimm:
376; GFX900:       ; %bb.0:
377; GFX900-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
378; GFX900-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
379; GFX900-NEXT:    v_mov_b32_e32 v2, 0x5040100
380; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
381; GFX900-NEXT:    global_load_dword v1, v0, s[2:3]
382; GFX900-NEXT:    s_waitcnt vmcnt(0)
383; GFX900-NEXT:    v_perm_b32 v1, 35, v1, v2
384; GFX900-NEXT:    global_store_dword v0, v1, s[0:1]
385; GFX900-NEXT:    s_endpgm
386;
387; GFX940-LABEL: v_insertelement_v2bf16_1_inlineimm:
388; GFX940:       ; %bb.0:
389; GFX940-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
390; GFX940-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
391; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
392; GFX940-NEXT:    v_mov_b32_e32 v2, 0x5040100
393; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
394; GFX940-NEXT:    global_load_dword v1, v0, s[2:3]
395; GFX940-NEXT:    s_waitcnt vmcnt(0)
396; GFX940-NEXT:    v_perm_b32 v1, 35, v1, v2
397; GFX940-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
398; GFX940-NEXT:    s_endpgm
399  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
400  %tid.ext = sext i32 %tid to i64
401  %in.gep = getelementptr inbounds <2 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext
402  %out.gep = getelementptr inbounds <2 x bfloat>, ptr addrspace(1) %out, i64 %tid.ext
403  %vec = load <2 x bfloat>, ptr addrspace(1) %in.gep
  ; 0xR0023 is a raw 16-bit bfloat encoding; the checks print it as decimal 35.
404  %vecins = insertelement <2 x bfloat> %vec, bfloat 0xR0023, i32 1
405  store <2 x bfloat> %vecins, ptr addrspace(1) %out.gep
406  ret void
407}
408
; Insert with a non-constant index: each workitem loads its own i32 index
; from %idx.ptr and its own <2 x bfloat> from %in, inserts the raw bfloat
; 0xR1234 at that index and stores the result to %out.
;
; All four targets are expected to use the same mask-and-select scheme:
;   1. shift the loaded index left by 4 (v_lshlrev_b32 ..., 4, ...),
;   2. shift the constant 0xffff left by that amount to build a lane mask
;      (v_lshl_b32 on SI, v_lshlrev_b32_e64 elsewhere),
;   3. v_bfi_b32 with that mask, the splatted constant 0x12341234 and the
;      loaded vector.
; The two vector loads are in flight together, so the checks wait with
; vmcnt(1) before using the index and vmcnt(0) before using the vector.
409define amdgpu_kernel void @v_insertelement_v2bf16_dynamic_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(1) %idx.ptr) #0 {
410; SI-LABEL: v_insertelement_v2bf16_dynamic_vgpr:
411; SI:       ; %bb.0:
412; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
413; SI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x4
414; SI-NEXT:    s_mov_b32 s11, 0x100f000
415; SI-NEXT:    s_mov_b32 s10, 0
416; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
417; SI-NEXT:    v_mov_b32_e32 v1, 0
418; SI-NEXT:    s_mov_b64 s[6:7], s[10:11]
419; SI-NEXT:    s_waitcnt lgkmcnt(0)
420; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
421; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
422; SI-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
423; SI-NEXT:    s_mov_b32 s4, 0x12341234
424; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
425; SI-NEXT:    s_waitcnt vmcnt(1)
426; SI-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
427; SI-NEXT:    v_lshl_b32_e32 v2, 0xffff, v2
428; SI-NEXT:    s_waitcnt vmcnt(0)
429; SI-NEXT:    v_bfi_b32 v2, v2, s4, v3
430; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
431; SI-NEXT:    s_endpgm
432;
433; VI-LABEL: v_insertelement_v2bf16_dynamic_vgpr:
434; VI:       ; %bb.0:
435; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
436; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
437; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
438; VI-NEXT:    s_waitcnt lgkmcnt(0)
439; VI-NEXT:    v_mov_b32_e32 v3, s3
440; VI-NEXT:    v_mov_b32_e32 v1, s5
441; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
442; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
443; VI-NEXT:    flat_load_dword v4, v[0:1]
444; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
445; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
446; VI-NEXT:    flat_load_dword v3, v[0:1]
447; VI-NEXT:    s_mov_b32 s2, 0xffff
448; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
449; VI-NEXT:    v_mov_b32_e32 v1, s1
450; VI-NEXT:    s_mov_b32 s0, 0x12341234
451; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
452; VI-NEXT:    s_waitcnt vmcnt(1)
453; VI-NEXT:    v_lshlrev_b32_e32 v2, 4, v4
454; VI-NEXT:    v_lshlrev_b32_e64 v2, v2, s2
455; VI-NEXT:    s_waitcnt vmcnt(0)
456; VI-NEXT:    v_bfi_b32 v2, v2, s0, v3
457; VI-NEXT:    flat_store_dword v[0:1], v2
458; VI-NEXT:    s_endpgm
459;
460; GFX900-LABEL: v_insertelement_v2bf16_dynamic_vgpr:
461; GFX900:       ; %bb.0:
462; GFX900-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
463; GFX900-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
464; GFX900-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
465; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
466; GFX900-NEXT:    global_load_dword v1, v0, s[4:5]
467; GFX900-NEXT:    global_load_dword v2, v0, s[2:3]
468; GFX900-NEXT:    s_mov_b32 s2, 0xffff
469; GFX900-NEXT:    s_waitcnt vmcnt(1)
470; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
471; GFX900-NEXT:    v_lshlrev_b32_e64 v1, v1, s2
472; GFX900-NEXT:    s_mov_b32 s2, 0x12341234
473; GFX900-NEXT:    s_waitcnt vmcnt(0)
474; GFX900-NEXT:    v_bfi_b32 v1, v1, s2, v2
475; GFX900-NEXT:    global_store_dword v0, v1, s[0:1]
476; GFX900-NEXT:    s_endpgm
477;
478; GFX940-LABEL: v_insertelement_v2bf16_dynamic_vgpr:
479; GFX940:       ; %bb.0:
480; GFX940-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
481; GFX940-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
482; GFX940-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
483; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
484; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
485; GFX940-NEXT:    global_load_dword v1, v0, s[6:7]
486; GFX940-NEXT:    global_load_dword v2, v0, s[2:3]
487; GFX940-NEXT:    s_mov_b32 s2, 0xffff
488; GFX940-NEXT:    s_waitcnt vmcnt(1)
489; GFX940-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
490; GFX940-NEXT:    v_lshlrev_b32_e64 v1, v1, s2
491; GFX940-NEXT:    s_mov_b32 s2, 0x12341234
492; GFX940-NEXT:    s_waitcnt vmcnt(0)
493; GFX940-NEXT:    v_bfi_b32 v1, v1, s2, v2
494; GFX940-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
495; GFX940-NEXT:    s_endpgm
496  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
497  %tid.ext = sext i32 %tid to i64
498  %in.gep = getelementptr inbounds <2 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext
499  %idx.gep = getelementptr inbounds i32, ptr addrspace(1) %idx.ptr, i64 %tid.ext
500  %out.gep = getelementptr inbounds <2 x bfloat>, ptr addrspace(1) %out, i64 %tid.ext
  ; The index is loaded before the vector; both loads are per-workitem.
501  %idx = load i32, ptr addrspace(1) %idx.gep
502  %vec = load <2 x bfloat>, ptr addrspace(1) %in.gep
503  %vecins = insertelement <2 x bfloat> %vec, bfloat 0xR1234, i32 %idx
504  store <2 x bfloat> %vecins, ptr addrspace(1) %out.gep
505  ret void
506}
507
; <4 x bfloat> variant, element 0. The inserted value is not a constant: it is
; the i32 kernel argument %val truncated to i16 and bitcast to bfloat.
; An unnamed [8 x i32] argument sits between %in and %val; the checks load
; %val at offset 0xc on SI and 0x30 on the other targets.
;
; The vector is loaded and stored as two dwords and only the first dword is
; rewritten (v2 on SI, v0 elsewhere); the second dword is stored unchanged.
;   SI, GFX900, GFX940   v_bfi_b32 with mask 0xffff, %val copied into a VGPR.
;   VI                   v_perm_b32 of %val and the loaded dword with the
;                        selector 0x3020504.
508define amdgpu_kernel void @v_insertelement_v4bf16_0(ptr addrspace(1) %out, ptr addrspace(1) %in, [8 x i32], i32 %val) #0 {
509; SI-LABEL: v_insertelement_v4bf16_0:
510; SI:       ; %bb.0:
511; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
512; SI-NEXT:    s_mov_b32 s7, 0x100f000
513; SI-NEXT:    s_mov_b32 s6, 0
514; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
515; SI-NEXT:    v_mov_b32_e32 v1, 0
516; SI-NEXT:    s_waitcnt lgkmcnt(0)
517; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
518; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
519; SI-NEXT:    s_load_dword s8, s[8:9], 0xc
520; SI-NEXT:    s_mov_b32 s4, 0xffff
521; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
522; SI-NEXT:    s_waitcnt lgkmcnt(0)
523; SI-NEXT:    v_mov_b32_e32 v4, s8
524; SI-NEXT:    s_waitcnt vmcnt(0)
525; SI-NEXT:    v_bfi_b32 v2, s4, v4, v2
526; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
527; SI-NEXT:    s_endpgm
528;
529; VI-LABEL: v_insertelement_v4bf16_0:
530; VI:       ; %bb.0:
531; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
532; VI-NEXT:    s_load_dword s4, s[8:9], 0x30
533; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
534; VI-NEXT:    v_mov_b32_e32 v4, 0x3020504
535; VI-NEXT:    s_waitcnt lgkmcnt(0)
536; VI-NEXT:    v_mov_b32_e32 v1, s3
537; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
538; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
539; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
540; VI-NEXT:    v_mov_b32_e32 v3, s1
541; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
542; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
543; VI-NEXT:    s_waitcnt vmcnt(0)
544; VI-NEXT:    v_perm_b32 v0, s4, v0, v4
545; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
546; VI-NEXT:    s_endpgm
547;
548; GFX900-LABEL: v_insertelement_v4bf16_0:
549; GFX900:       ; %bb.0:
550; GFX900-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
551; GFX900-NEXT:    s_load_dword s4, s[8:9], 0x30
552; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
553; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
554; GFX900-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
555; GFX900-NEXT:    s_mov_b32 s2, 0xffff
556; GFX900-NEXT:    v_mov_b32_e32 v3, s4
557; GFX900-NEXT:    s_waitcnt vmcnt(0)
558; GFX900-NEXT:    v_bfi_b32 v0, s2, v3, v0
559; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
560; GFX900-NEXT:    s_endpgm
561;
562; GFX940-LABEL: v_insertelement_v4bf16_0:
563; GFX940:       ; %bb.0:
564; GFX940-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
565; GFX940-NEXT:    s_load_dword s6, s[4:5], 0x30
566; GFX940-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
567; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
568; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
569; GFX940-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
570; GFX940-NEXT:    s_mov_b32 s2, 0xffff
571; GFX940-NEXT:    v_mov_b32_e32 v3, s6
572; GFX940-NEXT:    s_waitcnt vmcnt(0)
573; GFX940-NEXT:    v_bfi_b32 v0, s2, v3, v0
574; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
575; GFX940-NEXT:    s_endpgm
576  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
577  %tid.ext = sext i32 %tid to i64
578  %in.gep = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext
579  %out.gep = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %out, i64 %tid.ext
580  %vec = load <4 x bfloat>, ptr addrspace(1) %in.gep
  ; Reinterpret the low 16 bits of %val as a bfloat (bit-preserving).
581  %val.trunc = trunc i32 %val to i16
582  %val.cvt = bitcast i16 %val.trunc to bfloat
583  %vecins = insertelement <4 x bfloat> %vec, bfloat %val.cvt, i32 0
584  store <4 x bfloat> %vecins, ptr addrspace(1) %out.gep
585  ret void
586}
587
; <4 x bfloat> variant, element 1. The inserted value is the i32 kernel
; argument %val truncated to i16 and bitcast to bfloat. Here %val follows %in
; directly (no padding argument); the checks load it at offset 0x4 on SI and
; 0x10 on the other targets.
;
; Only the first of the two loaded dwords is rewritten (v2 on SI, v0
; elsewhere); the second dword is stored unchanged.
;   SI               s_lshl_b32 of %val by 16, v_and_b32 of the loaded dword
;                    with 0xffff, then v_or_b32 of the two.
;   VI               v_perm_b32 with the selector 0x1000504; note the operand
;                    order (loaded dword first, %val second).
;   GFX900, GFX940   v_perm_b32 with the selector 0x5040100 (%val first,
;                    loaded dword second).
588define amdgpu_kernel void @v_insertelement_v4bf16_1(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) #0 {
589; SI-LABEL: v_insertelement_v4bf16_1:
590; SI:       ; %bb.0:
591; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
592; SI-NEXT:    s_mov_b32 s7, 0x100f000
593; SI-NEXT:    s_mov_b32 s6, 0
594; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
595; SI-NEXT:    v_mov_b32_e32 v1, 0
596; SI-NEXT:    s_waitcnt lgkmcnt(0)
597; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
598; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
599; SI-NEXT:    s_load_dword s8, s[8:9], 0x4
600; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
601; SI-NEXT:    s_waitcnt lgkmcnt(0)
602; SI-NEXT:    s_lshl_b32 s4, s8, 16
603; SI-NEXT:    s_waitcnt vmcnt(0)
604; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
605; SI-NEXT:    v_or_b32_e32 v2, s4, v2
606; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
607; SI-NEXT:    s_endpgm
608;
609; VI-LABEL: v_insertelement_v4bf16_1:
610; VI:       ; %bb.0:
611; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
612; VI-NEXT:    s_load_dword s4, s[8:9], 0x10
613; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
614; VI-NEXT:    v_mov_b32_e32 v4, 0x1000504
615; VI-NEXT:    s_waitcnt lgkmcnt(0)
616; VI-NEXT:    v_mov_b32_e32 v1, s3
617; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
618; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
619; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
620; VI-NEXT:    v_mov_b32_e32 v3, s1
621; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
622; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
623; VI-NEXT:    s_waitcnt vmcnt(0)
624; VI-NEXT:    v_perm_b32 v0, v0, s4, v4
625; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
626; VI-NEXT:    s_endpgm
627;
628; GFX900-LABEL: v_insertelement_v4bf16_1:
629; GFX900:       ; %bb.0:
630; GFX900-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
631; GFX900-NEXT:    s_load_dword s4, s[8:9], 0x10
632; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
633; GFX900-NEXT:    v_mov_b32_e32 v3, 0x5040100
634; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
635; GFX900-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
636; GFX900-NEXT:    s_waitcnt vmcnt(0)
637; GFX900-NEXT:    v_perm_b32 v0, s4, v0, v3
638; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
639; GFX900-NEXT:    s_endpgm
640;
641; GFX940-LABEL: v_insertelement_v4bf16_1:
642; GFX940:       ; %bb.0:
643; GFX940-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
644; GFX940-NEXT:    s_load_dword s6, s[4:5], 0x10
645; GFX940-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
646; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
647; GFX940-NEXT:    v_mov_b32_e32 v3, 0x5040100
648; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
649; GFX940-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
650; GFX940-NEXT:    s_waitcnt vmcnt(0)
651; GFX940-NEXT:    v_perm_b32 v0, s6, v0, v3
652; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
653; GFX940-NEXT:    s_endpgm
654  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
655  %tid.ext = sext i32 %tid to i64
656  %in.gep = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext
657  %out.gep = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %out, i64 %tid.ext
658  %vec = load <4 x bfloat>, ptr addrspace(1) %in.gep
  ; Reinterpret the low 16 bits of %val as a bfloat (bit-preserving).
659  %val.trunc = trunc i32 %val to i16
660  %val.cvt = bitcast i16 %val.trunc to bfloat
661  %vecins = insertelement <4 x bfloat> %vec, bfloat %val.cvt, i32 1
662  store <4 x bfloat> %vecins, ptr addrspace(1) %out.gep
663  ret void
664}
665
; <4 x bfloat> variant, element 2. The inserted value is the i32 kernel
; argument %val truncated to i16 and bitcast to bfloat; as in the element-0
; test an unnamed [8 x i32] argument precedes %val, and the checks load %val
; at offset 0xc on SI and 0x30 on the other targets.
;
; The instruction patterns match the element-0 test, but they are applied to
; the second loaded dword (v3 on SI, v1 elsewhere); the first dword is stored
; unchanged.
;   SI, GFX900, GFX940   v_bfi_b32 with mask 0xffff, %val copied into a VGPR.
;   VI                   v_perm_b32 with the selector 0x3020504.
666define amdgpu_kernel void @v_insertelement_v4bf16_2(ptr addrspace(1) %out, ptr addrspace(1) %in, [8 x i32], i32 %val) #0 {
667; SI-LABEL: v_insertelement_v4bf16_2:
668; SI:       ; %bb.0:
669; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
670; SI-NEXT:    s_mov_b32 s7, 0x100f000
671; SI-NEXT:    s_mov_b32 s6, 0
672; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
673; SI-NEXT:    v_mov_b32_e32 v1, 0
674; SI-NEXT:    s_waitcnt lgkmcnt(0)
675; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
676; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
677; SI-NEXT:    s_load_dword s8, s[8:9], 0xc
678; SI-NEXT:    s_mov_b32 s4, 0xffff
679; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
680; SI-NEXT:    s_waitcnt lgkmcnt(0)
681; SI-NEXT:    v_mov_b32_e32 v4, s8
682; SI-NEXT:    s_waitcnt vmcnt(0)
683; SI-NEXT:    v_bfi_b32 v3, s4, v4, v3
684; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
685; SI-NEXT:    s_endpgm
686;
687; VI-LABEL: v_insertelement_v4bf16_2:
688; VI:       ; %bb.0:
689; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
690; VI-NEXT:    s_load_dword s4, s[8:9], 0x30
691; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
692; VI-NEXT:    v_mov_b32_e32 v4, 0x3020504
693; VI-NEXT:    s_waitcnt lgkmcnt(0)
694; VI-NEXT:    v_mov_b32_e32 v1, s3
695; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
696; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
697; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
698; VI-NEXT:    v_mov_b32_e32 v3, s1
699; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
700; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
701; VI-NEXT:    s_waitcnt vmcnt(0)
702; VI-NEXT:    v_perm_b32 v1, s4, v1, v4
703; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
704; VI-NEXT:    s_endpgm
705;
706; GFX900-LABEL: v_insertelement_v4bf16_2:
707; GFX900:       ; %bb.0:
708; GFX900-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
709; GFX900-NEXT:    s_load_dword s4, s[8:9], 0x30
710; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
711; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
712; GFX900-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
713; GFX900-NEXT:    s_mov_b32 s2, 0xffff
714; GFX900-NEXT:    v_mov_b32_e32 v3, s4
715; GFX900-NEXT:    s_waitcnt vmcnt(0)
716; GFX900-NEXT:    v_bfi_b32 v1, s2, v3, v1
717; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
718; GFX900-NEXT:    s_endpgm
719;
720; GFX940-LABEL: v_insertelement_v4bf16_2:
721; GFX940:       ; %bb.0:
722; GFX940-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
723; GFX940-NEXT:    s_load_dword s6, s[4:5], 0x30
724; GFX940-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
725; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
726; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
727; GFX940-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
728; GFX940-NEXT:    s_mov_b32 s2, 0xffff
729; GFX940-NEXT:    v_mov_b32_e32 v3, s6
730; GFX940-NEXT:    s_waitcnt vmcnt(0)
731; GFX940-NEXT:    v_bfi_b32 v1, s2, v3, v1
732; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
733; GFX940-NEXT:    s_endpgm
734  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
735  %tid.ext = sext i32 %tid to i64
736  %in.gep = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext
737  %out.gep = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %out, i64 %tid.ext
738  %vec = load <4 x bfloat>, ptr addrspace(1) %in.gep
  ; Reinterpret the low 16 bits of %val as a bfloat (bit-preserving).
739  %val.trunc = trunc i32 %val to i16
740  %val.cvt = bitcast i16 %val.trunc to bfloat
741  %vecins = insertelement <4 x bfloat> %vec, bfloat %val.cvt, i32 2
742  store <4 x bfloat> %vecins, ptr addrspace(1) %out.gep
743  ret void
744}
745
; Constant-index insert into element 3 (the last element) of a <4 x bfloat>.
; Each work-item loads its own vector from %in, replaces element 3 with the
; low 16 bits of %val reinterpreted as bfloat, and stores the result to the
; matching slot of %out.
; Element 3 occupies the upper 16 bits of the second dword, so the expected
; code for every target rewrites only the high dword of the loaded pair
; (mask-and-or with the value shifted left by 16 on tahiti, v_perm_b32 on the
; other three targets).
746define amdgpu_kernel void @v_insertelement_v4bf16_3(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) #0 {
747; SI-LABEL: v_insertelement_v4bf16_3:
748; SI:       ; %bb.0:
749; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
750; SI-NEXT:    s_mov_b32 s7, 0x100f000
751; SI-NEXT:    s_mov_b32 s6, 0
752; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
753; SI-NEXT:    v_mov_b32_e32 v1, 0
754; SI-NEXT:    s_waitcnt lgkmcnt(0)
755; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
756; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
757; SI-NEXT:    s_load_dword s8, s[8:9], 0x4
758; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
759; SI-NEXT:    s_waitcnt lgkmcnt(0)
760; SI-NEXT:    s_lshl_b32 s4, s8, 16
761; SI-NEXT:    s_waitcnt vmcnt(0)
762; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
763; SI-NEXT:    v_or_b32_e32 v3, s4, v3
764; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
765; SI-NEXT:    s_endpgm
766;
767; VI-LABEL: v_insertelement_v4bf16_3:
768; VI:       ; %bb.0:
769; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
770; VI-NEXT:    s_load_dword s4, s[8:9], 0x10
771; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
772; VI-NEXT:    v_mov_b32_e32 v4, 0x1000504
773; VI-NEXT:    s_waitcnt lgkmcnt(0)
774; VI-NEXT:    v_mov_b32_e32 v1, s3
775; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
776; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
777; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
778; VI-NEXT:    v_mov_b32_e32 v3, s1
779; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
780; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
781; VI-NEXT:    s_waitcnt vmcnt(0)
782; VI-NEXT:    v_perm_b32 v1, v1, s4, v4
783; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
784; VI-NEXT:    s_endpgm
785;
786; GFX900-LABEL: v_insertelement_v4bf16_3:
787; GFX900:       ; %bb.0:
788; GFX900-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
789; GFX900-NEXT:    s_load_dword s4, s[8:9], 0x10
790; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
791; GFX900-NEXT:    v_mov_b32_e32 v3, 0x5040100
792; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
793; GFX900-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
794; GFX900-NEXT:    s_waitcnt vmcnt(0)
795; GFX900-NEXT:    v_perm_b32 v1, s4, v1, v3
796; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
797; GFX900-NEXT:    s_endpgm
798;
799; GFX940-LABEL: v_insertelement_v4bf16_3:
800; GFX940:       ; %bb.0:
801; GFX940-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
802; GFX940-NEXT:    s_load_dword s6, s[4:5], 0x10
803; GFX940-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
804; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
805; GFX940-NEXT:    v_mov_b32_e32 v3, 0x5040100
806; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
807; GFX940-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
808; GFX940-NEXT:    s_waitcnt vmcnt(0)
809; GFX940-NEXT:    v_perm_b32 v1, s6, v1, v3
810; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
811; GFX940-NEXT:    s_endpgm
  ; The work-item id selects the <4 x bfloat> slot used in both buffers.
812  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
813  %tid.ext = sext i32 %tid to i64
814  %in.gep = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext
815  %out.gep = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %out, i64 %tid.ext
816  %vec = load <4 x bfloat>, ptr addrspace(1) %in.gep
  ; Take the low 16 bits of %val and reinterpret them as bfloat (bit pattern
  ; is kept, no numeric conversion).
817  %val.trunc = trunc i32 %val to i16
818  %val.cvt = bitcast i16 %val.trunc to bfloat
819  %vecins = insertelement <4 x bfloat> %vec, bfloat %val.cvt, i32 3
820  store <4 x bfloat> %vecins, ptr addrspace(1) %out.gep
821  ret void
822}
823
; Variable-index insert into a <4 x bfloat> where the index (%idxval) is a
; kernel argument; the expected code loads it into scalar registers alongside
; %val.
; Each work-item loads its own vector from %in, replaces element %idxval with
; the low 16 bits of %val reinterpreted as bfloat, and stores to %out.
; On all four targets the expected code converts the index to a bit offset
; (shift left by 4), builds a 64-bit mask by shifting 0xffff by that offset,
; replicates the 16-bit value into both halves of a dword, and applies one
; v_bfi_b32 per dword of the vector.
824define amdgpu_kernel void @v_insertelement_v4bf16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %idxval) #0 {
825; SI-LABEL: v_insertelement_v4bf16_dynamic_sgpr:
826; SI:       ; %bb.0:
827; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
828; SI-NEXT:    s_mov_b32 s7, 0x100f000
829; SI-NEXT:    s_mov_b32 s6, 0
830; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
831; SI-NEXT:    v_mov_b32_e32 v1, 0
832; SI-NEXT:    s_waitcnt lgkmcnt(0)
833; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
834; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
835; SI-NEXT:    s_load_dwordx2 s[8:9], s[8:9], 0x4
836; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
837; SI-NEXT:    s_waitcnt lgkmcnt(0)
838; SI-NEXT:    s_lshl_b32 s4, s8, 16
839; SI-NEXT:    s_and_b32 s5, s8, 0xffff
840; SI-NEXT:    s_lshl_b32 s6, s9, 4
841; SI-NEXT:    s_or_b32 s7, s5, s4
842; SI-NEXT:    s_lshl_b64 s[4:5], 0xffff, s6
843; SI-NEXT:    v_mov_b32_e32 v4, s7
844; SI-NEXT:    v_mov_b32_e32 v5, s7
845; SI-NEXT:    s_waitcnt vmcnt(0)
846; SI-NEXT:    v_bfi_b32 v3, s5, v4, v3
847; SI-NEXT:    v_bfi_b32 v2, s4, v5, v2
848; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
849; SI-NEXT:    s_endpgm
850;
851; VI-LABEL: v_insertelement_v4bf16_dynamic_sgpr:
852; VI:       ; %bb.0:
853; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
854; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
855; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
856; VI-NEXT:    s_waitcnt lgkmcnt(0)
857; VI-NEXT:    v_mov_b32_e32 v1, s3
858; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
859; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
860; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
861; VI-NEXT:    v_mov_b32_e32 v3, s1
862; VI-NEXT:    s_lshl_b32 s1, s4, 16
863; VI-NEXT:    s_and_b32 s2, s4, 0xffff
864; VI-NEXT:    s_lshl_b32 s3, s5, 4
865; VI-NEXT:    s_or_b32 s2, s2, s1
866; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
867; VI-NEXT:    s_lshl_b64 s[0:1], 0xffff, s3
868; VI-NEXT:    v_mov_b32_e32 v4, s2
869; VI-NEXT:    v_mov_b32_e32 v5, s2
870; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
871; VI-NEXT:    s_waitcnt vmcnt(0)
872; VI-NEXT:    v_bfi_b32 v1, s1, v4, v1
873; VI-NEXT:    v_bfi_b32 v0, s0, v5, v0
874; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
875; VI-NEXT:    s_endpgm
876;
877; GFX900-LABEL: v_insertelement_v4bf16_dynamic_sgpr:
878; GFX900:       ; %bb.0:
879; GFX900-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
880; GFX900-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
881; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
882; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
883; GFX900-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
884; GFX900-NEXT:    s_lshl_b32 s2, s5, 4
885; GFX900-NEXT:    s_pack_ll_b32_b16 s4, s4, s4
886; GFX900-NEXT:    s_lshl_b64 s[2:3], 0xffff, s2
887; GFX900-NEXT:    v_mov_b32_e32 v3, s4
888; GFX900-NEXT:    v_mov_b32_e32 v4, s4
889; GFX900-NEXT:    s_waitcnt vmcnt(0)
890; GFX900-NEXT:    v_bfi_b32 v1, s3, v3, v1
891; GFX900-NEXT:    v_bfi_b32 v0, s2, v4, v0
892; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
893; GFX900-NEXT:    s_endpgm
894;
895; GFX940-LABEL: v_insertelement_v4bf16_dynamic_sgpr:
896; GFX940:       ; %bb.0:
897; GFX940-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
898; GFX940-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
899; GFX940-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
900; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
901; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
902; GFX940-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
903; GFX940-NEXT:    s_lshl_b32 s2, s7, 4
904; GFX940-NEXT:    s_pack_ll_b32_b16 s4, s6, s6
905; GFX940-NEXT:    s_lshl_b64 s[2:3], 0xffff, s2
906; GFX940-NEXT:    v_mov_b32_e32 v3, s4
907; GFX940-NEXT:    v_mov_b32_e32 v4, s4
908; GFX940-NEXT:    s_waitcnt vmcnt(0)
909; GFX940-NEXT:    v_bfi_b32 v1, s3, v3, v1
910; GFX940-NEXT:    v_bfi_b32 v0, s2, v4, v0
911; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
912; GFX940-NEXT:    s_endpgm
  ; The work-item id selects the <4 x bfloat> slot used in both buffers.
913  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
914  %tid.ext = sext i32 %tid to i64
915  %in.gep = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext
916  %out.gep = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %out, i64 %tid.ext
917  %vec = load <4 x bfloat>, ptr addrspace(1) %in.gep
  ; Take the low 16 bits of %val and reinterpret them as bfloat (bit pattern
  ; is kept, no numeric conversion).
918  %val.trunc = trunc i32 %val to i16
919  %val.cvt = bitcast i16 %val.trunc to bfloat
  ; The element index is the runtime kernel argument %idxval, not a constant.
920  %vecins = insertelement <4 x bfloat> %vec, bfloat %val.cvt, i32 %idxval
921  store <4 x bfloat> %vecins, ptr addrspace(1) %out.gep
922  ret void
923}
924
; Constant-index insert into element 3 of an <8 x bfloat>.
; Each work-item loads its own 16-byte vector from %in, replaces element 3
; with the low 16 bits of %val reinterpreted as bfloat, and stores the result
; to the matching slot of %out.
; Element 3 is the upper 16 bits of the second dword, so the expected code
; for every target modifies only v1 of the loaded v[0:3] quad and stores the
; other three dwords unchanged.
925define amdgpu_kernel void @v_insertelement_v8bf16_3(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) {
926; SI-LABEL: v_insertelement_v8bf16_3:
927; SI:       ; %bb.0:
928; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
929; SI-NEXT:    s_mov_b32 s7, 0x100f000
930; SI-NEXT:    s_mov_b32 s6, 0
931; SI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
932; SI-NEXT:    v_mov_b32_e32 v5, 0
933; SI-NEXT:    s_waitcnt lgkmcnt(0)
934; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
935; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64
936; SI-NEXT:    s_load_dword s8, s[8:9], 0x4
937; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
938; SI-NEXT:    s_waitcnt lgkmcnt(0)
939; SI-NEXT:    s_lshl_b32 s4, s8, 16
940; SI-NEXT:    s_waitcnt vmcnt(0)
941; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
942; SI-NEXT:    v_or_b32_e32 v1, s4, v1
943; SI-NEXT:    buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64
944; SI-NEXT:    s_endpgm
945;
946; VI-LABEL: v_insertelement_v8bf16_3:
947; VI:       ; %bb.0:
948; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
949; VI-NEXT:    s_load_dword s4, s[8:9], 0x10
950; VI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
951; VI-NEXT:    s_waitcnt lgkmcnt(0)
952; VI-NEXT:    v_mov_b32_e32 v1, s3
953; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
954; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
955; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
956; VI-NEXT:    v_add_u32_e32 v4, vcc, s0, v4
957; VI-NEXT:    s_lshl_b32 s0, s4, 16
958; VI-NEXT:    v_mov_b32_e32 v5, s1
959; VI-NEXT:    v_mov_b32_e32 v6, s0
960; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
961; VI-NEXT:    s_waitcnt vmcnt(0)
962; VI-NEXT:    v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
963; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
964; VI-NEXT:    s_endpgm
965;
966; GFX900-LABEL: v_insertelement_v8bf16_3:
967; GFX900:       ; %bb.0:
968; GFX900-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
969; GFX900-NEXT:    s_load_dword s4, s[8:9], 0x10
970; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
971; GFX900-NEXT:    v_mov_b32_e32 v5, 0x5040100
972; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
973; GFX900-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3]
974; GFX900-NEXT:    s_waitcnt vmcnt(0)
975; GFX900-NEXT:    v_perm_b32 v1, s4, v1, v5
976; GFX900-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
977; GFX900-NEXT:    s_endpgm
978;
979; GFX940-LABEL: v_insertelement_v8bf16_3:
980; GFX940:       ; %bb.0:
981; GFX940-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
982; GFX940-NEXT:    s_load_dword s6, s[4:5], 0x10
983; GFX940-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
984; GFX940-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
985; GFX940-NEXT:    v_mov_b32_e32 v5, 0x5040100
986; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
987; GFX940-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3]
988; GFX940-NEXT:    s_waitcnt vmcnt(0)
989; GFX940-NEXT:    v_perm_b32 v1, s6, v1, v5
990; GFX940-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
991; GFX940-NEXT:    s_endpgm
  ; The work-item id selects the <8 x bfloat> slot used in both buffers.
992  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
993  %tid.ext = sext i32 %tid to i64
994  %in.gep = getelementptr inbounds <8 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext
995  %out.gep = getelementptr inbounds <8 x bfloat>, ptr addrspace(1) %out, i64 %tid.ext
996  %vec = load <8 x bfloat>, ptr addrspace(1) %in.gep
  ; Take the low 16 bits of %val and reinterpret them as bfloat (bit pattern
  ; is kept, no numeric conversion).
997  %val.trunc = trunc i32 %val to i16
998  %val.cvt = bitcast i16 %val.trunc to bfloat
999  %vecins = insertelement <8 x bfloat> %vec, bfloat %val.cvt, i32 3
1000  store <8 x bfloat> %vecins, ptr addrspace(1) %out.gep
1001  ret void
1002}
1003
; Variable-index insert into an <8 x bfloat>; the index is the kernel
; argument %n.
; Each work-item loads its own 16-byte vector from %in, replaces element %n
; with the low 16 bits of %val reinterpreted as bfloat, and stores to %out.
; Unlike the <4 x bfloat> dynamic case, the expected code here is a
; compare-and-select chain: the index is compared against each of 0..7 with
; s_cmp_eq_u32, every 16-bit half is conditionally replaced via v_cndmask_b32,
; and the halves are then re-packed into dwords (and/shift/or on tahiti,
; v_or_b32_sdwa on tonga, v_perm_b32 with selector 0x5040100 on gfx900 and
; gfx940).
1004define amdgpu_kernel void @v_insertelement_v8bf16_dynamic(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %n) {
1005; SI-LABEL: v_insertelement_v8bf16_dynamic:
1006; SI:       ; %bb.0:
1007; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1008; SI-NEXT:    s_mov_b32 s7, 0x100f000
1009; SI-NEXT:    s_mov_b32 s6, 0
1010; SI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
1011; SI-NEXT:    v_mov_b32_e32 v5, 0
1012; SI-NEXT:    s_waitcnt lgkmcnt(0)
1013; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
1014; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64
1015; SI-NEXT:    s_load_dwordx2 s[8:9], s[8:9], 0x4
1016; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
1017; SI-NEXT:    s_waitcnt lgkmcnt(0)
1018; SI-NEXT:    s_cmp_eq_u32 s9, 6
1019; SI-NEXT:    v_mov_b32_e32 v6, s8
1020; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1021; SI-NEXT:    s_cmp_eq_u32 s9, 7
1022; SI-NEXT:    s_waitcnt vmcnt(0)
1023; SI-NEXT:    v_cndmask_b32_e32 v7, v3, v6, vcc
1024; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
1025; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1026; SI-NEXT:    s_cmp_eq_u32 s9, 4
1027; SI-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
1028; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1029; SI-NEXT:    s_cmp_eq_u32 s9, 5
1030; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
1031; SI-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
1032; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1033; SI-NEXT:    s_cmp_eq_u32 s9, 2
1034; SI-NEXT:    v_and_b32_e32 v7, 0xffff, v7
1035; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
1036; SI-NEXT:    v_cndmask_b32_e32 v8, v8, v6, vcc
1037; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1038; SI-NEXT:    s_cmp_eq_u32 s9, 3
1039; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
1040; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
1041; SI-NEXT:    v_or_b32_e32 v3, v7, v3
1042; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v8
1043; SI-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
1044; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1045; SI-NEXT:    s_cmp_eq_u32 s9, 0
1046; SI-NEXT:    v_or_b32_e32 v2, v2, v7
1047; SI-NEXT:    v_cndmask_b32_e32 v7, v9, v6, vcc
1048; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1049; SI-NEXT:    s_cmp_eq_u32 s9, 1
1050; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
1051; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
1052; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1053; SI-NEXT:    v_cndmask_b32_e32 v6, v10, v6, vcc
1054; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
1055; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
1056; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1057; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
1058; SI-NEXT:    v_or_b32_e32 v1, v1, v7
1059; SI-NEXT:    v_or_b32_e32 v0, v0, v6
1060; SI-NEXT:    buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64
1061; SI-NEXT:    s_endpgm
1062;
1063; VI-LABEL: v_insertelement_v8bf16_dynamic:
1064; VI:       ; %bb.0:
1065; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1066; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
1067; VI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
1068; VI-NEXT:    s_waitcnt lgkmcnt(0)
1069; VI-NEXT:    v_mov_b32_e32 v1, s3
1070; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
1071; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1072; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
1073; VI-NEXT:    v_mov_b32_e32 v5, s1
1074; VI-NEXT:    v_add_u32_e32 v4, vcc, s0, v4
1075; VI-NEXT:    s_cmp_eq_u32 s5, 6
1076; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
1077; VI-NEXT:    v_mov_b32_e32 v6, s4
1078; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1079; VI-NEXT:    s_cmp_eq_u32 s5, 7
1080; VI-NEXT:    s_waitcnt vmcnt(0)
1081; VI-NEXT:    v_cndmask_b32_e32 v7, v3, v6, vcc
1082; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
1083; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1084; VI-NEXT:    s_cmp_eq_u32 s5, 4
1085; VI-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
1086; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1087; VI-NEXT:    s_cmp_eq_u32 s5, 5
1088; VI-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
1089; VI-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
1090; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1091; VI-NEXT:    s_cmp_eq_u32 s5, 2
1092; VI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
1093; VI-NEXT:    v_cndmask_b32_e32 v8, v8, v6, vcc
1094; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1095; VI-NEXT:    s_cmp_eq_u32 s5, 3
1096; VI-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
1097; VI-NEXT:    v_or_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1098; VI-NEXT:    v_lshlrev_b32_e32 v7, 16, v8
1099; VI-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
1100; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1101; VI-NEXT:    s_cmp_eq_u32 s5, 0
1102; VI-NEXT:    v_or_b32_sdwa v2, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1103; VI-NEXT:    v_cndmask_b32_e32 v7, v9, v6, vcc
1104; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1105; VI-NEXT:    s_cmp_eq_u32 s5, 1
1106; VI-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
1107; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
1108; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1109; VI-NEXT:    v_cndmask_b32_e32 v6, v10, v6, vcc
1110; VI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
1111; VI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
1112; VI-NEXT:    v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1113; VI-NEXT:    v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1114; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
1115; VI-NEXT:    s_endpgm
1116;
1117; GFX900-LABEL: v_insertelement_v8bf16_dynamic:
1118; GFX900:       ; %bb.0:
1119; GFX900-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1120; GFX900-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
1121; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
1122; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
1123; GFX900-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3]
1124; GFX900-NEXT:    s_cmp_eq_u32 s5, 6
1125; GFX900-NEXT:    v_mov_b32_e32 v5, s4
1126; GFX900-NEXT:    s_cselect_b64 vcc, -1, 0
1127; GFX900-NEXT:    s_cmp_eq_u32 s5, 7
1128; GFX900-NEXT:    s_mov_b32 s2, 0x5040100
1129; GFX900-NEXT:    s_waitcnt vmcnt(0)
1130; GFX900-NEXT:    v_cndmask_b32_e32 v6, v3, v5, vcc
1131; GFX900-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
1132; GFX900-NEXT:    s_cselect_b64 vcc, -1, 0
1133; GFX900-NEXT:    s_cmp_eq_u32 s5, 4
1134; GFX900-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
1135; GFX900-NEXT:    s_cselect_b64 vcc, -1, 0
1136; GFX900-NEXT:    s_cmp_eq_u32 s5, 5
1137; GFX900-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
1138; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
1139; GFX900-NEXT:    s_cselect_b64 vcc, -1, 0
1140; GFX900-NEXT:    s_cmp_eq_u32 s5, 2
1141; GFX900-NEXT:    v_perm_b32 v3, v3, v6, s2
1142; GFX900-NEXT:    v_cndmask_b32_e32 v6, v7, v5, vcc
1143; GFX900-NEXT:    s_cselect_b64 vcc, -1, 0
1144; GFX900-NEXT:    s_cmp_eq_u32 s5, 3
1145; GFX900-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
1146; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
1147; GFX900-NEXT:    s_cselect_b64 vcc, -1, 0
1148; GFX900-NEXT:    s_cmp_eq_u32 s5, 0
1149; GFX900-NEXT:    v_perm_b32 v2, v6, v2, s2
1150; GFX900-NEXT:    v_cndmask_b32_e32 v6, v8, v5, vcc
1151; GFX900-NEXT:    s_cselect_b64 vcc, -1, 0
1152; GFX900-NEXT:    s_cmp_eq_u32 s5, 1
1153; GFX900-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
1154; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
1155; GFX900-NEXT:    s_cselect_b64 vcc, -1, 0
1156; GFX900-NEXT:    v_cndmask_b32_e32 v5, v9, v5, vcc
1157; GFX900-NEXT:    v_perm_b32 v1, v6, v1, s2
1158; GFX900-NEXT:    v_perm_b32 v0, v5, v0, s2
1159; GFX900-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
1160; GFX900-NEXT:    s_endpgm
1161;
1162; GFX940-LABEL: v_insertelement_v8bf16_dynamic:
1163; GFX940:       ; %bb.0:
1164; GFX940-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1165; GFX940-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
1166; GFX940-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1167; GFX940-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
1168; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
1169; GFX940-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3]
1170; GFX940-NEXT:    s_cmp_eq_u32 s7, 6
1171; GFX940-NEXT:    v_mov_b32_e32 v5, s6
1172; GFX940-NEXT:    s_cselect_b64 vcc, -1, 0
1173; GFX940-NEXT:    s_cmp_eq_u32 s7, 7
1174; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
1175; GFX940-NEXT:    s_waitcnt vmcnt(0)
1176; GFX940-NEXT:    v_cndmask_b32_e32 v6, v3, v5, vcc
1177; GFX940-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
1178; GFX940-NEXT:    s_cselect_b64 vcc, -1, 0
1179; GFX940-NEXT:    s_cmp_eq_u32 s7, 4
1180; GFX940-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
1181; GFX940-NEXT:    s_cselect_b64 vcc, -1, 0
1182; GFX940-NEXT:    s_cmp_eq_u32 s7, 5
1183; GFX940-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
1184; GFX940-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
1185; GFX940-NEXT:    s_cselect_b64 vcc, -1, 0
1186; GFX940-NEXT:    s_cmp_eq_u32 s7, 2
1187; GFX940-NEXT:    v_perm_b32 v3, v3, v6, s2
1188; GFX940-NEXT:    v_cndmask_b32_e32 v6, v7, v5, vcc
1189; GFX940-NEXT:    s_cselect_b64 vcc, -1, 0
1190; GFX940-NEXT:    s_cmp_eq_u32 s7, 3
1191; GFX940-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
1192; GFX940-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
1193; GFX940-NEXT:    s_cselect_b64 vcc, -1, 0
1194; GFX940-NEXT:    s_cmp_eq_u32 s7, 0
1195; GFX940-NEXT:    v_perm_b32 v2, v6, v2, s2
1196; GFX940-NEXT:    v_cndmask_b32_e32 v6, v8, v5, vcc
1197; GFX940-NEXT:    s_cselect_b64 vcc, -1, 0
1198; GFX940-NEXT:    s_cmp_eq_u32 s7, 1
1199; GFX940-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
1200; GFX940-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
1201; GFX940-NEXT:    s_cselect_b64 vcc, -1, 0
1202; GFX940-NEXT:    v_cndmask_b32_e32 v5, v9, v5, vcc
1203; GFX940-NEXT:    v_perm_b32 v1, v6, v1, s2
1204; GFX940-NEXT:    v_perm_b32 v0, v5, v0, s2
1205; GFX940-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
1206; GFX940-NEXT:    s_endpgm
  ; The work-item id selects the <8 x bfloat> slot used in both buffers.
1207  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1208  %tid.ext = sext i32 %tid to i64
1209  %in.gep = getelementptr inbounds <8 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext
1210  %out.gep = getelementptr inbounds <8 x bfloat>, ptr addrspace(1) %out, i64 %tid.ext
1211  %vec = load <8 x bfloat>, ptr addrspace(1) %in.gep
  ; Take the low 16 bits of %val and reinterpret them as bfloat (bit pattern
  ; is kept, no numeric conversion).
1212  %val.trunc = trunc i32 %val to i16
1213  %val.cvt = bitcast i16 %val.trunc to bfloat
  ; The element index is the runtime kernel argument %n, not a constant.
1214  %vecins = insertelement <8 x bfloat> %vec, bfloat %val.cvt, i32 %n
1215  store <8 x bfloat> %vecins, ptr addrspace(1) %out.gep
1216  ret void
1217}
1218
; Constant-index insert into element 3 of a <16 x bfloat>.
; Each work-item loads its own 32-byte vector from %in, replaces element 3
; with the low 16 bits of %val reinterpreted as bfloat, and stores the result
; to the matching slot of %out.
; The expected code moves the vector as two 16-byte halves (the second at
; byte offset 16). Only v1 of the first half is modified; the second half is
; stored back unchanged. The expected waits are vmcnt(1) before the modify
; and vmcnt(0) before the stores.
1219define amdgpu_kernel void @v_insertelement_v16bf16_3(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) {
1220; SI-LABEL: v_insertelement_v16bf16_3:
1221; SI:       ; %bb.0:
1222; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1223; SI-NEXT:    s_mov_b32 s7, 0x100f000
1224; SI-NEXT:    s_mov_b32 s6, 0
1225; SI-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
1226; SI-NEXT:    v_mov_b32_e32 v9, 0
1227; SI-NEXT:    s_waitcnt lgkmcnt(0)
1228; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
1229; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64
1230; SI-NEXT:    buffer_load_dwordx4 v[4:7], v[8:9], s[4:7], 0 addr64 offset:16
1231; SI-NEXT:    s_load_dword s8, s[8:9], 0x4
1232; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
1233; SI-NEXT:    s_waitcnt lgkmcnt(0)
1234; SI-NEXT:    s_lshl_b32 s4, s8, 16
1235; SI-NEXT:    s_waitcnt vmcnt(1)
1236; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
1237; SI-NEXT:    v_or_b32_e32 v1, s4, v1
1238; SI-NEXT:    s_waitcnt vmcnt(0)
1239; SI-NEXT:    buffer_store_dwordx4 v[4:7], v[8:9], s[0:3], 0 addr64 offset:16
1240; SI-NEXT:    buffer_store_dwordx4 v[0:3], v[8:9], s[0:3], 0 addr64
1241; SI-NEXT:    s_endpgm
1242;
1243; VI-LABEL: v_insertelement_v16bf16_3:
1244; VI:       ; %bb.0:
1245; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1246; VI-NEXT:    s_load_dword s4, s[8:9], 0x10
1247; VI-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
1248; VI-NEXT:    s_waitcnt lgkmcnt(0)
1249; VI-NEXT:    v_mov_b32_e32 v1, s3
1250; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v8
1251; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1252; VI-NEXT:    v_add_u32_e32 v4, vcc, 16, v0
1253; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
1254; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
1255; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
1256; VI-NEXT:    v_mov_b32_e32 v9, s1
1257; VI-NEXT:    v_add_u32_e32 v8, vcc, s0, v8
1258; VI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
1259; VI-NEXT:    s_lshl_b32 s1, s4, 16
1260; VI-NEXT:    v_add_u32_e32 v10, vcc, 16, v8
1261; VI-NEXT:    v_mov_b32_e32 v12, s1
1262; VI-NEXT:    v_addc_u32_e32 v11, vcc, 0, v9, vcc
1263; VI-NEXT:    s_waitcnt vmcnt(1)
1264; VI-NEXT:    v_or_b32_sdwa v1, v1, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1265; VI-NEXT:    s_waitcnt vmcnt(0)
1266; VI-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
1267; VI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
1268; VI-NEXT:    s_endpgm
1269;
1270; GFX900-LABEL: v_insertelement_v16bf16_3:
1271; GFX900:       ; %bb.0:
1272; GFX900-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1273; GFX900-NEXT:    s_load_dword s4, s[8:9], 0x10
1274; GFX900-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
1275; GFX900-NEXT:    v_mov_b32_e32 v9, 0x5040100
1276; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
1277; GFX900-NEXT:    global_load_dwordx4 v[0:3], v8, s[2:3]
1278; GFX900-NEXT:    global_load_dwordx4 v[4:7], v8, s[2:3] offset:16
1279; GFX900-NEXT:    s_waitcnt vmcnt(1)
1280; GFX900-NEXT:    v_perm_b32 v1, s4, v1, v9
1281; GFX900-NEXT:    s_waitcnt vmcnt(0)
1282; GFX900-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
1283; GFX900-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
1284; GFX900-NEXT:    s_endpgm
1285;
1286; GFX940-LABEL: v_insertelement_v16bf16_3:
1287; GFX940:       ; %bb.0:
1288; GFX940-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1289; GFX940-NEXT:    s_load_dword s6, s[4:5], 0x10
1290; GFX940-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1291; GFX940-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
1292; GFX940-NEXT:    v_mov_b32_e32 v9, 0x5040100
1293; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
1294; GFX940-NEXT:    global_load_dwordx4 v[0:3], v8, s[2:3]
1295; GFX940-NEXT:    global_load_dwordx4 v[4:7], v8, s[2:3] offset:16
1296; GFX940-NEXT:    s_waitcnt vmcnt(1)
1297; GFX940-NEXT:    v_perm_b32 v1, s6, v1, v9
1298; GFX940-NEXT:    s_waitcnt vmcnt(0)
1299; GFX940-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1
1300; GFX940-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
1301; GFX940-NEXT:    s_endpgm
  ; The work-item id selects the <16 x bfloat> slot used in both buffers.
1302  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1303  %tid.ext = sext i32 %tid to i64
1304  %in.gep = getelementptr inbounds <16 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext
1305  %out.gep = getelementptr inbounds <16 x bfloat>, ptr addrspace(1) %out, i64 %tid.ext
1306  %vec = load <16 x bfloat>, ptr addrspace(1) %in.gep
  ; Take the low 16 bits of %val and reinterpret them as bfloat (bit pattern
  ; is kept, no numeric conversion).
1307  %val.trunc = trunc i32 %val to i16
1308  %val.cvt = bitcast i16 %val.trunc to bfloat
1309  %vecins = insertelement <16 x bfloat> %vec, bfloat %val.cvt, i32 3
1310  store <16 x bfloat> %vecins, ptr addrspace(1) %out.gep
1311  ret void
1312}
1313
1314define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %n) {
1315; SI-LABEL: v_insertelement_v16bf16_dynamic:
1316; SI:       ; %bb.0:
1317; SI-NEXT:    s_load_dwordx4 s[12:15], s[8:9], 0x0
1318; SI-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x4
1319; SI-NEXT:    s_mov_b32 s3, 0x100f000
1320; SI-NEXT:    s_mov_b32 s2, 0
1321; SI-NEXT:    v_lshlrev_b32_e32 v4, 5, v0
1322; SI-NEXT:    s_waitcnt lgkmcnt(0)
1323; SI-NEXT:    s_mov_b64 s[0:1], s[14:15]
1324; SI-NEXT:    v_mov_b32_e32 v5, 0
1325; SI-NEXT:    buffer_load_dwordx4 v[7:10], v[4:5], s[0:3], 0 addr64
1326; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64 offset:16
1327; SI-NEXT:    s_cmp_eq_u32 s7, 6
1328; SI-NEXT:    v_mov_b32_e32 v6, s6
1329; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1330; SI-NEXT:    s_cmp_eq_u32 s7, 7
1331; SI-NEXT:    s_mov_b64 s[14:15], s[2:3]
1332; SI-NEXT:    s_waitcnt vmcnt(1)
1333; SI-NEXT:    v_cndmask_b32_e32 v11, v10, v6, vcc
1334; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1335; SI-NEXT:    s_cmp_eq_u32 s7, 4
1336; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
1337; SI-NEXT:    s_cmp_eq_u32 s7, 5
1338; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
1339; SI-NEXT:    v_lshrrev_b32_e32 v12, 16, v9
1340; SI-NEXT:    v_cndmask_b32_e64 v9, v9, v6, s[0:1]
1341; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
1342; SI-NEXT:    s_cmp_eq_u32 s7, 2
1343; SI-NEXT:    s_cselect_b64 s[2:3], -1, 0
1344; SI-NEXT:    s_cmp_eq_u32 s7, 3
1345; SI-NEXT:    v_cndmask_b32_e32 v10, v10, v6, vcc
1346; SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v8
1347; SI-NEXT:    v_cndmask_b32_e64 v8, v8, v6, s[2:3]
1348; SI-NEXT:    s_cselect_b64 s[2:3], -1, 0
1349; SI-NEXT:    s_cmp_eq_u32 s7, 0
1350; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v11
1351; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
1352; SI-NEXT:    v_cndmask_b32_e64 v12, v12, v6, s[0:1]
1353; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
1354; SI-NEXT:    v_or_b32_e32 v10, v11, v10
1355; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
1356; SI-NEXT:    v_cndmask_b32_e64 v12, v13, v6, s[2:3]
1357; SI-NEXT:    s_cmp_eq_u32 s7, 1
1358; SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v7
1359; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v8
1360; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
1361; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1362; SI-NEXT:    s_cmp_eq_u32 s7, 14
1363; SI-NEXT:    v_cndmask_b32_e64 v7, v7, v6, s[4:5]
1364; SI-NEXT:    v_or_b32_e32 v8, v8, v12
1365; SI-NEXT:    v_cndmask_b32_e32 v12, v14, v6, vcc
1366; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1367; SI-NEXT:    s_cmp_eq_u32 s7, 15
1368; SI-NEXT:    s_waitcnt vmcnt(0)
1369; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v3
1370; SI-NEXT:    v_and_b32_e32 v7, 0xffff, v7
1371; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
1372; SI-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
1373; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1374; SI-NEXT:    s_cmp_eq_u32 s7, 12
1375; SI-NEXT:    v_or_b32_e32 v7, v7, v12
1376; SI-NEXT:    v_cndmask_b32_e32 v12, v15, v6, vcc
1377; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1378; SI-NEXT:    s_cmp_eq_u32 s7, 13
1379; SI-NEXT:    v_lshrrev_b32_e32 v16, 16, v2
1380; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
1381; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
1382; SI-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
1383; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1384; SI-NEXT:    s_cmp_eq_u32 s7, 10
1385; SI-NEXT:    v_or_b32_e32 v3, v3, v12
1386; SI-NEXT:    v_cndmask_b32_e32 v12, v16, v6, vcc
1387; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1388; SI-NEXT:    s_cmp_eq_u32 s7, 11
1389; SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v1
1390; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
1391; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
1392; SI-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
1393; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1394; SI-NEXT:    s_cmp_eq_u32 s7, 8
1395; SI-NEXT:    v_and_b32_e32 v9, 0xffff, v9
1396; SI-NEXT:    v_or_b32_e32 v2, v2, v12
1397; SI-NEXT:    v_cndmask_b32_e32 v12, v17, v6, vcc
1398; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1399; SI-NEXT:    s_cmp_eq_u32 s7, 9
1400; SI-NEXT:    v_or_b32_e32 v9, v9, v11
1401; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v0
1402; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
1403; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1404; SI-NEXT:    v_cndmask_b32_e32 v6, v11, v6, vcc
1405; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
1406; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
1407; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1408; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
1409; SI-NEXT:    v_or_b32_e32 v1, v1, v12
1410; SI-NEXT:    v_or_b32_e32 v0, v0, v6
1411; SI-NEXT:    buffer_store_dwordx4 v[0:3], v[4:5], s[12:15], 0 addr64 offset:16
1412; SI-NEXT:    buffer_store_dwordx4 v[7:10], v[4:5], s[12:15], 0 addr64
1413; SI-NEXT:    s_endpgm
1414;
1415; VI-LABEL: v_insertelement_v16bf16_dynamic:
1416; VI:       ; %bb.0:
1417; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1418; VI-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x10
1419; VI-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
1420; VI-NEXT:    s_waitcnt lgkmcnt(0)
1421; VI-NEXT:    v_mov_b32_e32 v0, s3
1422; VI-NEXT:    v_add_u32_e32 v4, vcc, s2, v8
1423; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v0, vcc
1424; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v4
1425; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v5, vcc
1426; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
1427; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
1428; VI-NEXT:    v_mov_b32_e32 v9, s1
1429; VI-NEXT:    v_add_u32_e32 v8, vcc, s0, v8
1430; VI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
1431; VI-NEXT:    v_add_u32_e32 v10, vcc, 16, v8
1432; VI-NEXT:    s_cmp_eq_u32 s7, 14
1433; VI-NEXT:    v_addc_u32_e32 v11, vcc, 0, v9, vcc
1434; VI-NEXT:    v_mov_b32_e32 v12, s6
1435; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1436; VI-NEXT:    s_cmp_eq_u32 s7, 15
1437; VI-NEXT:    s_waitcnt vmcnt(1)
1438; VI-NEXT:    v_cndmask_b32_e32 v13, v3, v12, vcc
1439; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1440; VI-NEXT:    s_cmp_eq_u32 s7, 12
1441; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
1442; VI-NEXT:    s_cmp_eq_u32 s7, 13
1443; VI-NEXT:    v_lshrrev_b32_e32 v14, 16, v2
1444; VI-NEXT:    v_cndmask_b32_e64 v2, v2, v12, s[0:1]
1445; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
1446; VI-NEXT:    s_cmp_eq_u32 s7, 10
1447; VI-NEXT:    s_cselect_b64 s[2:3], -1, 0
1448; VI-NEXT:    s_cmp_eq_u32 s7, 11
1449; VI-NEXT:    v_lshrrev_b32_e32 v15, 16, v1
1450; VI-NEXT:    v_cndmask_b32_e64 v1, v1, v12, s[2:3]
1451; VI-NEXT:    s_cselect_b64 s[2:3], -1, 0
1452; VI-NEXT:    s_cmp_eq_u32 s7, 8
1453; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
1454; VI-NEXT:    s_cselect_b64 s[4:5], -1, 0
1455; VI-NEXT:    v_cndmask_b32_e64 v15, v15, v12, s[2:3]
1456; VI-NEXT:    s_cmp_eq_u32 s7, 9
1457; VI-NEXT:    v_lshrrev_b32_e32 v16, 16, v0
1458; VI-NEXT:    v_cndmask_b32_e32 v3, v3, v12, vcc
1459; VI-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
1460; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1461; VI-NEXT:    s_cmp_eq_u32 s7, 6
1462; VI-NEXT:    v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1463; VI-NEXT:    v_cndmask_b32_e32 v15, v16, v12, vcc
1464; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1465; VI-NEXT:    s_cmp_eq_u32 s7, 7
1466; VI-NEXT:    v_cndmask_b32_e64 v0, v0, v12, s[4:5]
1467; VI-NEXT:    s_waitcnt vmcnt(0)
1468; VI-NEXT:    v_lshrrev_b32_e32 v17, 16, v7
1469; VI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
1470; VI-NEXT:    v_cndmask_b32_e64 v14, v14, v12, s[0:1]
1471; VI-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
1472; VI-NEXT:    v_cndmask_b32_e32 v7, v7, v12, vcc
1473; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1474; VI-NEXT:    s_cmp_eq_u32 s7, 4
1475; VI-NEXT:    v_or_b32_sdwa v3, v13, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1476; VI-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
1477; VI-NEXT:    v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1478; VI-NEXT:    v_cndmask_b32_e32 v15, v17, v12, vcc
1479; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1480; VI-NEXT:    s_cmp_eq_u32 s7, 5
1481; VI-NEXT:    v_or_b32_sdwa v2, v2, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1482; VI-NEXT:    v_lshrrev_b32_e32 v13, 16, v6
1483; VI-NEXT:    v_cndmask_b32_e32 v6, v6, v12, vcc
1484; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1485; VI-NEXT:    s_cmp_eq_u32 s7, 2
1486; VI-NEXT:    v_cndmask_b32_e32 v13, v13, v12, vcc
1487; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1488; VI-NEXT:    s_cmp_eq_u32 s7, 3
1489; VI-NEXT:    v_lshrrev_b32_e32 v14, 16, v5
1490; VI-NEXT:    v_cndmask_b32_e32 v5, v5, v12, vcc
1491; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1492; VI-NEXT:    s_cmp_eq_u32 s7, 0
1493; VI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
1494; VI-NEXT:    v_cndmask_b32_e32 v14, v14, v12, vcc
1495; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1496; VI-NEXT:    s_cmp_eq_u32 s7, 1
1497; VI-NEXT:    v_or_b32_sdwa v6, v6, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1498; VI-NEXT:    v_lshrrev_b32_e32 v13, 16, v4
1499; VI-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc
1500; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1501; VI-NEXT:    v_cndmask_b32_e32 v12, v13, v12, vcc
1502; VI-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
1503; VI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
1504; VI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
1505; VI-NEXT:    v_or_b32_sdwa v7, v7, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1506; VI-NEXT:    v_or_b32_sdwa v5, v5, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1507; VI-NEXT:    v_or_b32_sdwa v4, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1508; VI-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
1509; VI-NEXT:    flat_store_dwordx4 v[10:11], v[0:3]
1510; VI-NEXT:    s_endpgm
1511;
1512; GFX900-LABEL: v_insertelement_v16bf16_dynamic:
1513; GFX900:       ; %bb.0:
1514; GFX900-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1515; GFX900-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
1516; GFX900-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
1517; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
1518; GFX900-NEXT:    global_load_dwordx4 v[1:4], v0, s[2:3]
1519; GFX900-NEXT:    global_load_dwordx4 v[5:8], v0, s[2:3] offset:16
1520; GFX900-NEXT:    s_cmp_eq_u32 s5, 6
1521; GFX900-NEXT:    v_mov_b32_e32 v9, s4
1522; GFX900-NEXT:    s_cselect_b64 vcc, -1, 0
1523; GFX900-NEXT:    s_cmp_eq_u32 s5, 7
1524; GFX900-NEXT:    s_mov_b32 s2, 0x5040100
1525; GFX900-NEXT:    s_waitcnt vmcnt(1)
1526; GFX900-NEXT:    v_cndmask_b32_e32 v10, v4, v9, vcc
1527; GFX900-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
1528; GFX900-NEXT:    s_cselect_b64 vcc, -1, 0
1529; GFX900-NEXT:    s_cmp_eq_u32 s5, 4
1530; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v9, vcc
1531; GFX900-NEXT:    s_cselect_b64 vcc, -1, 0
1532; GFX900-NEXT:    s_cmp_eq_u32 s5, 5
1533; GFX900-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
1534; GFX900-NEXT:    v_cndmask_b32_e32 v3, v3, v9, vcc
1535; GFX900-NEXT:    s_cselect_b64 vcc, -1, 0
1536; GFX900-NEXT:    s_cmp_eq_u32 s5, 2
1537; GFX900-NEXT:    v_perm_b32 v4, v4, v10, s2
1538; GFX900-NEXT:    v_cndmask_b32_e32 v10, v11, v9, vcc
1539; GFX900-NEXT:    s_cselect_b64 vcc, -1, 0
1540; GFX900-NEXT:    s_cmp_eq_u32 s5, 3
1541; GFX900-NEXT:    v_lshrrev_b32_e32 v12, 16, v2
1542; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v9, vcc
1543; GFX900-NEXT:    s_cselect_b64 vcc, -1, 0
1544; GFX900-NEXT:    s_cmp_eq_u32 s5, 0
1545; GFX900-NEXT:    v_cndmask_b32_e32 v11, v12, v9, vcc
1546; GFX900-NEXT:    s_cselect_b64 vcc, -1, 0
1547; GFX900-NEXT:    s_cmp_eq_u32 s5, 1
1548; GFX900-NEXT:    v_lshrrev_b32_e32 v13, 16, v1
1549; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
1550; GFX900-NEXT:    s_cselect_b64 vcc, -1, 0
1551; GFX900-NEXT:    s_cmp_eq_u32 s5, 14
1552; GFX900-NEXT:    v_cndmask_b32_e32 v12, v13, v9, vcc
1553; GFX900-NEXT:    s_cselect_b64 vcc, -1, 0
1554; GFX900-NEXT:    s_cmp_eq_u32 s5, 15
1555; GFX900-NEXT:    s_waitcnt vmcnt(0)
1556; GFX900-NEXT:    v_lshrrev_b32_e32 v14, 16, v8
1557; GFX900-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
1558; GFX900-NEXT:    s_cselect_b64 vcc, -1, 0
1559; GFX900-NEXT:    s_cmp_eq_u32 s5, 12
1560; GFX900-NEXT:    v_perm_b32 v1, v12, v1, s2
1561; GFX900-NEXT:    v_cndmask_b32_e32 v12, v14, v9, vcc
1562; GFX900-NEXT:    s_cselect_b64 vcc, -1, 0
1563; GFX900-NEXT:    s_cmp_eq_u32 s5, 13
1564; GFX900-NEXT:    v_lshrrev_b32_e32 v15, 16, v7
1565; GFX900-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
1566; GFX900-NEXT:    s_cselect_b64 vcc, -1, 0
1567; GFX900-NEXT:    s_cmp_eq_u32 s5, 10
1568; GFX900-NEXT:    v_perm_b32 v8, v12, v8, s2
1569; GFX900-NEXT:    v_cndmask_b32_e32 v12, v15, v9, vcc
1570; GFX900-NEXT:    s_cselect_b64 vcc, -1, 0
1571; GFX900-NEXT:    s_cmp_eq_u32 s5, 11
1572; GFX900-NEXT:    v_perm_b32 v3, v10, v3, s2
1573; GFX900-NEXT:    v_lshrrev_b32_e32 v10, 16, v6
1574; GFX900-NEXT:    v_cndmask_b32_e32 v6, v6, v9, vcc
1575; GFX900-NEXT:    s_cselect_b64 vcc, -1, 0
1576; GFX900-NEXT:    s_cmp_eq_u32 s5, 8
1577; GFX900-NEXT:    v_cndmask_b32_e32 v10, v10, v9, vcc
1578; GFX900-NEXT:    s_cselect_b64 vcc, -1, 0
1579; GFX900-NEXT:    s_cmp_eq_u32 s5, 9
1580; GFX900-NEXT:    v_perm_b32 v2, v11, v2, s2
1581; GFX900-NEXT:    v_lshrrev_b32_e32 v11, 16, v5
1582; GFX900-NEXT:    v_cndmask_b32_e32 v5, v5, v9, vcc
1583; GFX900-NEXT:    s_cselect_b64 vcc, -1, 0
1584; GFX900-NEXT:    v_cndmask_b32_e32 v9, v11, v9, vcc
1585; GFX900-NEXT:    v_perm_b32 v7, v12, v7, s2
1586; GFX900-NEXT:    v_perm_b32 v6, v10, v6, s2
1587; GFX900-NEXT:    v_perm_b32 v5, v9, v5, s2
1588; GFX900-NEXT:    global_store_dwordx4 v0, v[5:8], s[0:1] offset:16
1589; GFX900-NEXT:    global_store_dwordx4 v0, v[1:4], s[0:1]
1590; GFX900-NEXT:    s_endpgm
1591;
1592; GFX940-LABEL: v_insertelement_v16bf16_dynamic:
1593; GFX940:       ; %bb.0:
1594; GFX940-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1595; GFX940-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
1596; GFX940-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1597; GFX940-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
1598; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
1599; GFX940-NEXT:    global_load_dwordx4 v[0:3], v8, s[2:3]
1600; GFX940-NEXT:    global_load_dwordx4 v[4:7], v8, s[2:3] offset:16
1601; GFX940-NEXT:    s_cmp_eq_u32 s7, 6
1602; GFX940-NEXT:    v_mov_b32_e32 v9, s6
1603; GFX940-NEXT:    s_cselect_b64 vcc, -1, 0
1604; GFX940-NEXT:    s_cmp_eq_u32 s7, 7
1605; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
1606; GFX940-NEXT:    s_waitcnt vmcnt(1)
1607; GFX940-NEXT:    v_cndmask_b32_e32 v10, v3, v9, vcc
1608; GFX940-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
1609; GFX940-NEXT:    s_cselect_b64 vcc, -1, 0
1610; GFX940-NEXT:    s_cmp_eq_u32 s7, 4
1611; GFX940-NEXT:    v_cndmask_b32_e32 v3, v3, v9, vcc
1612; GFX940-NEXT:    s_cselect_b64 vcc, -1, 0
1613; GFX940-NEXT:    s_cmp_eq_u32 s7, 5
1614; GFX940-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
1615; GFX940-NEXT:    v_cndmask_b32_e32 v2, v2, v9, vcc
1616; GFX940-NEXT:    s_cselect_b64 vcc, -1, 0
1617; GFX940-NEXT:    s_cmp_eq_u32 s7, 2
1618; GFX940-NEXT:    v_perm_b32 v3, v3, v10, s2
1619; GFX940-NEXT:    v_cndmask_b32_e32 v10, v11, v9, vcc
1620; GFX940-NEXT:    s_cselect_b64 vcc, -1, 0
1621; GFX940-NEXT:    s_cmp_eq_u32 s7, 3
1622; GFX940-NEXT:    v_lshrrev_b32_e32 v12, 16, v1
1623; GFX940-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
1624; GFX940-NEXT:    s_cselect_b64 vcc, -1, 0
1625; GFX940-NEXT:    s_cmp_eq_u32 s7, 0
1626; GFX940-NEXT:    v_perm_b32 v2, v10, v2, s2
1627; GFX940-NEXT:    v_cndmask_b32_e32 v10, v12, v9, vcc
1628; GFX940-NEXT:    s_cselect_b64 vcc, -1, 0
1629; GFX940-NEXT:    s_cmp_eq_u32 s7, 1
1630; GFX940-NEXT:    v_lshrrev_b32_e32 v13, 16, v0
1631; GFX940-NEXT:    v_cndmask_b32_e32 v0, v0, v9, vcc
1632; GFX940-NEXT:    s_cselect_b64 vcc, -1, 0
1633; GFX940-NEXT:    s_cmp_eq_u32 s7, 14
1634; GFX940-NEXT:    v_perm_b32 v1, v10, v1, s2
1635; GFX940-NEXT:    v_cndmask_b32_e32 v10, v13, v9, vcc
1636; GFX940-NEXT:    s_cselect_b64 vcc, -1, 0
1637; GFX940-NEXT:    s_cmp_eq_u32 s7, 15
1638; GFX940-NEXT:    s_waitcnt vmcnt(0)
1639; GFX940-NEXT:    v_lshrrev_b32_e32 v14, 16, v7
1640; GFX940-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
1641; GFX940-NEXT:    s_cselect_b64 vcc, -1, 0
1642; GFX940-NEXT:    s_cmp_eq_u32 s7, 12
1643; GFX940-NEXT:    v_perm_b32 v0, v10, v0, s2
1644; GFX940-NEXT:    v_cndmask_b32_e32 v10, v14, v9, vcc
1645; GFX940-NEXT:    s_cselect_b64 vcc, -1, 0
1646; GFX940-NEXT:    s_cmp_eq_u32 s7, 13
1647; GFX940-NEXT:    v_lshrrev_b32_e32 v15, 16, v6
1648; GFX940-NEXT:    v_cndmask_b32_e32 v6, v6, v9, vcc
1649; GFX940-NEXT:    s_cselect_b64 vcc, -1, 0
1650; GFX940-NEXT:    s_cmp_eq_u32 s7, 10
1651; GFX940-NEXT:    v_perm_b32 v7, v10, v7, s2
1652; GFX940-NEXT:    v_cndmask_b32_e32 v10, v15, v9, vcc
1653; GFX940-NEXT:    s_cselect_b64 vcc, -1, 0
1654; GFX940-NEXT:    s_cmp_eq_u32 s7, 11
1655; GFX940-NEXT:    v_lshrrev_b32_e32 v16, 16, v5
1656; GFX940-NEXT:    v_cndmask_b32_e32 v5, v5, v9, vcc
1657; GFX940-NEXT:    s_cselect_b64 vcc, -1, 0
1658; GFX940-NEXT:    s_cmp_eq_u32 s7, 8
1659; GFX940-NEXT:    v_perm_b32 v6, v10, v6, s2
1660; GFX940-NEXT:    v_cndmask_b32_e32 v10, v16, v9, vcc
1661; GFX940-NEXT:    s_cselect_b64 vcc, -1, 0
1662; GFX940-NEXT:    s_cmp_eq_u32 s7, 9
1663; GFX940-NEXT:    v_lshrrev_b32_e32 v17, 16, v4
1664; GFX940-NEXT:    v_cndmask_b32_e32 v4, v4, v9, vcc
1665; GFX940-NEXT:    s_cselect_b64 vcc, -1, 0
1666; GFX940-NEXT:    v_cndmask_b32_e32 v9, v17, v9, vcc
1667; GFX940-NEXT:    v_perm_b32 v5, v10, v5, s2
1668; GFX940-NEXT:    v_perm_b32 v4, v9, v4, s2
1669; GFX940-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1
1670; GFX940-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
1671; GFX940-NEXT:    s_endpgm
1672  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1673  %tid.ext = sext i32 %tid to i64
1674  %in.gep = getelementptr inbounds <16 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext
1675  %out.gep = getelementptr inbounds <16 x bfloat>, ptr addrspace(1) %out, i64 %tid.ext
1676  %vec = load <16 x bfloat>, ptr addrspace(1) %in.gep
1677  %val.trunc = trunc i32 %val to i16
1678  %val.cvt = bitcast i16 %val.trunc to bfloat
1679  %vecins = insertelement <16 x bfloat> %vec, bfloat %val.cvt, i32 %n
1680  store <16 x bfloat> %vecins, ptr addrspace(1) %out.gep
1681  ret void
1682}
1683
1684declare i32 @llvm.amdgcn.workitem.id.x() #1 ; intrinsic called by the test body above to produce %tid, which is sign-extended and used to index the %in and %out pointers
1685
1686attributes #0 = { nounwind } ; attribute group referenced as #0 on function definitions in this file (e.g. @s_insertelement_v2bf16_0)
1687attributes #1 = { nounwind readnone } ; attribute group referenced as #1 on the workitem.id.x declaration and on its call site
1688