xref: /llvm-project/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
3; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
4; RUN: llc -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck -check-prefix=GFX906 %s
5; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
6
; Kernel test: builds <2 x i16> with element 0 = constant 0 and element 1 = %a,
; bitcasts the vector to i32 and stores it to %out.
; On every RUN line the assertions expect the packing to be a single scalar
; shift (s_lshl_b32 by 16) whose result is then copied to a VGPR for the store.
7define amdgpu_kernel void @uniform_vec_0_i16(ptr addrspace(1) %out, i16 %a) {
8; GCN-LABEL: uniform_vec_0_i16:
9; GCN:       ; %bb.0:
10; GCN-NEXT:    s_load_dword s2, s[4:5], 0xb
11; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
12; GCN-NEXT:    s_mov_b32 s3, 0xf000
13; GCN-NEXT:    s_waitcnt lgkmcnt(0)
14; GCN-NEXT:    s_lshl_b32 s4, s2, 16
15; GCN-NEXT:    s_mov_b32 s2, -1
16; GCN-NEXT:    v_mov_b32_e32 v0, s4
17; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
18; GCN-NEXT:    s_endpgm
19;
20; GFX9-LABEL: uniform_vec_0_i16:
21; GFX9:       ; %bb.0:
22; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x2c
23; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
24; GFX9-NEXT:    v_mov_b32_e32 v0, 0
25; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
26; GFX9-NEXT:    s_lshl_b32 s2, s2, 16
27; GFX9-NEXT:    v_mov_b32_e32 v1, s2
28; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
29; GFX9-NEXT:    s_endpgm
30;
31; GFX906-LABEL: uniform_vec_0_i16:
32; GFX906:       ; %bb.0:
33; GFX906-NEXT:    s_load_dword s2, s[4:5], 0x2c
34; GFX906-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
35; GFX906-NEXT:    v_mov_b32_e32 v0, 0
36; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
37; GFX906-NEXT:    s_lshl_b32 s2, s2, 16
38; GFX906-NEXT:    v_mov_b32_e32 v1, s2
39; GFX906-NEXT:    global_store_dword v0, v1, s[0:1]
40; GFX906-NEXT:    s_endpgm
41;
42; GFX11-LABEL: uniform_vec_0_i16:
43; GFX11:       ; %bb.0:
44; GFX11-NEXT:    s_clause 0x1
45; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
46; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
47; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
48; GFX11-NEXT:    s_lshl_b32 s2, s2, 16
49; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
50; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
51; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
52; GFX11-NEXT:    s_endpgm
53  %tmp = insertelement <2 x i16> undef, i16 0, i32 0
54  %vec = insertelement <2 x i16> %tmp, i16 %a, i32 1
55  %val = bitcast <2 x i16> %vec to i32
56  store i32 %val, ptr addrspace(1) %out, align 4
57  ret void
58}
59
; Non-kernel counterpart of the {0, %a} build: element 0 = constant 0,
; element 1 = %a, and the vector is returned as an i32.
; On every RUN line the assertions expect a single vector-ALU shift
; (v_lshlrev_b32 by 16) applied directly to the incoming argument register.
60define i32 @divergent_vec_0_i16(i16 %a) {
61; GCN-LABEL: divergent_vec_0_i16:
62; GCN:       ; %bb.0:
63; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
64; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
65; GCN-NEXT:    s_setpc_b64 s[30:31]
66;
67; GFX9-LABEL: divergent_vec_0_i16:
68; GFX9:       ; %bb.0:
69; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
70; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
71; GFX9-NEXT:    s_setpc_b64 s[30:31]
72;
73; GFX906-LABEL: divergent_vec_0_i16:
74; GFX906:       ; %bb.0:
75; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
76; GFX906-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
77; GFX906-NEXT:    s_setpc_b64 s[30:31]
78;
79; GFX11-LABEL: divergent_vec_0_i16:
80; GFX11:       ; %bb.0:
81; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
82; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
83; GFX11-NEXT:    s_setpc_b64 s[30:31]
84  %tmp = insertelement <2 x i16> undef, i16 0, i32 0
85  %vec = insertelement <2 x i16> %tmp, i16 %a, i32 1
86  %val = bitcast <2 x i16> %vec to i32
87  ret i32 %val
88}
89
; Kernel test: builds <2 x i16> with element 0 = %a and element 1 = constant 0,
; bitcasts the vector to i32 and stores it to %out.
; On every RUN line the assertions expect the packing to be a single scalar
; mask (s_and_b32 with 0xffff); only the operand order of the mask differs
; between the run without -mcpu and the runs with -mcpu.
90define amdgpu_kernel void @uniform_vec_i16_0(ptr addrspace(1) %out, i16 %a) {
91; GCN-LABEL: uniform_vec_i16_0:
92; GCN:       ; %bb.0:
93; GCN-NEXT:    s_load_dword s2, s[4:5], 0xb
94; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
95; GCN-NEXT:    s_mov_b32 s3, 0xf000
96; GCN-NEXT:    s_waitcnt lgkmcnt(0)
97; GCN-NEXT:    s_and_b32 s4, s2, 0xffff
98; GCN-NEXT:    s_mov_b32 s2, -1
99; GCN-NEXT:    v_mov_b32_e32 v0, s4
100; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
101; GCN-NEXT:    s_endpgm
102;
103; GFX9-LABEL: uniform_vec_i16_0:
104; GFX9:       ; %bb.0:
105; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x2c
106; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
107; GFX9-NEXT:    v_mov_b32_e32 v0, 0
108; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
109; GFX9-NEXT:    s_and_b32 s2, 0xffff, s2
110; GFX9-NEXT:    v_mov_b32_e32 v1, s2
111; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
112; GFX9-NEXT:    s_endpgm
113;
114; GFX906-LABEL: uniform_vec_i16_0:
115; GFX906:       ; %bb.0:
116; GFX906-NEXT:    s_load_dword s2, s[4:5], 0x2c
117; GFX906-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
118; GFX906-NEXT:    v_mov_b32_e32 v0, 0
119; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
120; GFX906-NEXT:    s_and_b32 s2, 0xffff, s2
121; GFX906-NEXT:    v_mov_b32_e32 v1, s2
122; GFX906-NEXT:    global_store_dword v0, v1, s[0:1]
123; GFX906-NEXT:    s_endpgm
124;
125; GFX11-LABEL: uniform_vec_i16_0:
126; GFX11:       ; %bb.0:
127; GFX11-NEXT:    s_clause 0x1
128; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
129; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
130; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
131; GFX11-NEXT:    s_and_b32 s2, 0xffff, s2
132; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
133; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
134; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
135; GFX11-NEXT:    s_endpgm
136  %tmp = insertelement <2 x i16> undef, i16 %a, i32 0
137  %vec = insertelement <2 x i16> %tmp, i16 0, i32 1
138  %val = bitcast <2 x i16> %vec to i32
139  store i32 %val, ptr addrspace(1) %out, align 4
140  ret void
141}
142
; Non-kernel counterpart of the {%a, 0} build: element 0 = %a,
; element 1 = constant 0, and the vector is returned as an i32.
; On every RUN line the assertions expect a single vector-ALU mask
; (v_and_b32 with 0xffff) applied to the incoming argument register.
143define i32 @divergent_vec_i16_0(i16 %a) {
144; GCN-LABEL: divergent_vec_i16_0:
145; GCN:       ; %bb.0:
146; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
147; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
148; GCN-NEXT:    s_setpc_b64 s[30:31]
149;
150; GFX9-LABEL: divergent_vec_i16_0:
151; GFX9:       ; %bb.0:
152; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
153; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
154; GFX9-NEXT:    s_setpc_b64 s[30:31]
155;
156; GFX906-LABEL: divergent_vec_i16_0:
157; GFX906:       ; %bb.0:
158; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
159; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
160; GFX906-NEXT:    s_setpc_b64 s[30:31]
161;
162; GFX11-LABEL: divergent_vec_i16_0:
163; GFX11:       ; %bb.0:
164; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
165; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
166; GFX11-NEXT:    s_setpc_b64 s[30:31]
167  %tmp = insertelement <2 x i16> undef, i16 %a, i32 0
168  %vec = insertelement <2 x i16> %tmp, i16 0, i32 1
169  %val = bitcast <2 x i16> %vec to i32
170  ret i32 %val
171}
172
; Kernel test, half-precision variant of the {%a, 0} build: element 0 = %a,
; element 1 = 0.0; the <2 x half> is bitcast to float and stored to %out.
; On every RUN line the assertions expect the same scalar mask
; (s_and_b32 with 0xffff) as the i16 variant of this kernel.
173define amdgpu_kernel void @uniform_vec_f16_0(ptr addrspace(1) %out, half %a) {
174; GCN-LABEL: uniform_vec_f16_0:
175; GCN:       ; %bb.0:
176; GCN-NEXT:    s_load_dword s2, s[4:5], 0xb
177; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
178; GCN-NEXT:    s_mov_b32 s3, 0xf000
179; GCN-NEXT:    s_waitcnt lgkmcnt(0)
180; GCN-NEXT:    s_and_b32 s4, s2, 0xffff
181; GCN-NEXT:    s_mov_b32 s2, -1
182; GCN-NEXT:    v_mov_b32_e32 v0, s4
183; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
184; GCN-NEXT:    s_endpgm
185;
186; GFX9-LABEL: uniform_vec_f16_0:
187; GFX9:       ; %bb.0:
188; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x2c
189; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
190; GFX9-NEXT:    v_mov_b32_e32 v0, 0
191; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
192; GFX9-NEXT:    s_and_b32 s2, 0xffff, s2
193; GFX9-NEXT:    v_mov_b32_e32 v1, s2
194; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
195; GFX9-NEXT:    s_endpgm
196;
197; GFX906-LABEL: uniform_vec_f16_0:
198; GFX906:       ; %bb.0:
199; GFX906-NEXT:    s_load_dword s2, s[4:5], 0x2c
200; GFX906-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
201; GFX906-NEXT:    v_mov_b32_e32 v0, 0
202; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
203; GFX906-NEXT:    s_and_b32 s2, 0xffff, s2
204; GFX906-NEXT:    v_mov_b32_e32 v1, s2
205; GFX906-NEXT:    global_store_dword v0, v1, s[0:1]
206; GFX906-NEXT:    s_endpgm
207;
208; GFX11-LABEL: uniform_vec_f16_0:
209; GFX11:       ; %bb.0:
210; GFX11-NEXT:    s_clause 0x1
211; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
212; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
213; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
214; GFX11-NEXT:    s_and_b32 s2, 0xffff, s2
215; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
216; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
217; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
218; GFX11-NEXT:    s_endpgm
219  %tmp = insertelement <2 x half> undef, half %a, i32 0
220  %vec = insertelement <2 x half> %tmp, half 0.0, i32 1
221  %val = bitcast <2 x half> %vec to float
222  store float %val, ptr addrspace(1) %out, align 4
223  ret void
224}
225
; Non-kernel, half-precision variant of the {%a, 0} build: element 0 = %a,
; element 1 = 0.0; the <2 x half> is returned bitcast to float.
; The gfx900, gfx906 and gfx1100 runs expect a single v_and_b32 with 0xffff.
; The run without -mcpu instead expects only a v_cvt_f16_f32 on the argument
; register (NOTE(review): presumably because half arguments arrive as f32 on
; that target - confirm against the calling-convention lowering).
226define float @divergent_vec_f16_0(half %a) {
227; GCN-LABEL: divergent_vec_f16_0:
228; GCN:       ; %bb.0:
229; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
230; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
231; GCN-NEXT:    s_setpc_b64 s[30:31]
232;
233; GFX9-LABEL: divergent_vec_f16_0:
234; GFX9:       ; %bb.0:
235; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
236; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
237; GFX9-NEXT:    s_setpc_b64 s[30:31]
238;
239; GFX906-LABEL: divergent_vec_f16_0:
240; GFX906:       ; %bb.0:
241; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
242; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
243; GFX906-NEXT:    s_setpc_b64 s[30:31]
244;
245; GFX11-LABEL: divergent_vec_f16_0:
246; GFX11:       ; %bb.0:
247; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
248; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
249; GFX11-NEXT:    s_setpc_b64 s[30:31]
250  %tmp = insertelement <2 x half> undef, half %a, i32 0
251  %vec = insertelement <2 x half> %tmp, half 0.0, i32 1
252  %val = bitcast <2 x half> %vec to float
253  ret float %val
254}
255
; Kernel test: packs the low 16 bits of two values into one <2 x i16>.
; Both inputs are volatile i32 loads through addrspace(4) pointers, each
; truncated to i16; the packed vector is bitcast to i32 and handed to an
; inline-asm use with an "s" constraint.
; The gfx900, gfx906 and gfx1100 runs expect a single s_pack_ll_b32_b16.
; The run without -mcpu expects the scalar sequence s_and_b32 (0xffff),
; s_lshl_b32 (16), s_or_b32 instead.
; Attribute group #0 on the asm call is defined outside this excerpt.
256define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspace(4) %in1) {
257; GCN-LABEL: uniform_vec_i16_LL:
258; GCN:       ; %bb.0:
259; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
260; GCN-NEXT:    s_waitcnt lgkmcnt(0)
261; GCN-NEXT:    s_load_dword s0, s[0:1], 0x0
262; GCN-NEXT:    s_load_dword s1, s[2:3], 0x0
263; GCN-NEXT:    s_waitcnt lgkmcnt(0)
264; GCN-NEXT:    s_and_b32 s0, s0, 0xffff
265; GCN-NEXT:    s_lshl_b32 s1, s1, 16
266; GCN-NEXT:    s_or_b32 s0, s0, s1
267; GCN-NEXT:    ;;#ASMSTART
268; GCN-NEXT:    ; use s0
269; GCN-NEXT:    ;;#ASMEND
270; GCN-NEXT:    s_endpgm
271;
272; GFX9-LABEL: uniform_vec_i16_LL:
273; GFX9:       ; %bb.0:
274; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
275; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
276; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0
277; GFX9-NEXT:    s_load_dword s5, s[2:3], 0x0
278; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
279; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s4, s5
280; GFX9-NEXT:    ;;#ASMSTART
281; GFX9-NEXT:    ; use s0
282; GFX9-NEXT:    ;;#ASMEND
283; GFX9-NEXT:    s_endpgm
284;
285; GFX906-LABEL: uniform_vec_i16_LL:
286; GFX906:       ; %bb.0:
287; GFX906-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
288; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
289; GFX906-NEXT:    s_load_dword s4, s[0:1], 0x0
290; GFX906-NEXT:    s_load_dword s5, s[2:3], 0x0
291; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
292; GFX906-NEXT:    s_pack_ll_b32_b16 s0, s4, s5
293; GFX906-NEXT:    ;;#ASMSTART
294; GFX906-NEXT:    ; use s0
295; GFX906-NEXT:    ;;#ASMEND
296; GFX906-NEXT:    s_endpgm
297;
298; GFX11-LABEL: uniform_vec_i16_LL:
299; GFX11:       ; %bb.0:
300; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
301; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
302; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0
303; GFX11-NEXT:    s_load_b32 s1, s[2:3], 0x0
304; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
305; GFX11-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
306; GFX11-NEXT:    ;;#ASMSTART
307; GFX11-NEXT:    ; use s0
308; GFX11-NEXT:    ;;#ASMEND
309; GFX11-NEXT:    s_endpgm
310  %val0 = load volatile i32, ptr addrspace(4) %in0
311  %val1 = load volatile i32, ptr addrspace(4) %in1
312  %lo = trunc i32 %val0 to i16
313  %hi = trunc i32 %val1 to i16
314  %vec.0 = insertelement <2 x i16> undef, i16 %lo, i32 0
315  %vec.1 = insertelement <2 x i16> %vec.0, i16 %hi, i32 1
316  %vec.i32 = bitcast <2 x i16> %vec.1 to i32
317  call void asm sideeffect "; use $0", "s"(i32 %vec.i32) #0
318  ret void
319}
320
; Non-kernel test: packs two i16 arguments (%a -> element 0, %b -> element 1)
; into <2 x i16> and returns the vector as an i32.
; The gfx900 and gfx906 runs expect v_perm_b32 with selector 0x5040100
; materialised in an SGPR; the gfx1100 run expects the same v_perm_b32 with
; the selector as a literal operand.
; The run without -mcpu expects v_lshlrev_b32 (16), v_and_b32 (0xffff),
; v_or_b32 instead.
321define i32 @divergent_vec_i16_LL(i16 %a, i16 %b) {
322; GCN-LABEL: divergent_vec_i16_LL:
323; GCN:       ; %bb.0:
324; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
325; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
326; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
327; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
328; GCN-NEXT:    s_setpc_b64 s[30:31]
329;
330; GFX9-LABEL: divergent_vec_i16_LL:
331; GFX9:       ; %bb.0:
332; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
333; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
334; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
335; GFX9-NEXT:    s_setpc_b64 s[30:31]
336;
337; GFX906-LABEL: divergent_vec_i16_LL:
338; GFX906:       ; %bb.0:
339; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
340; GFX906-NEXT:    s_mov_b32 s4, 0x5040100
341; GFX906-NEXT:    v_perm_b32 v0, v1, v0, s4
342; GFX906-NEXT:    s_setpc_b64 s[30:31]
343;
344; GFX11-LABEL: divergent_vec_i16_LL:
345; GFX11:       ; %bb.0:
346; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
347; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
348; GFX11-NEXT:    s_setpc_b64 s[30:31]
349  %tmp = insertelement <2 x i16> undef, i16 %a, i32 0
350  %vec = insertelement <2 x i16> %tmp, i16 %b, i32 1
351  %val = bitcast <2 x i16> %vec to i32
352  ret i32 %val
353}
354
; Kernel test: element 0 = %a (an i16), element 1 = the upper 16 bits of the
; i32 %b (lshr by 16, then trunc). The vector is bitcast to i32 and stored
; to %out.
; The gfx900, gfx906 and gfx1100 runs expect a single s_pack_lh_b32_b16.
; The run without -mcpu expects s_and_b32 (0xffff0000), s_and_b32 (0xffff),
; s_or_b32 instead.
355define amdgpu_kernel void @uniform_vec_i16_LH(ptr addrspace(1) %out, i16 %a, i32 %b) {
356; GCN-LABEL: uniform_vec_i16_LH:
357; GCN:       ; %bb.0:
358; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
359; GCN-NEXT:    s_mov_b32 s7, 0xf000
360; GCN-NEXT:    s_waitcnt lgkmcnt(0)
361; GCN-NEXT:    s_and_b32 s3, s3, 0xffff0000
362; GCN-NEXT:    s_and_b32 s2, s2, 0xffff
363; GCN-NEXT:    s_or_b32 s2, s2, s3
364; GCN-NEXT:    s_mov_b32 s6, -1
365; GCN-NEXT:    s_mov_b32 s4, s0
366; GCN-NEXT:    s_mov_b32 s5, s1
367; GCN-NEXT:    v_mov_b32_e32 v0, s2
368; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
369; GCN-NEXT:    s_endpgm
370;
371; GFX9-LABEL: uniform_vec_i16_LH:
372; GFX9:       ; %bb.0:
373; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
374; GFX9-NEXT:    v_mov_b32_e32 v0, 0
375; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
376; GFX9-NEXT:    s_pack_lh_b32_b16 s2, s2, s3
377; GFX9-NEXT:    v_mov_b32_e32 v1, s2
378; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
379; GFX9-NEXT:    s_endpgm
380;
381; GFX906-LABEL: uniform_vec_i16_LH:
382; GFX906:       ; %bb.0:
383; GFX906-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
384; GFX906-NEXT:    v_mov_b32_e32 v0, 0
385; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
386; GFX906-NEXT:    s_pack_lh_b32_b16 s2, s2, s3
387; GFX906-NEXT:    v_mov_b32_e32 v1, s2
388; GFX906-NEXT:    global_store_dword v0, v1, s[0:1]
389; GFX906-NEXT:    s_endpgm
390;
391; GFX11-LABEL: uniform_vec_i16_LH:
392; GFX11:       ; %bb.0:
393; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
394; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
395; GFX11-NEXT:    s_pack_lh_b32_b16 s2, s2, s3
396; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
397; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
398; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
399; GFX11-NEXT:    s_endpgm
400  %shift = lshr i32 %b, 16
401  %tr = trunc i32 %shift to i16
402  %tmp = insertelement <2 x i16> undef, i16 %a, i32 0
403  %vec = insertelement <2 x i16> %tmp, i16 %tr, i32 1
404  %val = bitcast <2 x i16> %vec to i32
405  store i32 %val, ptr addrspace(1) %out, align 4
406  ret void
407}
408
; Non-kernel test: element 0 = %a (an i16), element 1 = the upper 16 bits of
; the i32 %b (lshr by 16, then trunc). The vector is returned as an i32.
; Every RUN line expects a single v_bfi_b32 with mask 0xffff; the mask is
; first moved into an SGPR except on gfx1100, where it is a literal operand.
409define i32 @divergent_vec_i16_LH(i16 %a, i32 %b) {
410; GCN-LABEL: divergent_vec_i16_LH:
411; GCN:       ; %bb.0:
412; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
413; GCN-NEXT:    s_mov_b32 s4, 0xffff
414; GCN-NEXT:    v_bfi_b32 v0, s4, v0, v1
415; GCN-NEXT:    s_setpc_b64 s[30:31]
416;
417; GFX9-LABEL: divergent_vec_i16_LH:
418; GFX9:       ; %bb.0:
419; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
420; GFX9-NEXT:    s_mov_b32 s4, 0xffff
421; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v1
422; GFX9-NEXT:    s_setpc_b64 s[30:31]
423;
424; GFX906-LABEL: divergent_vec_i16_LH:
425; GFX906:       ; %bb.0:
426; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
427; GFX906-NEXT:    s_mov_b32 s4, 0xffff
428; GFX906-NEXT:    v_bfi_b32 v0, s4, v0, v1
429; GFX906-NEXT:    s_setpc_b64 s[30:31]
430;
431; GFX11-LABEL: divergent_vec_i16_LH:
432; GFX11:       ; %bb.0:
433; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
434; GFX11-NEXT:    v_bfi_b32 v0, 0xffff, v0, v1
435; GFX11-NEXT:    s_setpc_b64 s[30:31]
436  %shift = lshr i32 %b, 16
437  %tr = trunc i32 %shift to i16
438  %tmp = insertelement <2 x i16> undef, i16 %a, i32 0
439  %vec = insertelement <2 x i16> %tmp, i16 %tr, i32 1
440  %val = bitcast <2 x i16> %vec to i32
441  ret i32 %val
442}
443
; Kernel test: both vector elements come from the upper 16 bits of an i32
; (lshr by 16, then trunc): element 0 from %a, element 1 from %b. The vector
; is bitcast to i32 and stored to %out.
; The gfx900, gfx906 and gfx1100 runs expect a single s_pack_hh_b32_b16.
; The run without -mcpu expects s_lshr_b32 (16) followed by v_alignbit_b32,
; i.e. that run finishes the pack on the vector ALU.
444define amdgpu_kernel void @uniform_vec_i16_HH(ptr addrspace(1) %out, i32 %a, i32 %b) {
445; GCN-LABEL: uniform_vec_i16_HH:
446; GCN:       ; %bb.0:
447; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
448; GCN-NEXT:    s_mov_b32 s7, 0xf000
449; GCN-NEXT:    s_mov_b32 s6, -1
450; GCN-NEXT:    s_waitcnt lgkmcnt(0)
451; GCN-NEXT:    s_mov_b32 s4, s0
452; GCN-NEXT:    s_mov_b32 s5, s1
453; GCN-NEXT:    s_lshr_b32 s0, s3, 16
454; GCN-NEXT:    v_mov_b32_e32 v0, s2
455; GCN-NEXT:    v_alignbit_b32 v0, s0, v0, 16
456; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
457; GCN-NEXT:    s_endpgm
458;
459; GFX9-LABEL: uniform_vec_i16_HH:
460; GFX9:       ; %bb.0:
461; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
462; GFX9-NEXT:    v_mov_b32_e32 v0, 0
463; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
464; GFX9-NEXT:    s_pack_hh_b32_b16 s2, s2, s3
465; GFX9-NEXT:    v_mov_b32_e32 v1, s2
466; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
467; GFX9-NEXT:    s_endpgm
468;
469; GFX906-LABEL: uniform_vec_i16_HH:
470; GFX906:       ; %bb.0:
471; GFX906-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
472; GFX906-NEXT:    v_mov_b32_e32 v0, 0
473; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
474; GFX906-NEXT:    s_pack_hh_b32_b16 s2, s2, s3
475; GFX906-NEXT:    v_mov_b32_e32 v1, s2
476; GFX906-NEXT:    global_store_dword v0, v1, s[0:1]
477; GFX906-NEXT:    s_endpgm
478;
479; GFX11-LABEL: uniform_vec_i16_HH:
480; GFX11:       ; %bb.0:
481; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
482; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
483; GFX11-NEXT:    s_pack_hh_b32_b16 s2, s2, s3
484; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
485; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
486; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
487; GFX11-NEXT:    s_endpgm
488  %shift_a = lshr i32 %a, 16
489  %tr_a = trunc i32 %shift_a to i16
490  %shift_b = lshr i32 %b, 16
491  %tr_b = trunc i32 %shift_b to i16
492  %tmp = insertelement <2 x i16> undef, i16 %tr_a, i32 0
493  %vec = insertelement <2 x i16> %tmp, i16 %tr_b, i32 1
494  %val = bitcast <2 x i16> %vec to i32
495  store i32 %val, ptr addrspace(1) %out, align 4
496  ret void
497}
498
; Non-kernel test: both vector elements come from the upper 16 bits of an i32
; argument (lshr by 16, then trunc): element 0 from %a, element 1 from %b.
; The vector is returned as an i32.
; The gfx900 and gfx906 runs expect v_perm_b32 with selector 0x7060302
; materialised in an SGPR; the gfx1100 run expects the same v_perm_b32 with
; the selector as a literal operand.
; The run without -mcpu expects v_lshrrev_b32 (16) followed by v_alignbit_b32.
499define i32 @divergent_vec_i16_HH(i32 %a, i32 %b) {
500; GCN-LABEL: divergent_vec_i16_HH:
501; GCN:       ; %bb.0:
502; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
503; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
504; GCN-NEXT:    v_alignbit_b32 v0, v1, v0, 16
505; GCN-NEXT:    s_setpc_b64 s[30:31]
506;
507; GFX9-LABEL: divergent_vec_i16_HH:
508; GFX9:       ; %bb.0:
509; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
510; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
511; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
512; GFX9-NEXT:    s_setpc_b64 s[30:31]
513;
514; GFX906-LABEL: divergent_vec_i16_HH:
515; GFX906:       ; %bb.0:
516; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
517; GFX906-NEXT:    s_mov_b32 s4, 0x7060302
518; GFX906-NEXT:    v_perm_b32 v0, v1, v0, s4
519; GFX906-NEXT:    s_setpc_b64 s[30:31]
520;
521; GFX11-LABEL: divergent_vec_i16_HH:
522; GFX11:       ; %bb.0:
523; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
524; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
525; GFX11-NEXT:    s_setpc_b64 s[30:31]
526  %shift_a = lshr i32 %a, 16
527  %tr_a = trunc i32 %shift_a to i16
528  %shift_b = lshr i32 %b, 16
529  %tr_b = trunc i32 %shift_b to i16
530  %tmp = insertelement <2 x i16> undef, i16 %tr_a, i32 0
531  %vec = insertelement <2 x i16> %tmp, i16 %tr_b, i32 1
532  %val = bitcast <2 x i16> %vec to i32
533  ret i32 %val
534}
535
; Kernel test, half-precision variant of the low/low pack. Two volatile i32
; loads through addrspace(4) pointers are truncated to i16 and bitcast to
; half, then inserted as elements 0 and 1 of a <2 x half>. The vector is
; bitcast to i32 and handed to an inline-asm use with an "s" constraint.
; The gfx900, gfx906 and gfx1100 runs expect a single s_pack_ll_b32_b16.
; The run without -mcpu expects s_and_b32 (0xffff), s_lshl_b32 (16),
; s_or_b32 instead.
; Attribute group #0 on the asm call is defined outside this excerpt.
536define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspace(4) %in1) {
537; GCN-LABEL: uniform_vec_f16_LL:
538; GCN:       ; %bb.0:
539; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
540; GCN-NEXT:    s_waitcnt lgkmcnt(0)
541; GCN-NEXT:    s_load_dword s0, s[0:1], 0x0
542; GCN-NEXT:    s_load_dword s1, s[2:3], 0x0
543; GCN-NEXT:    s_waitcnt lgkmcnt(0)
544; GCN-NEXT:    s_and_b32 s0, s0, 0xffff
545; GCN-NEXT:    s_lshl_b32 s1, s1, 16
546; GCN-NEXT:    s_or_b32 s0, s0, s1
547; GCN-NEXT:    ;;#ASMSTART
548; GCN-NEXT:    ; use s0
549; GCN-NEXT:    ;;#ASMEND
550; GCN-NEXT:    s_endpgm
551;
552; GFX9-LABEL: uniform_vec_f16_LL:
553; GFX9:       ; %bb.0:
554; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
555; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
556; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0
557; GFX9-NEXT:    s_load_dword s5, s[2:3], 0x0
558; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
559; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s4, s5
560; GFX9-NEXT:    ;;#ASMSTART
561; GFX9-NEXT:    ; use s0
562; GFX9-NEXT:    ;;#ASMEND
563; GFX9-NEXT:    s_endpgm
564;
565; GFX906-LABEL: uniform_vec_f16_LL:
566; GFX906:       ; %bb.0:
567; GFX906-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
568; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
569; GFX906-NEXT:    s_load_dword s4, s[0:1], 0x0
570; GFX906-NEXT:    s_load_dword s5, s[2:3], 0x0
571; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
572; GFX906-NEXT:    s_pack_ll_b32_b16 s0, s4, s5
573; GFX906-NEXT:    ;;#ASMSTART
574; GFX906-NEXT:    ; use s0
575; GFX906-NEXT:    ;;#ASMEND
576; GFX906-NEXT:    s_endpgm
577;
578; GFX11-LABEL: uniform_vec_f16_LL:
579; GFX11:       ; %bb.0:
580; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
581; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
582; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0
583; GFX11-NEXT:    s_load_b32 s1, s[2:3], 0x0
584; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
585; GFX11-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
586; GFX11-NEXT:    ;;#ASMSTART
587; GFX11-NEXT:    ; use s0
588; GFX11-NEXT:    ;;#ASMEND
589; GFX11-NEXT:    s_endpgm
590  %val0 = load volatile i32, ptr addrspace(4) %in0
591  %val1 = load volatile i32, ptr addrspace(4) %in1
592  %lo.i = trunc i32 %val0 to i16
593  %hi.i = trunc i32 %val1 to i16
594  %lo = bitcast i16 %lo.i to half
595  %hi = bitcast i16 %hi.i to half
596  %vec.0 = insertelement <2 x half> undef, half %lo, i32 0
597  %vec.1 = insertelement <2 x half> %vec.0, half %hi, i32 1
598  %vec.i32 = bitcast <2 x half> %vec.1 to i32
599
600  call void asm sideeffect "; use $0", "s"(i32 %vec.i32) #0
601  ret void
602}
603
; Non-kernel test, half-precision variant of the low/low pack: %a -> element 0
; and %b -> element 1 of a <2 x half>, returned bitcast to float.
; The gfx900 and gfx906 runs expect v_perm_b32 with selector 0x5040100
; materialised in an SGPR; the gfx1100 run expects the same v_perm_b32 with
; the selector as a literal operand.
; The run without -mcpu expects a v_cvt_f16_f32 on each argument register,
; then v_lshlrev_b32 (16) and v_or_b32.
604define float @divergent_vec_f16_LL(half %a, half %b) {
605; GCN-LABEL: divergent_vec_f16_LL:
606; GCN:       ; %bb.0:
607; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
608; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
609; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
610; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
611; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
612; GCN-NEXT:    s_setpc_b64 s[30:31]
613;
614; GFX9-LABEL: divergent_vec_f16_LL:
615; GFX9:       ; %bb.0:
616; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
617; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
618; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
619; GFX9-NEXT:    s_setpc_b64 s[30:31]
620;
621; GFX906-LABEL: divergent_vec_f16_LL:
622; GFX906:       ; %bb.0:
623; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
624; GFX906-NEXT:    s_mov_b32 s4, 0x5040100
625; GFX906-NEXT:    v_perm_b32 v0, v1, v0, s4
626; GFX906-NEXT:    s_setpc_b64 s[30:31]
627;
628; GFX11-LABEL: divergent_vec_f16_LL:
629; GFX11:       ; %bb.0:
630; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
631; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
632; GFX11-NEXT:    s_setpc_b64 s[30:31]
633  %tmp = insertelement <2 x half> undef, half %a, i32 0
634  %vec = insertelement <2 x half> %tmp, half %b, i32 1
635  %val = bitcast <2 x half> %vec to float
636  ret float %val
637}
638
; Non-kernel test: loads an i16 through the addrspace(3) pointer %in, inserts
; it at index 0 of an undef <2 x i16> (index 1 is never written) and returns
; the vector.
; The assertions expect the build to fold into the load itself: no separate
; pack/mask instruction follows the ds read on any RUN line.
; Expected load opcode per run: ds_read_u16_d16 on gfx900, ds_load_u16_d16 on
; gfx1100, and plain ds_read_u16 on gfx906 and on the run without -mcpu
; (which also sets m0 to -1 first).
; Attribute group #0 is defined outside this excerpt.
639define <2 x i16> @build_vec_v2i16_undeflo_divergent(ptr addrspace(3) %in) #0 {
640; GCN-LABEL: build_vec_v2i16_undeflo_divergent:
641; GCN:       ; %bb.0: ; %entry
642; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
643; GCN-NEXT:    s_mov_b32 m0, -1
644; GCN-NEXT:    ds_read_u16 v0, v0
645; GCN-NEXT:    s_waitcnt lgkmcnt(0)
646; GCN-NEXT:    s_setpc_b64 s[30:31]
647;
648; GFX9-LABEL: build_vec_v2i16_undeflo_divergent:
649; GFX9:       ; %bb.0: ; %entry
650; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
651; GFX9-NEXT:    ds_read_u16_d16 v0, v0
652; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
653; GFX9-NEXT:    s_setpc_b64 s[30:31]
654;
655; GFX906-LABEL: build_vec_v2i16_undeflo_divergent:
656; GFX906:       ; %bb.0: ; %entry
657; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
658; GFX906-NEXT:    ds_read_u16 v0, v0
659; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
660; GFX906-NEXT:    s_setpc_b64 s[30:31]
661;
662; GFX11-LABEL: build_vec_v2i16_undeflo_divergent:
663; GFX11:       ; %bb.0: ; %entry
664; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
665; GFX11-NEXT:    ds_load_u16_d16 v0, v0
666; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
667; GFX11-NEXT:    s_setpc_b64 s[30:31]
668entry:
669  %load = load i16, ptr addrspace(3) %in
670  %build = insertelement <2 x i16> undef, i16 %load, i32 0
671  ret <2 x i16> %build
672}
673
; Kernel test: loads an i16 through the addrspace(3) pointer %in, inserts it
; at index 0 of an undef <2 x i16> (index 1 is never written), bitcasts the
; vector to i32 and stores it to %out.
; As in the non-kernel variant, the assertions expect the loaded register to
; be stored directly, with no pack/mask instruction in between.
; Expected load opcode per run: ds_read_u16_d16 on gfx900, ds_load_u16_d16 on
; gfx1100, and plain ds_read_u16 on gfx906 and on the run without -mcpu
; (which also sets m0 to -1 first).
; Attribute group #0 is defined outside this excerpt.
674define amdgpu_kernel void @build_vec_v2i16_undeflo_uniform(ptr addrspace(3) %in, ptr addrspace(1) %out) #0 {
675; GCN-LABEL: build_vec_v2i16_undeflo_uniform:
676; GCN:       ; %bb.0: ; %entry
677; GCN-NEXT:    s_load_dword s2, s[4:5], 0x9
678; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xb
679; GCN-NEXT:    s_waitcnt lgkmcnt(0)
680; GCN-NEXT:    v_mov_b32_e32 v0, s2
681; GCN-NEXT:    s_mov_b32 m0, -1
682; GCN-NEXT:    ds_read_u16 v0, v0
683; GCN-NEXT:    s_mov_b32 s3, 0xf000
684; GCN-NEXT:    s_mov_b32 s2, -1
685; GCN-NEXT:    s_waitcnt lgkmcnt(0)
686; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
687; GCN-NEXT:    s_endpgm
688;
689; GFX9-LABEL: build_vec_v2i16_undeflo_uniform:
690; GFX9:       ; %bb.0: ; %entry
691; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x24
692; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2c
693; GFX9-NEXT:    v_mov_b32_e32 v1, 0
694; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
695; GFX9-NEXT:    v_mov_b32_e32 v0, s2
696; GFX9-NEXT:    ds_read_u16_d16 v0, v0
697; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
698; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
699; GFX9-NEXT:    s_endpgm
700;
701; GFX906-LABEL: build_vec_v2i16_undeflo_uniform:
702; GFX906:       ; %bb.0: ; %entry
703; GFX906-NEXT:    s_load_dword s2, s[4:5], 0x24
704; GFX906-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2c
705; GFX906-NEXT:    v_mov_b32_e32 v1, 0
706; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
707; GFX906-NEXT:    v_mov_b32_e32 v0, s2
708; GFX906-NEXT:    ds_read_u16 v0, v0
709; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
710; GFX906-NEXT:    global_store_dword v1, v0, s[0:1]
711; GFX906-NEXT:    s_endpgm
712;
713; GFX11-LABEL: build_vec_v2i16_undeflo_uniform:
714; GFX11:       ; %bb.0: ; %entry
715; GFX11-NEXT:    s_clause 0x1
716; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x24
717; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x2c
718; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
719; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
720; GFX11-NEXT:    ds_load_u16_d16 v0, v0
721; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
722; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
723; GFX11-NEXT:    s_endpgm
724entry:
725  %load = load i16, ptr addrspace(3) %in
726  %build = insertelement <2 x i16> undef, i16 %load, i32 0
727  %result = bitcast <2 x i16> %build to i32
728  store i32 %result, ptr addrspace(1) %out
729  ret void
730}
731