xref: /llvm-project/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,SI
3; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,VI
4; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,GFX9
5; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GFX10
6; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GFX11
7
; Scalar operands: both floats passed to llvm.amdgcn.cvt.pkrtz are kernel
; arguments, and the packed <2 x half> result is stored to %out.
; In the SI, VI and GFX9 expectations the second operand is first copied from
; an SGPR into a VGPR before the convert. In the GFX10 and GFX11 expectations
; both SGPRs are used directly as sources of the _e64 form. The GFX11
; expectations spell the mnemonic v_cvt_pk_rtz_f16_f32.
8define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, float %x, float %y) #0 {
9; SI-LABEL: s_cvt_pkrtz_v2f16_f32:
10; SI:       ; %bb.0:
11; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
12; SI-NEXT:    s_mov_b32 s7, 0xf000
13; SI-NEXT:    s_mov_b32 s6, -1
14; SI-NEXT:    s_waitcnt lgkmcnt(0)
15; SI-NEXT:    s_mov_b32 s4, s0
16; SI-NEXT:    s_mov_b32 s5, s1
17; SI-NEXT:    v_mov_b32_e32 v0, s3
18; SI-NEXT:    v_cvt_pkrtz_f16_f32_e32 v0, s2, v0
19; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
20; SI-NEXT:    s_endpgm
21;
22; VI-LABEL: s_cvt_pkrtz_v2f16_f32:
23; VI:       ; %bb.0:
24; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
25; VI-NEXT:    s_waitcnt lgkmcnt(0)
26; VI-NEXT:    v_mov_b32_e32 v0, s3
27; VI-NEXT:    v_cvt_pkrtz_f16_f32 v2, s2, v0
28; VI-NEXT:    v_mov_b32_e32 v0, s0
29; VI-NEXT:    v_mov_b32_e32 v1, s1
30; VI-NEXT:    flat_store_dword v[0:1], v2
31; VI-NEXT:    s_endpgm
32;
33; GFX9-LABEL: s_cvt_pkrtz_v2f16_f32:
34; GFX9:       ; %bb.0:
35; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
36; GFX9-NEXT:    v_mov_b32_e32 v0, 0
37; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
38; GFX9-NEXT:    v_mov_b32_e32 v1, s3
39; GFX9-NEXT:    v_cvt_pkrtz_f16_f32 v1, s2, v1
40; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
41; GFX9-NEXT:    s_endpgm
42;
43; GFX10-LABEL: s_cvt_pkrtz_v2f16_f32:
44; GFX10:       ; %bb.0:
45; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
46; GFX10-NEXT:    v_mov_b32_e32 v0, 0
47; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
48; GFX10-NEXT:    v_cvt_pkrtz_f16_f32_e64 v1, s2, s3
49; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
50; GFX10-NEXT:    s_endpgm
51;
52; GFX11-LABEL: s_cvt_pkrtz_v2f16_f32:
53; GFX11:       ; %bb.0:
54; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
55; GFX11-NEXT:    v_mov_b32_e32 v0, 0
56; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
57; GFX11-NEXT:    v_cvt_pk_rtz_f16_f32_e64 v1, s2, s3
58; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
59; GFX11-NEXT:    s_endpgm
60  %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %y)
61  store <2 x half> %result, ptr addrspace(1) %out
62  ret void
63}
64
; Same scalar value for both operands: %x is passed as both the first and the
; second argument of llvm.amdgcn.cvt.pkrtz.
; Every expected sequence names the single loaded SGPR twice in the convert
; (s6, s6 or s2, s2) and contains no SGPR to VGPR copy for the operands.
65define amdgpu_kernel void @s_cvt_pkrtz_samereg_v2f16_f32(ptr addrspace(1) %out, float %x) #0 {
66; SI-LABEL: s_cvt_pkrtz_samereg_v2f16_f32:
67; SI:       ; %bb.0:
68; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
69; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
70; SI-NEXT:    s_mov_b32 s3, 0xf000
71; SI-NEXT:    s_mov_b32 s2, -1
72; SI-NEXT:    s_waitcnt lgkmcnt(0)
73; SI-NEXT:    v_cvt_pkrtz_f16_f32_e64 v0, s6, s6
74; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
75; SI-NEXT:    s_endpgm
76;
77; VI-LABEL: s_cvt_pkrtz_samereg_v2f16_f32:
78; VI:       ; %bb.0:
79; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
80; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
81; VI-NEXT:    s_waitcnt lgkmcnt(0)
82; VI-NEXT:    v_cvt_pkrtz_f16_f32 v2, s2, s2
83; VI-NEXT:    v_mov_b32_e32 v0, s0
84; VI-NEXT:    v_mov_b32_e32 v1, s1
85; VI-NEXT:    flat_store_dword v[0:1], v2
86; VI-NEXT:    s_endpgm
87;
88; GFX9-LABEL: s_cvt_pkrtz_samereg_v2f16_f32:
89; GFX9:       ; %bb.0:
90; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x2c
91; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
92; GFX9-NEXT:    v_mov_b32_e32 v0, 0
93; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
94; GFX9-NEXT:    v_cvt_pkrtz_f16_f32 v1, s2, s2
95; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
96; GFX9-NEXT:    s_endpgm
97;
98; GFX10-LABEL: s_cvt_pkrtz_samereg_v2f16_f32:
99; GFX10:       ; %bb.0:
100; GFX10-NEXT:    s_clause 0x1
101; GFX10-NEXT:    s_load_dword s2, s[4:5], 0x2c
102; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
103; GFX10-NEXT:    v_mov_b32_e32 v0, 0
104; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
105; GFX10-NEXT:    v_cvt_pkrtz_f16_f32_e64 v1, s2, s2
106; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
107; GFX10-NEXT:    s_endpgm
108;
109; GFX11-LABEL: s_cvt_pkrtz_samereg_v2f16_f32:
110; GFX11:       ; %bb.0:
111; GFX11-NEXT:    s_clause 0x1
112; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
113; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
114; GFX11-NEXT:    v_mov_b32_e32 v0, 0
115; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
116; GFX11-NEXT:    v_cvt_pk_rtz_f16_f32_e64 v1, s2, s2
117; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
118; GFX11-NEXT:    s_endpgm
119  %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %x)
120  store <2 x half> %result, ptr addrspace(1) %out
121  ret void
122}
123
; Both operands of llvm.amdgcn.cvt.pkrtz are undef.
; All expectations for this kernel consist of s_endpgm only, so neither a
; convert nor a store instruction is expected in the output.
124define amdgpu_kernel void @s_cvt_pkrtz_undef_undef(ptr addrspace(1) %out) #0 {
125; GCN-LABEL: s_cvt_pkrtz_undef_undef:
126; GCN:       ; %bb.0:
127; GCN-NEXT:    s_endpgm
128;
129; GFX10-LABEL: s_cvt_pkrtz_undef_undef:
130; GFX10:       ; %bb.0:
131; GFX10-NEXT:    s_endpgm
132;
133; GFX11-LABEL: s_cvt_pkrtz_undef_undef:
134; GFX11:       ; %bb.0:
135; GFX11-NEXT:    s_endpgm
136  %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float undef, float undef)
137  store <2 x half> %result, ptr addrspace(1) %out
138  ret void
139}
140
; Vector operands: each work item loads one float from %a.ptr and one from
; %b.ptr, both indexed by the workitem id, converts the pair, and stores the
; packed result to %out at the same index.
; Both sources of the expected convert are VGPRs; where an encoding suffix is
; printed it is _e32.
141define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
142; SI-LABEL: v_cvt_pkrtz_v2f16_f32:
143; SI:       ; %bb.0:
144; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
145; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
146; SI-NEXT:    s_mov_b32 s11, 0xf000
147; SI-NEXT:    s_mov_b32 s10, 0
148; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
149; SI-NEXT:    v_mov_b32_e32 v1, 0
150; SI-NEXT:    s_mov_b64 s[6:7], s[10:11]
151; SI-NEXT:    s_waitcnt lgkmcnt(0)
152; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
153; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
154; SI-NEXT:    s_waitcnt vmcnt(0)
155; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
156; SI-NEXT:    s_waitcnt vmcnt(0)
157; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
158; SI-NEXT:    v_cvt_pkrtz_f16_f32_e32 v2, v2, v3
159; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
160; SI-NEXT:    s_endpgm
161;
162; VI-LABEL: v_cvt_pkrtz_v2f16_f32:
163; VI:       ; %bb.0:
164; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
165; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
166; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
167; VI-NEXT:    s_waitcnt lgkmcnt(0)
168; VI-NEXT:    v_mov_b32_e32 v1, s3
169; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
170; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
171; VI-NEXT:    v_mov_b32_e32 v3, s5
172; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
173; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
174; VI-NEXT:    flat_load_dword v5, v[0:1] glc
175; VI-NEXT:    s_waitcnt vmcnt(0)
176; VI-NEXT:    flat_load_dword v2, v[2:3] glc
177; VI-NEXT:    s_waitcnt vmcnt(0)
178; VI-NEXT:    v_mov_b32_e32 v1, s1
179; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v4
180; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
181; VI-NEXT:    v_cvt_pkrtz_f16_f32 v2, v5, v2
182; VI-NEXT:    flat_store_dword v[0:1], v2
183; VI-NEXT:    s_endpgm
184;
185; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32:
186; GFX9:       ; %bb.0:
187; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
188; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
189; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
190; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
191; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
192; GFX9-NEXT:    s_waitcnt vmcnt(0)
193; GFX9-NEXT:    global_load_dword v2, v0, s[6:7] glc
194; GFX9-NEXT:    s_waitcnt vmcnt(0)
195; GFX9-NEXT:    v_cvt_pkrtz_f16_f32 v1, v1, v2
196; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
197; GFX9-NEXT:    s_endpgm
198;
199; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32:
200; GFX10:       ; %bb.0:
201; GFX10-NEXT:    s_clause 0x1
202; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
203; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
204; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
205; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
206; GFX10-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
207; GFX10-NEXT:    s_waitcnt vmcnt(0)
208; GFX10-NEXT:    global_load_dword v2, v0, s[6:7] glc dlc
209; GFX10-NEXT:    s_waitcnt vmcnt(0)
210; GFX10-NEXT:    v_cvt_pkrtz_f16_f32_e32 v1, v1, v2
211; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
212; GFX10-NEXT:    s_endpgm
213;
214; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32:
215; GFX11:       ; %bb.0:
216; GFX11-NEXT:    s_clause 0x1
217; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
218; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
219; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
220; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
221; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
222; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
223; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3] glc dlc
224; GFX11-NEXT:    s_waitcnt vmcnt(0)
225; GFX11-NEXT:    global_load_b32 v2, v0, s[4:5] glc dlc
226; GFX11-NEXT:    s_waitcnt vmcnt(0)
227; GFX11-NEXT:    v_cvt_pk_rtz_f16_f32_e32 v1, v1, v2
228; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
229; GFX11-NEXT:    s_endpgm
; Per work item addressing: the workitem id indexes all three pointers.
230  %tid = call i32 @llvm.amdgcn.workitem.id.x()
231  %tid.ext = sext i32 %tid to i64
232  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
233  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
234  %out.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %out, i64 %tid.ext
; Volatile loads; each expected load is followed by its own s_waitcnt vmcnt(0).
235  %a = load volatile float, ptr addrspace(1) %a.gep
236  %b = load volatile float, ptr addrspace(1) %b.gep
237  %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float %b)
238  store <2 x half> %cvt, ptr addrspace(1) %out.gep
239  ret void
240}
241
; Register first operand, constant second operand: %a is loaded per work item
; and the second argument of llvm.amdgcn.cvt.pkrtz is the constant 1.0.
; The expected convert keeps 1.0 as a literal operand in the second source
; position; where an encoding suffix is printed it is _e64.
242define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
243; SI-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm:
244; SI:       ; %bb.0:
245; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
246; SI-NEXT:    s_mov_b32 s7, 0xf000
247; SI-NEXT:    s_mov_b32 s6, 0
248; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
249; SI-NEXT:    v_mov_b32_e32 v1, 0
250; SI-NEXT:    s_waitcnt lgkmcnt(0)
251; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
252; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
253; SI-NEXT:    s_waitcnt vmcnt(0)
254; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
255; SI-NEXT:    v_cvt_pkrtz_f16_f32_e64 v2, v2, 1.0
256; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
257; SI-NEXT:    s_endpgm
258;
259; VI-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm:
260; VI:       ; %bb.0:
261; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
262; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
263; VI-NEXT:    s_waitcnt lgkmcnt(0)
264; VI-NEXT:    v_mov_b32_e32 v1, s3
265; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
266; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
267; VI-NEXT:    flat_load_dword v3, v[0:1] glc
268; VI-NEXT:    s_waitcnt vmcnt(0)
269; VI-NEXT:    v_mov_b32_e32 v1, s1
270; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
271; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
272; VI-NEXT:    v_cvt_pkrtz_f16_f32 v2, v3, 1.0
273; VI-NEXT:    flat_store_dword v[0:1], v2
274; VI-NEXT:    s_endpgm
275;
276; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm:
277; GFX9:       ; %bb.0:
278; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
279; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
280; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
281; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
282; GFX9-NEXT:    s_waitcnt vmcnt(0)
283; GFX9-NEXT:    v_cvt_pkrtz_f16_f32 v1, v1, 1.0
284; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
285; GFX9-NEXT:    s_endpgm
286;
287; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm:
288; GFX10:       ; %bb.0:
289; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
290; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
291; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
292; GFX10-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
293; GFX10-NEXT:    s_waitcnt vmcnt(0)
294; GFX10-NEXT:    v_cvt_pkrtz_f16_f32_e64 v1, v1, 1.0
295; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
296; GFX10-NEXT:    s_endpgm
297;
298; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm:
299; GFX11:       ; %bb.0:
300; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
301; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
302; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
303; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
304; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
305; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3] glc dlc
306; GFX11-NEXT:    s_waitcnt vmcnt(0)
307; GFX11-NEXT:    v_cvt_pk_rtz_f16_f32_e64 v1, v1, 1.0
308; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
309; GFX11-NEXT:    s_endpgm
310  %tid = call i32 @llvm.amdgcn.workitem.id.x()
311  %tid.ext = sext i32 %tid to i64
312  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
313  %out.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %out, i64 %tid.ext
314  %a = load volatile float, ptr addrspace(1) %a.gep
; Constant 1.0 is the second (high) argument here.
315  %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float 1.0)
316  store <2 x half> %cvt, ptr addrspace(1) %out.gep
317  ret void
318}
319
; Constant first operand, register second operand: the first argument of
; llvm.amdgcn.cvt.pkrtz is the constant 1.0 and %a is loaded per work item.
; The expected convert keeps 1.0 as a literal operand in the first source
; position; where an encoding suffix is printed it is _e32.
320define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
321; SI-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg:
322; SI:       ; %bb.0:
323; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
324; SI-NEXT:    s_mov_b32 s7, 0xf000
325; SI-NEXT:    s_mov_b32 s6, 0
326; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
327; SI-NEXT:    v_mov_b32_e32 v1, 0
328; SI-NEXT:    s_waitcnt lgkmcnt(0)
329; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
330; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
331; SI-NEXT:    s_waitcnt vmcnt(0)
332; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
333; SI-NEXT:    v_cvt_pkrtz_f16_f32_e32 v2, 1.0, v2
334; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
335; SI-NEXT:    s_endpgm
336;
337; VI-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg:
338; VI:       ; %bb.0:
339; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
340; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
341; VI-NEXT:    s_waitcnt lgkmcnt(0)
342; VI-NEXT:    v_mov_b32_e32 v1, s3
343; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
344; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
345; VI-NEXT:    flat_load_dword v3, v[0:1] glc
346; VI-NEXT:    s_waitcnt vmcnt(0)
347; VI-NEXT:    v_mov_b32_e32 v1, s1
348; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
349; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
350; VI-NEXT:    v_cvt_pkrtz_f16_f32 v2, 1.0, v3
351; VI-NEXT:    flat_store_dword v[0:1], v2
352; VI-NEXT:    s_endpgm
353;
354; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg:
355; GFX9:       ; %bb.0:
356; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
357; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
358; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
359; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
360; GFX9-NEXT:    s_waitcnt vmcnt(0)
361; GFX9-NEXT:    v_cvt_pkrtz_f16_f32 v1, 1.0, v1
362; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
363; GFX9-NEXT:    s_endpgm
364;
365; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg:
366; GFX10:       ; %bb.0:
367; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
368; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
369; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
370; GFX10-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
371; GFX10-NEXT:    s_waitcnt vmcnt(0)
372; GFX10-NEXT:    v_cvt_pkrtz_f16_f32_e32 v1, 1.0, v1
373; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
374; GFX10-NEXT:    s_endpgm
375;
376; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg:
377; GFX11:       ; %bb.0:
378; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
379; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
380; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
381; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
382; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
383; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3] glc dlc
384; GFX11-NEXT:    s_waitcnt vmcnt(0)
385; GFX11-NEXT:    v_cvt_pk_rtz_f16_f32_e32 v1, 1.0, v1
386; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
387; GFX11-NEXT:    s_endpgm
388  %tid = call i32 @llvm.amdgcn.workitem.id.x()
389  %tid.ext = sext i32 %tid to i64
390  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
391  %out.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %out, i64 %tid.ext
392  %a = load volatile float, ptr addrspace(1) %a.gep
; Constant 1.0 is the first (low) argument here.
393  %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 1.0, float %a)
394  store <2 x half> %cvt, ptr addrspace(1) %out.gep
395  ret void
396}
397
; Negated first operand: %a is negated with "fsub float -0.0, %a" before being
; passed as the first argument of llvm.amdgcn.cvt.pkrtz; %b is passed as is.
; The expected convert carries the negation as a "-" modifier on its first
; source and no separate negate instruction appears between the loads and the
; convert. Where an encoding suffix is printed it is _e64.
398define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
399; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo:
400; SI:       ; %bb.0:
401; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
402; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
403; SI-NEXT:    s_mov_b32 s11, 0xf000
404; SI-NEXT:    s_mov_b32 s10, 0
405; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
406; SI-NEXT:    v_mov_b32_e32 v1, 0
407; SI-NEXT:    s_mov_b64 s[6:7], s[10:11]
408; SI-NEXT:    s_waitcnt lgkmcnt(0)
409; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
410; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
411; SI-NEXT:    s_waitcnt vmcnt(0)
412; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
413; SI-NEXT:    s_waitcnt vmcnt(0)
414; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
415; SI-NEXT:    v_cvt_pkrtz_f16_f32_e64 v2, -v2, v3
416; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
417; SI-NEXT:    s_endpgm
418;
419; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo:
420; VI:       ; %bb.0:
421; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
422; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
423; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
424; VI-NEXT:    s_waitcnt lgkmcnt(0)
425; VI-NEXT:    v_mov_b32_e32 v1, s3
426; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
427; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
428; VI-NEXT:    v_mov_b32_e32 v3, s5
429; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
430; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
431; VI-NEXT:    flat_load_dword v5, v[0:1] glc
432; VI-NEXT:    s_waitcnt vmcnt(0)
433; VI-NEXT:    flat_load_dword v2, v[2:3] glc
434; VI-NEXT:    s_waitcnt vmcnt(0)
435; VI-NEXT:    v_mov_b32_e32 v1, s1
436; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v4
437; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
438; VI-NEXT:    v_cvt_pkrtz_f16_f32 v2, -v5, v2
439; VI-NEXT:    flat_store_dword v[0:1], v2
440; VI-NEXT:    s_endpgm
441;
442; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo:
443; GFX9:       ; %bb.0:
444; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
445; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
446; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
447; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
448; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
449; GFX9-NEXT:    s_waitcnt vmcnt(0)
450; GFX9-NEXT:    global_load_dword v2, v0, s[6:7] glc
451; GFX9-NEXT:    s_waitcnt vmcnt(0)
452; GFX9-NEXT:    v_cvt_pkrtz_f16_f32 v1, -v1, v2
453; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
454; GFX9-NEXT:    s_endpgm
455;
456; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo:
457; GFX10:       ; %bb.0:
458; GFX10-NEXT:    s_clause 0x1
459; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
460; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
461; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
462; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
463; GFX10-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
464; GFX10-NEXT:    s_waitcnt vmcnt(0)
465; GFX10-NEXT:    global_load_dword v2, v0, s[6:7] glc dlc
466; GFX10-NEXT:    s_waitcnt vmcnt(0)
467; GFX10-NEXT:    v_cvt_pkrtz_f16_f32_e64 v1, -v1, v2
468; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
469; GFX10-NEXT:    s_endpgm
470;
471; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo:
472; GFX11:       ; %bb.0:
473; GFX11-NEXT:    s_clause 0x1
474; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
475; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
476; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
477; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
478; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
479; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
480; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3] glc dlc
481; GFX11-NEXT:    s_waitcnt vmcnt(0)
482; GFX11-NEXT:    global_load_b32 v2, v0, s[4:5] glc dlc
483; GFX11-NEXT:    s_waitcnt vmcnt(0)
484; GFX11-NEXT:    v_cvt_pk_rtz_f16_f32_e64 v1, -v1, v2
485; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
486; GFX11-NEXT:    s_endpgm
487  %tid = call i32 @llvm.amdgcn.workitem.id.x()
488  %tid.ext = sext i32 %tid to i64
489  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
490  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
491  %out.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %out, i64 %tid.ext
492  %a = load volatile float, ptr addrspace(1) %a.gep
493  %b = load volatile float, ptr addrspace(1) %b.gep
; Negation written as a subtraction from -0.0; only the first argument is negated.
494  %neg.a = fsub float -0.0, %a
495  %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %neg.a, float %b)
496  store <2 x half> %cvt, ptr addrspace(1) %out.gep
497  ret void
498}
499
; Negated second operand: %b is negated with "fsub float -0.0, %b" before
; being passed as the second argument of llvm.amdgcn.cvt.pkrtz; %a is passed
; as is.
; The expected convert carries the negation as a "-" modifier on its second
; source and no separate negate instruction appears between the loads and the
; convert. Where an encoding suffix is printed it is _e64.
500define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
501; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi:
502; SI:       ; %bb.0:
503; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
504; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
505; SI-NEXT:    s_mov_b32 s11, 0xf000
506; SI-NEXT:    s_mov_b32 s10, 0
507; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
508; SI-NEXT:    v_mov_b32_e32 v1, 0
509; SI-NEXT:    s_mov_b64 s[6:7], s[10:11]
510; SI-NEXT:    s_waitcnt lgkmcnt(0)
511; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
512; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
513; SI-NEXT:    s_waitcnt vmcnt(0)
514; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
515; SI-NEXT:    s_waitcnt vmcnt(0)
516; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
517; SI-NEXT:    v_cvt_pkrtz_f16_f32_e64 v2, v2, -v3
518; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
519; SI-NEXT:    s_endpgm
520;
521; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi:
522; VI:       ; %bb.0:
523; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
524; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
525; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
526; VI-NEXT:    s_waitcnt lgkmcnt(0)
527; VI-NEXT:    v_mov_b32_e32 v1, s3
528; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
529; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
530; VI-NEXT:    v_mov_b32_e32 v3, s5
531; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
532; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
533; VI-NEXT:    flat_load_dword v5, v[0:1] glc
534; VI-NEXT:    s_waitcnt vmcnt(0)
535; VI-NEXT:    flat_load_dword v2, v[2:3] glc
536; VI-NEXT:    s_waitcnt vmcnt(0)
537; VI-NEXT:    v_mov_b32_e32 v1, s1
538; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v4
539; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
540; VI-NEXT:    v_cvt_pkrtz_f16_f32 v2, v5, -v2
541; VI-NEXT:    flat_store_dword v[0:1], v2
542; VI-NEXT:    s_endpgm
543;
544; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi:
545; GFX9:       ; %bb.0:
546; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
547; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
548; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
549; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
550; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
551; GFX9-NEXT:    s_waitcnt vmcnt(0)
552; GFX9-NEXT:    global_load_dword v2, v0, s[6:7] glc
553; GFX9-NEXT:    s_waitcnt vmcnt(0)
554; GFX9-NEXT:    v_cvt_pkrtz_f16_f32 v1, v1, -v2
555; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
556; GFX9-NEXT:    s_endpgm
557;
558; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi:
559; GFX10:       ; %bb.0:
560; GFX10-NEXT:    s_clause 0x1
561; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
562; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
563; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
564; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
565; GFX10-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
566; GFX10-NEXT:    s_waitcnt vmcnt(0)
567; GFX10-NEXT:    global_load_dword v2, v0, s[6:7] glc dlc
568; GFX10-NEXT:    s_waitcnt vmcnt(0)
569; GFX10-NEXT:    v_cvt_pkrtz_f16_f32_e64 v1, v1, -v2
570; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
571; GFX10-NEXT:    s_endpgm
572;
573; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi:
574; GFX11:       ; %bb.0:
575; GFX11-NEXT:    s_clause 0x1
576; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
577; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
578; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
579; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
580; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
581; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
582; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3] glc dlc
583; GFX11-NEXT:    s_waitcnt vmcnt(0)
584; GFX11-NEXT:    global_load_b32 v2, v0, s[4:5] glc dlc
585; GFX11-NEXT:    s_waitcnt vmcnt(0)
586; GFX11-NEXT:    v_cvt_pk_rtz_f16_f32_e64 v1, v1, -v2
587; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
588; GFX11-NEXT:    s_endpgm
589  %tid = call i32 @llvm.amdgcn.workitem.id.x()
590  %tid.ext = sext i32 %tid to i64
591  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
592  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
593  %out.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %out, i64 %tid.ext
594  %a = load volatile float, ptr addrspace(1) %a.gep
595  %b = load volatile float, ptr addrspace(1) %b.gep
; Negation written as a subtraction from -0.0; only the second argument is negated.
596  %neg.b = fsub float -0.0, %b
597  %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float %neg.b)
598  store <2 x half> %cvt, ptr addrspace(1) %out.gep
599  ret void
600}
601
; Both operands negated: %a and %b are each negated with "fsub float -0.0, ..."
; before being passed to llvm.amdgcn.cvt.pkrtz.
; The expected convert carries a "-" modifier on both sources and no separate
; negate instruction appears between the loads and the convert. Where an
; encoding suffix is printed it is _e64.
602define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
603; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi:
604; SI:       ; %bb.0:
605; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
606; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
607; SI-NEXT:    s_mov_b32 s11, 0xf000
608; SI-NEXT:    s_mov_b32 s10, 0
609; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
610; SI-NEXT:    v_mov_b32_e32 v1, 0
611; SI-NEXT:    s_mov_b64 s[6:7], s[10:11]
612; SI-NEXT:    s_waitcnt lgkmcnt(0)
613; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
614; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
615; SI-NEXT:    s_waitcnt vmcnt(0)
616; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
617; SI-NEXT:    s_waitcnt vmcnt(0)
618; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
619; SI-NEXT:    v_cvt_pkrtz_f16_f32_e64 v2, -v2, -v3
620; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
621; SI-NEXT:    s_endpgm
622;
623; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi:
624; VI:       ; %bb.0:
625; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
626; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
627; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
628; VI-NEXT:    s_waitcnt lgkmcnt(0)
629; VI-NEXT:    v_mov_b32_e32 v1, s3
630; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
631; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
632; VI-NEXT:    v_mov_b32_e32 v3, s5
633; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
634; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
635; VI-NEXT:    flat_load_dword v5, v[0:1] glc
636; VI-NEXT:    s_waitcnt vmcnt(0)
637; VI-NEXT:    flat_load_dword v2, v[2:3] glc
638; VI-NEXT:    s_waitcnt vmcnt(0)
639; VI-NEXT:    v_mov_b32_e32 v1, s1
640; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v4
641; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
642; VI-NEXT:    v_cvt_pkrtz_f16_f32 v2, -v5, -v2
643; VI-NEXT:    flat_store_dword v[0:1], v2
644; VI-NEXT:    s_endpgm
645;
646; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi:
647; GFX9:       ; %bb.0:
648; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
649; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
650; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
651; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
652; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
653; GFX9-NEXT:    s_waitcnt vmcnt(0)
654; GFX9-NEXT:    global_load_dword v2, v0, s[6:7] glc
655; GFX9-NEXT:    s_waitcnt vmcnt(0)
656; GFX9-NEXT:    v_cvt_pkrtz_f16_f32 v1, -v1, -v2
657; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
658; GFX9-NEXT:    s_endpgm
659;
660; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi:
661; GFX10:       ; %bb.0:
662; GFX10-NEXT:    s_clause 0x1
663; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
664; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
665; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
666; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
667; GFX10-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
668; GFX10-NEXT:    s_waitcnt vmcnt(0)
669; GFX10-NEXT:    global_load_dword v2, v0, s[6:7] glc dlc
670; GFX10-NEXT:    s_waitcnt vmcnt(0)
671; GFX10-NEXT:    v_cvt_pkrtz_f16_f32_e64 v1, -v1, -v2
672; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
673; GFX10-NEXT:    s_endpgm
674;
675; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi:
676; GFX11:       ; %bb.0:
677; GFX11-NEXT:    s_clause 0x1
678; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
679; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
680; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
681; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
682; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
683; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
684; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3] glc dlc
685; GFX11-NEXT:    s_waitcnt vmcnt(0)
686; GFX11-NEXT:    global_load_b32 v2, v0, s[4:5] glc dlc
687; GFX11-NEXT:    s_waitcnt vmcnt(0)
688; GFX11-NEXT:    v_cvt_pk_rtz_f16_f32_e64 v1, -v1, -v2
689; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
690; GFX11-NEXT:    s_endpgm
691  %tid = call i32 @llvm.amdgcn.workitem.id.x()
692  %tid.ext = sext i32 %tid to i64
693  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
694  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
695  %out.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %out, i64 %tid.ext
696  %a = load volatile float, ptr addrspace(1) %a.gep
697  %b = load volatile float, ptr addrspace(1) %b.gep
; Both arguments are negated, each written as a subtraction from -0.0.
698  %neg.a = fsub float -0.0, %a
699  %neg.b = fsub float -0.0, %b
700  %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %neg.a, float %neg.b)
701  store <2 x half> %cvt, ptr addrspace(1) %out.gep
702  ret void
703}
704
; Negated absolute value on the first operand, plain negation on the second:
; the first argument is "fsub -0.0, fabs(%a)" and the second is
; "fsub -0.0, %b".
; The expected convert shows the first source as -|v| and the second as -v,
; with no separate fabs or negate instruction between the loads and the
; convert. Where an encoding suffix is printed it is _e64.
705define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
706; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi:
707; SI:       ; %bb.0:
708; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
709; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
710; SI-NEXT:    s_mov_b32 s11, 0xf000
711; SI-NEXT:    s_mov_b32 s10, 0
712; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
713; SI-NEXT:    v_mov_b32_e32 v1, 0
714; SI-NEXT:    s_mov_b64 s[6:7], s[10:11]
715; SI-NEXT:    s_waitcnt lgkmcnt(0)
716; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
717; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
718; SI-NEXT:    s_waitcnt vmcnt(0)
719; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
720; SI-NEXT:    s_waitcnt vmcnt(0)
721; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
722; SI-NEXT:    v_cvt_pkrtz_f16_f32_e64 v2, -|v2|, -v3
723; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
724; SI-NEXT:    s_endpgm
725;
726; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi:
727; VI:       ; %bb.0:
728; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
729; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
730; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
731; VI-NEXT:    s_waitcnt lgkmcnt(0)
732; VI-NEXT:    v_mov_b32_e32 v1, s3
733; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
734; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
735; VI-NEXT:    v_mov_b32_e32 v3, s5
736; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
737; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
738; VI-NEXT:    flat_load_dword v5, v[0:1] glc
739; VI-NEXT:    s_waitcnt vmcnt(0)
740; VI-NEXT:    flat_load_dword v2, v[2:3] glc
741; VI-NEXT:    s_waitcnt vmcnt(0)
742; VI-NEXT:    v_mov_b32_e32 v1, s1
743; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v4
744; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
745; VI-NEXT:    v_cvt_pkrtz_f16_f32 v2, -|v5|, -v2
746; VI-NEXT:    flat_store_dword v[0:1], v2
747; VI-NEXT:    s_endpgm
748;
749; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi:
750; GFX9:       ; %bb.0:
751; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
752; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
753; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
754; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
755; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
756; GFX9-NEXT:    s_waitcnt vmcnt(0)
757; GFX9-NEXT:    global_load_dword v2, v0, s[6:7] glc
758; GFX9-NEXT:    s_waitcnt vmcnt(0)
759; GFX9-NEXT:    v_cvt_pkrtz_f16_f32 v1, -|v1|, -v2
760; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
761; GFX9-NEXT:    s_endpgm
762;
763; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi:
764; GFX10:       ; %bb.0:
765; GFX10-NEXT:    s_clause 0x1
766; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
767; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
768; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
769; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
770; GFX10-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
771; GFX10-NEXT:    s_waitcnt vmcnt(0)
772; GFX10-NEXT:    global_load_dword v2, v0, s[6:7] glc dlc
773; GFX10-NEXT:    s_waitcnt vmcnt(0)
774; GFX10-NEXT:    v_cvt_pkrtz_f16_f32_e64 v1, -|v1|, -v2
775; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
776; GFX10-NEXT:    s_endpgm
777;
778; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi:
779; GFX11:       ; %bb.0:
780; GFX11-NEXT:    s_clause 0x1
781; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
782; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
783; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
784; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
785; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
786; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
787; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3] glc dlc
788; GFX11-NEXT:    s_waitcnt vmcnt(0)
789; GFX11-NEXT:    global_load_b32 v2, v0, s[4:5] glc dlc
790; GFX11-NEXT:    s_waitcnt vmcnt(0)
791; GFX11-NEXT:    v_cvt_pk_rtz_f16_f32_e64 v1, -|v1|, -v2
792; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
793; GFX11-NEXT:    s_endpgm
794  %tid = call i32 @llvm.amdgcn.workitem.id.x()
795  %tid.ext = sext i32 %tid to i64
796  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
797  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
798  %out.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %out, i64 %tid.ext
799  %a = load volatile float, ptr addrspace(1) %a.gep
800  %b = load volatile float, ptr addrspace(1) %b.gep
; First argument is the negated absolute value of %a; second is the negated %b.
801  %fabs.a = call float @llvm.fabs.f32(float %a)
802  %neg.fabs.a = fsub float -0.0, %fabs.a
803  %neg.b = fsub float -0.0, %b
804  %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %neg.fabs.a, float %neg.b)
805  store <2 x half> %cvt, ptr addrspace(1) %out.gep
806  ret void
807}
808
; Intrinsics called by the kernels in this file: the packed f32 to f16 convert
; under test, plus fabs and the workitem id used to build its operands.
809declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1
810declare float @llvm.fabs.f32(float) #1
811declare i32 @llvm.amdgcn.workitem.id.x() #1
812
813
; Attribute group #0 is attached to every kernel definition in this file and
; group #1 to the intrinsic declarations.
814attributes #0 = { nounwind }
815attributes #1 = { nounwind readnone }
816