xref: /llvm-project/llvm/test/CodeGen/AMDGPU/fabs.f16.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CI %s
3; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
4; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
5; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
6
7; DAGCombiner will transform:
8; (fabs (f16 bitcast (i16 a))) => (f16 bitcast (and (i16 a), 0x7FFF))
9; unless isFabsFree returns true
10
; fabs of a half produced by bitcasting an i16 kernel argument. On all four
; tested targets the expected code clears the sign bit with a scalar
; s_and_b32 against 0x7fff and stores the 16-bit result to %out.
11define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) {
12; CI-LABEL: s_fabs_free_f16:
13; CI:       ; %bb.0:
14; CI-NEXT:    s_load_dword s2, s[8:9], 0x2
15; CI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
16; CI-NEXT:    s_waitcnt lgkmcnt(0)
17; CI-NEXT:    s_and_b32 s2, s2, 0x7fff
18; CI-NEXT:    v_mov_b32_e32 v0, s0
19; CI-NEXT:    v_mov_b32_e32 v1, s1
20; CI-NEXT:    v_mov_b32_e32 v2, s2
21; CI-NEXT:    flat_store_short v[0:1], v2
22; CI-NEXT:    s_endpgm
23;
24; VI-LABEL: s_fabs_free_f16:
25; VI:       ; %bb.0:
26; VI-NEXT:    s_load_dword s2, s[8:9], 0x8
27; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
28; VI-NEXT:    s_waitcnt lgkmcnt(0)
29; VI-NEXT:    s_and_b32 s2, s2, 0x7fff
30; VI-NEXT:    v_mov_b32_e32 v0, s0
31; VI-NEXT:    v_mov_b32_e32 v1, s1
32; VI-NEXT:    v_mov_b32_e32 v2, s2
33; VI-NEXT:    flat_store_short v[0:1], v2
34; VI-NEXT:    s_endpgm
35;
36; GFX9-LABEL: s_fabs_free_f16:
37; GFX9:       ; %bb.0:
38; GFX9-NEXT:    s_load_dword s2, s[8:9], 0x8
39; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
40; GFX9-NEXT:    v_mov_b32_e32 v0, 0
41; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
42; GFX9-NEXT:    s_and_b32 s2, s2, 0x7fff
43; GFX9-NEXT:    v_mov_b32_e32 v1, s2
44; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
45; GFX9-NEXT:    s_endpgm
46;
47; GFX11-LABEL: s_fabs_free_f16:
48; GFX11:       ; %bb.0:
49; GFX11-NEXT:    s_clause 0x1
50; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x8
51; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
52; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
53; GFX11-NEXT:    s_and_b32 s2, s2, 0x7fff
54; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
55; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
56; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
57; GFX11-NEXT:    s_endpgm
58  %bc= bitcast i16 %in to half
59  %fabs = call half @llvm.fabs.f16(half %bc)
60  store half %fabs, ptr addrspace(1) %out
61  ret void
62}
63
; fabs applied directly to a half kernel argument (no bitcast). The expected
; code on every tested target is a scalar s_and_b32 with 0x7fff followed by a
; 16-bit store to %out.
64define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) {
65; CI-LABEL: s_fabs_f16:
66; CI:       ; %bb.0:
67; CI-NEXT:    s_load_dword s2, s[8:9], 0x2
68; CI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
69; CI-NEXT:    s_waitcnt lgkmcnt(0)
70; CI-NEXT:    s_and_b32 s2, s2, 0x7fff
71; CI-NEXT:    v_mov_b32_e32 v0, s0
72; CI-NEXT:    v_mov_b32_e32 v1, s1
73; CI-NEXT:    v_mov_b32_e32 v2, s2
74; CI-NEXT:    flat_store_short v[0:1], v2
75; CI-NEXT:    s_endpgm
76;
77; VI-LABEL: s_fabs_f16:
78; VI:       ; %bb.0:
79; VI-NEXT:    s_load_dword s2, s[8:9], 0x8
80; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
81; VI-NEXT:    s_waitcnt lgkmcnt(0)
82; VI-NEXT:    s_and_b32 s2, s2, 0x7fff
83; VI-NEXT:    v_mov_b32_e32 v0, s0
84; VI-NEXT:    v_mov_b32_e32 v1, s1
85; VI-NEXT:    v_mov_b32_e32 v2, s2
86; VI-NEXT:    flat_store_short v[0:1], v2
87; VI-NEXT:    s_endpgm
88;
89; GFX9-LABEL: s_fabs_f16:
90; GFX9:       ; %bb.0:
91; GFX9-NEXT:    s_load_dword s2, s[8:9], 0x8
92; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
93; GFX9-NEXT:    v_mov_b32_e32 v0, 0
94; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
95; GFX9-NEXT:    s_and_b32 s2, s2, 0x7fff
96; GFX9-NEXT:    v_mov_b32_e32 v1, s2
97; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
98; GFX9-NEXT:    s_endpgm
99;
100; GFX11-LABEL: s_fabs_f16:
101; GFX11:       ; %bb.0:
102; GFX11-NEXT:    s_clause 0x1
103; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x8
104; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
105; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
106; GFX11-NEXT:    s_and_b32 s2, s2, 0x7fff
107; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
108; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
109; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
110; GFX11-NEXT:    s_endpgm
111  %fabs = call half @llvm.fabs.f16(half %in)
112  store half %fabs, ptr addrspace(1) %out
113  ret void
114}
115
; fabs of a <2 x half> kernel argument. Both sign bits are expected to be
; cleared by a single scalar s_and_b32 with 0x7fff7fff on every tested
; target, followed by a 32-bit store to %out.
116define amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) {
117; CI-LABEL: s_fabs_v2f16:
118; CI:       ; %bb.0:
119; CI-NEXT:    s_load_dword s2, s[8:9], 0x2
120; CI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
121; CI-NEXT:    s_waitcnt lgkmcnt(0)
122; CI-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
123; CI-NEXT:    v_mov_b32_e32 v0, s0
124; CI-NEXT:    v_mov_b32_e32 v1, s1
125; CI-NEXT:    v_mov_b32_e32 v2, s2
126; CI-NEXT:    flat_store_dword v[0:1], v2
127; CI-NEXT:    s_endpgm
128;
129; VI-LABEL: s_fabs_v2f16:
130; VI:       ; %bb.0:
131; VI-NEXT:    s_load_dword s2, s[8:9], 0x8
132; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
133; VI-NEXT:    s_waitcnt lgkmcnt(0)
134; VI-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
135; VI-NEXT:    v_mov_b32_e32 v0, s0
136; VI-NEXT:    v_mov_b32_e32 v1, s1
137; VI-NEXT:    v_mov_b32_e32 v2, s2
138; VI-NEXT:    flat_store_dword v[0:1], v2
139; VI-NEXT:    s_endpgm
140;
141; GFX9-LABEL: s_fabs_v2f16:
142; GFX9:       ; %bb.0:
143; GFX9-NEXT:    s_load_dword s2, s[8:9], 0x8
144; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
145; GFX9-NEXT:    v_mov_b32_e32 v0, 0
146; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
147; GFX9-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
148; GFX9-NEXT:    v_mov_b32_e32 v1, s2
149; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
150; GFX9-NEXT:    s_endpgm
151;
152; GFX11-LABEL: s_fabs_v2f16:
153; GFX11:       ; %bb.0:
154; GFX11-NEXT:    s_clause 0x1
155; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x8
156; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
157; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
158; GFX11-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
159; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
160; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
161; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
162; GFX11-NEXT:    s_endpgm
163  %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
164  store <2 x half> %fabs, ptr addrspace(1) %out
165  ret void
166}
167
; fabs of a <4 x half> kernel argument. The vector occupies two 32-bit
; scalar registers, and the expected code applies one s_and_b32 with
; 0x7fff7fff to each of them before a 64-bit store to %out.
168define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) {
169; CI-LABEL: s_fabs_v4f16:
170; CI:       ; %bb.0:
171; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
172; CI-NEXT:    s_waitcnt lgkmcnt(0)
173; CI-NEXT:    s_and_b32 s3, s3, 0x7fff7fff
174; CI-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
175; CI-NEXT:    v_mov_b32_e32 v3, s1
176; CI-NEXT:    v_mov_b32_e32 v0, s2
177; CI-NEXT:    v_mov_b32_e32 v1, s3
178; CI-NEXT:    v_mov_b32_e32 v2, s0
179; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
180; CI-NEXT:    s_endpgm
181;
182; VI-LABEL: s_fabs_v4f16:
183; VI:       ; %bb.0:
184; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
185; VI-NEXT:    s_waitcnt lgkmcnt(0)
186; VI-NEXT:    s_and_b32 s3, s3, 0x7fff7fff
187; VI-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
188; VI-NEXT:    v_mov_b32_e32 v3, s1
189; VI-NEXT:    v_mov_b32_e32 v0, s2
190; VI-NEXT:    v_mov_b32_e32 v1, s3
191; VI-NEXT:    v_mov_b32_e32 v2, s0
192; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
193; VI-NEXT:    s_endpgm
194;
195; GFX9-LABEL: s_fabs_v4f16:
196; GFX9:       ; %bb.0:
197; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
198; GFX9-NEXT:    v_mov_b32_e32 v2, 0
199; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
200; GFX9-NEXT:    s_and_b32 s3, s3, 0x7fff7fff
201; GFX9-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
202; GFX9-NEXT:    v_mov_b32_e32 v0, s2
203; GFX9-NEXT:    v_mov_b32_e32 v1, s3
204; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
205; GFX9-NEXT:    s_endpgm
206;
207; GFX11-LABEL: s_fabs_v4f16:
208; GFX11:       ; %bb.0:
209; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
210; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
211; GFX11-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
212; GFX11-NEXT:    s_and_b32 s3, s3, 0x7fff7fff
213; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
214; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
215; GFX11-NEXT:    v_mov_b32_e32 v0, s2
216; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
217; GFX11-NEXT:    s_endpgm
218  %fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %in)
219  store <4 x half> %fabs, ptr addrspace(1) %out
220  ret void
221}
222
; fabs whose only user is an fmul. No separate AND is expected; the absolute
; value appears as a |src| source modifier instead. On kaveri the modifier
; sits on the f16-to-f32 conversion and the multiply is done in f32; on the
; other three targets it sits on v_mul_f16_e64.
223define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half %in1) {
224; CI-LABEL: fabs_fold_f16:
225; CI:       ; %bb.0:
226; CI-NEXT:    s_load_dword s0, s[8:9], 0x2
227; CI-NEXT:    s_waitcnt lgkmcnt(0)
228; CI-NEXT:    v_cvt_f32_f16_e64 v0, |s0|
229; CI-NEXT:    s_lshr_b32 s0, s0, 16
230; CI-NEXT:    v_cvt_f32_f16_e32 v1, s0
231; CI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
232; CI-NEXT:    v_mul_f32_e32 v0, v0, v1
233; CI-NEXT:    v_cvt_f16_f32_e32 v2, v0
234; CI-NEXT:    s_waitcnt lgkmcnt(0)
235; CI-NEXT:    v_mov_b32_e32 v0, s0
236; CI-NEXT:    v_mov_b32_e32 v1, s1
237; CI-NEXT:    flat_store_short v[0:1], v2
238; CI-NEXT:    s_endpgm
239;
240; VI-LABEL: fabs_fold_f16:
241; VI:       ; %bb.0:
242; VI-NEXT:    s_load_dword s2, s[8:9], 0x8
243; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
244; VI-NEXT:    s_waitcnt lgkmcnt(0)
245; VI-NEXT:    s_lshr_b32 s3, s2, 16
246; VI-NEXT:    v_mov_b32_e32 v0, s3
247; VI-NEXT:    v_mul_f16_e64 v2, |s2|, v0
248; VI-NEXT:    v_mov_b32_e32 v0, s0
249; VI-NEXT:    v_mov_b32_e32 v1, s1
250; VI-NEXT:    flat_store_short v[0:1], v2
251; VI-NEXT:    s_endpgm
252;
253; GFX9-LABEL: fabs_fold_f16:
254; GFX9:       ; %bb.0:
255; GFX9-NEXT:    s_load_dword s2, s[8:9], 0x8
256; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
257; GFX9-NEXT:    v_mov_b32_e32 v0, 0
258; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
259; GFX9-NEXT:    s_lshr_b32 s3, s2, 16
260; GFX9-NEXT:    v_mov_b32_e32 v1, s3
261; GFX9-NEXT:    v_mul_f16_e64 v1, |s2|, v1
262; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
263; GFX9-NEXT:    s_endpgm
264;
265; GFX11-LABEL: fabs_fold_f16:
266; GFX11:       ; %bb.0:
267; GFX11-NEXT:    s_clause 0x1
268; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x8
269; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
270; GFX11-NEXT:    v_mov_b32_e32 v0, 0
271; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
272; GFX11-NEXT:    s_lshr_b32 s3, s2, 16
273; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
274; GFX11-NEXT:    v_mul_f16_e64 v1, |s2|, s3
275; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
276; GFX11-NEXT:    s_endpgm
277  %fabs = call half @llvm.fabs.f16(half %in0)
278  %fmul = fmul half %fabs, %in1
279  store half %fmul, ptr addrspace(1) %out
280  ret void
281}
282
; fabs of a <2 x half> loaded from memory at a per-workitem index. The
; expected code on every tested target is a vector v_and_b32 with 0x7fff7fff
; between the load and the store.
;
; NOTE(review): %gep.out below is computed from %in, not %out, so %out is
; never used and the result is stored back to the address it was loaded from
; (the checks load only one pointer and reuse it for the store). This looks
; like a copy/paste slip rather than intent - confirm; switching it to %out
; would require regenerating the assertions.
283define amdgpu_kernel void @v_fabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
284; CI-LABEL: v_fabs_v2f16:
285; CI:       ; %bb.0:
286; CI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x2
287; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
288; CI-NEXT:    s_waitcnt lgkmcnt(0)
289; CI-NEXT:    v_mov_b32_e32 v1, s1
290; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
291; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
292; CI-NEXT:    flat_load_dword v2, v[0:1]
293; CI-NEXT:    s_waitcnt vmcnt(0)
294; CI-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
295; CI-NEXT:    flat_store_dword v[0:1], v2
296; CI-NEXT:    s_endpgm
297;
298; VI-LABEL: v_fabs_v2f16:
299; VI:       ; %bb.0:
300; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x8
301; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
302; VI-NEXT:    s_waitcnt lgkmcnt(0)
303; VI-NEXT:    v_mov_b32_e32 v1, s1
304; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
305; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
306; VI-NEXT:    flat_load_dword v2, v[0:1]
307; VI-NEXT:    s_waitcnt vmcnt(0)
308; VI-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
309; VI-NEXT:    flat_store_dword v[0:1], v2
310; VI-NEXT:    s_endpgm
311;
312; GFX9-LABEL: v_fabs_v2f16:
313; GFX9:       ; %bb.0:
314; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x8
315; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
316; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
317; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
318; GFX9-NEXT:    s_waitcnt vmcnt(0)
319; GFX9-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v1
320; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
321; GFX9-NEXT:    s_endpgm
322;
323; GFX11-LABEL: v_fabs_v2f16:
324; GFX11:       ; %bb.0:
325; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
326; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
327; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
328; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
329; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
330; GFX11-NEXT:    global_load_b32 v1, v0, s[0:1]
331; GFX11-NEXT:    s_waitcnt vmcnt(0)
332; GFX11-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v1
333; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
334; GFX11-NEXT:    s_endpgm
335  %tid = call i32 @llvm.amdgcn.workitem.id.x()
336  %gep.in = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i32 %tid
337  %gep.out = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i32 %tid
338  %val = load <2 x half>, ptr addrspace(1) %gep.in, align 2
339  %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
340  store <2 x half> %fabs, ptr addrspace(1) %gep.out
341  ret void
342}
343
; fabs of a <2 x half> produced by bitcasting an i32 kernel argument. The
; expected code on every tested target is a single scalar s_and_b32 with
; 0x7fff7fff followed by a 32-bit store to %out.
344define amdgpu_kernel void @fabs_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 {
345; CI-LABEL: fabs_free_v2f16:
346; CI:       ; %bb.0:
347; CI-NEXT:    s_load_dword s2, s[8:9], 0x2
348; CI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
349; CI-NEXT:    s_waitcnt lgkmcnt(0)
350; CI-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
351; CI-NEXT:    v_mov_b32_e32 v0, s0
352; CI-NEXT:    v_mov_b32_e32 v1, s1
353; CI-NEXT:    v_mov_b32_e32 v2, s2
354; CI-NEXT:    flat_store_dword v[0:1], v2
355; CI-NEXT:    s_endpgm
356;
357; VI-LABEL: fabs_free_v2f16:
358; VI:       ; %bb.0:
359; VI-NEXT:    s_load_dword s2, s[8:9], 0x8
360; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
361; VI-NEXT:    s_waitcnt lgkmcnt(0)
362; VI-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
363; VI-NEXT:    v_mov_b32_e32 v0, s0
364; VI-NEXT:    v_mov_b32_e32 v1, s1
365; VI-NEXT:    v_mov_b32_e32 v2, s2
366; VI-NEXT:    flat_store_dword v[0:1], v2
367; VI-NEXT:    s_endpgm
368;
369; GFX9-LABEL: fabs_free_v2f16:
370; GFX9:       ; %bb.0:
371; GFX9-NEXT:    s_load_dword s2, s[8:9], 0x8
372; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
373; GFX9-NEXT:    v_mov_b32_e32 v0, 0
374; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
375; GFX9-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
376; GFX9-NEXT:    v_mov_b32_e32 v1, s2
377; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
378; GFX9-NEXT:    s_endpgm
379;
380; GFX11-LABEL: fabs_free_v2f16:
381; GFX11:       ; %bb.0:
382; GFX11-NEXT:    s_clause 0x1
383; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x8
384; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
385; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
386; GFX11-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
387; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
388; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
389; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
390; GFX11-NEXT:    s_endpgm
391  %bc = bitcast i32 %in to <2 x half>
392  %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %bc)
393  store <2 x half> %fabs, ptr addrspace(1) %out
394  ret void
395}
396
397; FIXME: Should do fabs after conversion to avoid converting multiple
398; times in this particular case.
; Multiplies fabs(%val) by %val itself, so the same loaded vector is used
; both with and without the absolute value. Expected code per target:
;  - kaveri converts each element to f32 twice, once plain and once with the
;    |src| modifier, then multiplies in f32 and repacks the two halves;
;  - tonga uses |v2| modifiers on v_mul_f16_sdwa (high half) and
;    v_mul_f16_e64 (low half) and ORs the results together;
;  - gfx900 and gfx1100 keep an explicit v_and_b32 with 0x7fff7fff feeding
;    v_pk_mul_f16.
399define amdgpu_kernel void @v_fabs_fold_self_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
400; CI-LABEL: v_fabs_fold_self_v2f16:
401; CI:       ; %bb.0:
402; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
403; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
404; CI-NEXT:    s_waitcnt lgkmcnt(0)
405; CI-NEXT:    v_mov_b32_e32 v1, s3
406; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
407; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
408; CI-NEXT:    flat_load_dword v0, v[0:1]
409; CI-NEXT:    s_waitcnt vmcnt(0)
410; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
411; CI-NEXT:    v_cvt_f32_f16_e32 v2, v1
412; CI-NEXT:    v_cvt_f32_f16_e64 v1, |v1|
413; CI-NEXT:    v_cvt_f32_f16_e32 v3, v0
414; CI-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
415; CI-NEXT:    v_mul_f32_e32 v1, v1, v2
416; CI-NEXT:    v_cvt_f16_f32_e32 v2, v1
417; CI-NEXT:    v_mul_f32_e32 v0, v0, v3
418; CI-NEXT:    v_cvt_f16_f32_e32 v3, v0
419; CI-NEXT:    v_mov_b32_e32 v0, s0
420; CI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
421; CI-NEXT:    v_mov_b32_e32 v1, s1
422; CI-NEXT:    v_or_b32_e32 v2, v3, v2
423; CI-NEXT:    flat_store_dword v[0:1], v2
424; CI-NEXT:    s_endpgm
425;
426; VI-LABEL: v_fabs_fold_self_v2f16:
427; VI:       ; %bb.0:
428; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
429; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
430; VI-NEXT:    s_waitcnt lgkmcnt(0)
431; VI-NEXT:    v_mov_b32_e32 v1, s3
432; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
433; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
434; VI-NEXT:    flat_load_dword v2, v[0:1]
435; VI-NEXT:    v_mov_b32_e32 v0, s0
436; VI-NEXT:    v_mov_b32_e32 v1, s1
437; VI-NEXT:    s_waitcnt vmcnt(0)
438; VI-NEXT:    v_mul_f16_sdwa v3, |v2|, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
439; VI-NEXT:    v_mul_f16_e64 v2, |v2|, v2
440; VI-NEXT:    v_or_b32_e32 v2, v2, v3
441; VI-NEXT:    flat_store_dword v[0:1], v2
442; VI-NEXT:    s_endpgm
443;
444; GFX9-LABEL: v_fabs_fold_self_v2f16:
445; GFX9:       ; %bb.0:
446; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
447; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
448; GFX9-NEXT:    v_mov_b32_e32 v1, 0
449; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
450; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
451; GFX9-NEXT:    s_waitcnt vmcnt(0)
452; GFX9-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v0
453; GFX9-NEXT:    v_pk_mul_f16 v0, v2, v0
454; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
455; GFX9-NEXT:    s_endpgm
456;
457; GFX11-LABEL: v_fabs_fold_self_v2f16:
458; GFX11:       ; %bb.0:
459; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
460; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
461; GFX11-NEXT:    v_mov_b32_e32 v2, 0
462; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1)
463; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
464; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
465; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
466; GFX11-NEXT:    s_waitcnt vmcnt(0)
467; GFX11-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v0
468; GFX11-NEXT:    v_pk_mul_f16 v0, v1, v0
469; GFX11-NEXT:    global_store_b32 v2, v0, s[0:1]
470; GFX11-NEXT:    s_endpgm
471  %tid = call i32 @llvm.amdgcn.workitem.id.x()
472  %gep = getelementptr <2 x half>, ptr addrspace(1) %in, i32 %tid
473  %val = load <2 x half>, ptr addrspace(1) %gep
474  %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
475  %fmul = fmul <2 x half> %fabs, %val
476  store <2 x half> %fmul, ptr addrspace(1) %out
477  ret void
478}
479
; Multiplies fabs of a loaded <2 x half> by a second <2 x half> that comes
; from bitcasting the i32 kernel argument %other.val. Expected code per
; target:
;  - kaveri puts the |src| modifier on the f16-to-f32 conversions of the
;    loaded elements and multiplies in f32;
;  - tonga puts |v2| on v_mul_f16_sdwa / v_mul_f16_e64;
;  - gfx900 and gfx1100 emit an explicit v_and_b32 with 0x7fff7fff followed
;    by v_pk_mul_f16 against the scalar operand.
480define amdgpu_kernel void @v_fabs_fold_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %other.val) #0 {
481; CI-LABEL: v_fabs_fold_v2f16:
482; CI:       ; %bb.0:
483; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
484; CI-NEXT:    s_load_dword s4, s[8:9], 0x4
485; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
486; CI-NEXT:    s_waitcnt lgkmcnt(0)
487; CI-NEXT:    v_mov_b32_e32 v1, s3
488; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
489; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
490; CI-NEXT:    flat_load_dword v0, v[0:1]
491; CI-NEXT:    s_lshr_b32 s2, s4, 16
492; CI-NEXT:    v_cvt_f32_f16_e32 v1, s2
493; CI-NEXT:    v_cvt_f32_f16_e32 v3, s4
494; CI-NEXT:    s_waitcnt vmcnt(0)
495; CI-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
496; CI-NEXT:    v_cvt_f32_f16_e64 v2, |v2|
497; CI-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
498; CI-NEXT:    v_mul_f32_e32 v1, v2, v1
499; CI-NEXT:    v_cvt_f16_f32_e32 v2, v1
500; CI-NEXT:    v_mul_f32_e32 v0, v0, v3
501; CI-NEXT:    v_cvt_f16_f32_e32 v3, v0
502; CI-NEXT:    v_mov_b32_e32 v0, s0
503; CI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
504; CI-NEXT:    v_mov_b32_e32 v1, s1
505; CI-NEXT:    v_or_b32_e32 v2, v3, v2
506; CI-NEXT:    flat_store_dword v[0:1], v2
507; CI-NEXT:    s_endpgm
508;
509; VI-LABEL: v_fabs_fold_v2f16:
510; VI:       ; %bb.0:
511; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
512; VI-NEXT:    s_load_dword s4, s[8:9], 0x10
513; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
514; VI-NEXT:    s_waitcnt lgkmcnt(0)
515; VI-NEXT:    v_mov_b32_e32 v1, s3
516; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
517; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
518; VI-NEXT:    flat_load_dword v2, v[0:1]
519; VI-NEXT:    v_mov_b32_e32 v0, s0
520; VI-NEXT:    s_lshr_b32 s0, s4, 16
521; VI-NEXT:    v_mov_b32_e32 v3, s0
522; VI-NEXT:    v_mov_b32_e32 v1, s1
523; VI-NEXT:    s_waitcnt vmcnt(0)
524; VI-NEXT:    v_mul_f16_sdwa v3, |v2|, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
525; VI-NEXT:    v_mul_f16_e64 v2, |v2|, s4
526; VI-NEXT:    v_or_b32_e32 v2, v2, v3
527; VI-NEXT:    flat_store_dword v[0:1], v2
528; VI-NEXT:    s_endpgm
529;
530; GFX9-LABEL: v_fabs_fold_v2f16:
531; GFX9:       ; %bb.0:
532; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
533; GFX9-NEXT:    s_load_dword s4, s[8:9], 0x10
534; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
535; GFX9-NEXT:    v_mov_b32_e32 v1, 0
536; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
537; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
538; GFX9-NEXT:    s_waitcnt vmcnt(0)
539; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
540; GFX9-NEXT:    v_pk_mul_f16 v0, v0, s4
541; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
542; GFX9-NEXT:    s_endpgm
543;
544; GFX11-LABEL: v_fabs_fold_v2f16:
545; GFX11:       ; %bb.0:
546; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
547; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
548; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x10
549; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
550; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
551; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
552; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
553; GFX11-NEXT:    s_waitcnt vmcnt(0)
554; GFX11-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
555; GFX11-NEXT:    v_pk_mul_f16 v0, v0, s4
556; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
557; GFX11-NEXT:    s_endpgm
558  %tid = call i32 @llvm.amdgcn.workitem.id.x()
559  %gep = getelementptr <2 x half>, ptr addrspace(1) %in, i32 %tid
560  %val = load <2 x half>, ptr addrspace(1) %gep
561  %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
562  %other.val.cvt = bitcast i32 %other.val to <2 x half>
563  %fmul = fmul <2 x half> %fabs, %other.val.cvt
564  store <2 x half> %fmul, ptr addrspace(1) %out
565  ret void
566}
567
; Vector fabs whose two elements are extracted and each fed to a scalar
; floating-point op (fmul by 4.0, fadd of 2.0). No separate AND is expected
; on any tested target; the absolute value shows up as a |src| modifier on
; the per-element instruction (the f16-to-f32 conversions on kaveri, the
; f16 multiply/add on the other targets). Results are written with volatile
; stores to an undef pointer so that both values stay live.
568define amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) %in) #0 {
569; CI-LABEL: v_extract_fabs_fold_v2f16:
570; CI:       ; %bb.0:
571; CI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
572; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
573; CI-NEXT:    s_waitcnt lgkmcnt(0)
574; CI-NEXT:    v_mov_b32_e32 v1, s1
575; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
576; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
577; CI-NEXT:    flat_load_dword v0, v[0:1]
578; CI-NEXT:    s_waitcnt vmcnt(0)
579; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
580; CI-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
581; CI-NEXT:    v_cvt_f32_f16_e64 v1, |v1|
582; CI-NEXT:    v_mul_f32_e32 v0, 4.0, v0
583; CI-NEXT:    v_add_f32_e32 v1, 2.0, v1
584; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
585; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
586; CI-NEXT:    flat_store_short v[0:1], v0
587; CI-NEXT:    s_waitcnt vmcnt(0)
588; CI-NEXT:    flat_store_short v[0:1], v1
589; CI-NEXT:    s_waitcnt vmcnt(0)
590; CI-NEXT:    s_endpgm
591;
592; VI-LABEL: v_extract_fabs_fold_v2f16:
593; VI:       ; %bb.0:
594; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
595; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
596; VI-NEXT:    s_waitcnt lgkmcnt(0)
597; VI-NEXT:    v_mov_b32_e32 v1, s1
598; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
599; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
600; VI-NEXT:    flat_load_dword v0, v[0:1]
601; VI-NEXT:    v_mov_b32_e32 v1, 0x4000
602; VI-NEXT:    s_waitcnt vmcnt(0)
603; VI-NEXT:    v_mul_f16_e64 v2, |v0|, 4.0
604; VI-NEXT:    v_add_f16_sdwa v0, |v0|, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
605; VI-NEXT:    flat_store_short v[0:1], v2
606; VI-NEXT:    s_waitcnt vmcnt(0)
607; VI-NEXT:    flat_store_short v[0:1], v0
608; VI-NEXT:    s_waitcnt vmcnt(0)
609; VI-NEXT:    s_endpgm
610;
611; GFX9-LABEL: v_extract_fabs_fold_v2f16:
612; GFX9:       ; %bb.0:
613; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
614; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
615; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4000
616; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
617; GFX9-NEXT:    global_load_dword v0, v0, s[0:1]
618; GFX9-NEXT:    s_waitcnt vmcnt(0)
619; GFX9-NEXT:    v_mul_f16_e64 v2, |v0|, 4.0
620; GFX9-NEXT:    v_add_f16_sdwa v0, |v0|, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
621; GFX9-NEXT:    global_store_short v[0:1], v2, off
622; GFX9-NEXT:    s_waitcnt vmcnt(0)
623; GFX9-NEXT:    global_store_short v[0:1], v0, off
624; GFX9-NEXT:    s_waitcnt vmcnt(0)
625; GFX9-NEXT:    s_endpgm
626;
627; GFX11-LABEL: v_extract_fabs_fold_v2f16:
628; GFX11:       ; %bb.0:
629; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
630; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
631; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
632; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
633; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
634; GFX11-NEXT:    global_load_b32 v0, v0, s[0:1]
635; GFX11-NEXT:    s_waitcnt vmcnt(0)
636; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
637; GFX11-NEXT:    v_mul_f16_e64 v0, |v0|, 4.0
638; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
639; GFX11-NEXT:    v_add_f16_e64 v1, |v1|, 2.0
640; GFX11-NEXT:    global_store_b16 v[0:1], v0, off dlc
641; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
642; GFX11-NEXT:    global_store_b16 v[0:1], v1, off dlc
643; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
644; GFX11-NEXT:    s_endpgm
645  %tid = call i32 @llvm.amdgcn.workitem.id.x()
646  %gep.in = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i32 %tid
647  %val = load <2 x half>, ptr addrspace(1) %gep.in
648  %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
649  %elt0 = extractelement <2 x half> %fabs, i32 0
650  %elt1 = extractelement <2 x half> %fabs, i32 1
651
652  %fmul0 = fmul half %elt0, 4.0
653  %fadd1 = fadd half %elt1, 2.0
654  store volatile half %fmul0, ptr addrspace(1) undef
655  store volatile half %fadd1, ptr addrspace(1) undef
656  ret void
657}
658
; Counterpart to v_extract_fabs_fold_v2f16: the two extracted fabs elements
; are stored directly, with no floating-point user that could take a |src|
; modifier. An explicit bit operation is therefore expected on every tested
; target: v_and_b32 (plus v_bfe_u32 for the high element on kaveri and
; tonga), with gfx900/gfx1100 storing the high half via a d16_hi store.
659define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(ptr addrspace(1) %in) #0 {
660; CI-LABEL: v_extract_fabs_no_fold_v2f16:
661; CI:       ; %bb.0:
662; CI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
663; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
664; CI-NEXT:    s_waitcnt lgkmcnt(0)
665; CI-NEXT:    v_mov_b32_e32 v1, s1
666; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
667; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
668; CI-NEXT:    flat_load_dword v0, v[0:1]
669; CI-NEXT:    s_waitcnt vmcnt(0)
670; CI-NEXT:    v_bfe_u32 v1, v0, 16, 15
671; CI-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
672; CI-NEXT:    flat_store_short v[0:1], v0
673; CI-NEXT:    s_waitcnt vmcnt(0)
674; CI-NEXT:    flat_store_short v[0:1], v1
675; CI-NEXT:    s_waitcnt vmcnt(0)
676; CI-NEXT:    s_endpgm
677;
678; VI-LABEL: v_extract_fabs_no_fold_v2f16:
679; VI:       ; %bb.0:
680; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
681; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
682; VI-NEXT:    s_waitcnt lgkmcnt(0)
683; VI-NEXT:    v_mov_b32_e32 v1, s1
684; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
685; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
686; VI-NEXT:    flat_load_dword v0, v[0:1]
687; VI-NEXT:    s_waitcnt vmcnt(0)
688; VI-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v0
689; VI-NEXT:    v_bfe_u32 v0, v0, 16, 15
690; VI-NEXT:    flat_store_short v[0:1], v1
691; VI-NEXT:    s_waitcnt vmcnt(0)
692; VI-NEXT:    flat_store_short v[0:1], v0
693; VI-NEXT:    s_waitcnt vmcnt(0)
694; VI-NEXT:    s_endpgm
695;
696; GFX9-LABEL: v_extract_fabs_no_fold_v2f16:
697; GFX9:       ; %bb.0:
698; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
699; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
700; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
701; GFX9-NEXT:    global_load_dword v0, v0, s[0:1]
702; GFX9-NEXT:    s_waitcnt vmcnt(0)
703; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
704; GFX9-NEXT:    global_store_short v[0:1], v0, off
705; GFX9-NEXT:    s_waitcnt vmcnt(0)
706; GFX9-NEXT:    global_store_short_d16_hi v[0:1], v0, off
707; GFX9-NEXT:    s_waitcnt vmcnt(0)
708; GFX9-NEXT:    s_endpgm
709;
710; GFX11-LABEL: v_extract_fabs_no_fold_v2f16:
711; GFX11:       ; %bb.0:
712; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
713; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
714; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
715; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
716; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
717; GFX11-NEXT:    global_load_b32 v0, v0, s[0:1]
718; GFX11-NEXT:    s_waitcnt vmcnt(0)
719; GFX11-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
720; GFX11-NEXT:    global_store_b16 v[0:1], v0, off dlc
721; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
722; GFX11-NEXT:    global_store_d16_hi_b16 v[0:1], v0, off dlc
723; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
724; GFX11-NEXT:    s_endpgm
725  %tid = call i32 @llvm.amdgcn.workitem.id.x()
726  %gep.in = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i32 %tid
727  %val = load <2 x half>, ptr addrspace(1) %gep.in
728  %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
729  %elt0 = extractelement <2 x half> %fabs, i32 0
730  %elt1 = extractelement <2 x half> %fabs, i32 1
731  store volatile half %elt0, ptr addrspace(1) undef
732  store volatile half %elt1, ptr addrspace(1) undef
733  ret void
734}
735
; Intrinsics called by the kernels in this file: scalar, 2-element and
; 4-element half fabs, plus the workitem id used to index memory.
736declare half @llvm.fabs.f16(half) #1
737declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #1
738declare <4 x half> @llvm.fabs.v4f16(<4 x half>) #1
739declare i32 @llvm.amdgcn.workitem.id.x() #1
740
; Attribute group #0 is referenced by some of the kernel definitions in this
; file; group #1 is referenced by the intrinsic declarations.
741attributes #0 = { nounwind }
742attributes #1 = { nounwind readnone }
743
744