xref: /llvm-project/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
3; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
4; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
5; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
6
; Basic f32 case: out[tid] = minnum(maxnum(aptr[tid] + 1.0, 0.0), 1.0).
; Every check prefix below expects the max/min pair to vanish into a single
; v_add_f32_e64 that carries the clamp modifier.
7define amdgpu_kernel void @v_clamp_add_src_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
8; SI-LABEL: v_clamp_add_src_f32:
9; SI:       ; %bb.0:
10; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
11; SI-NEXT:    s_mov_b32 s7, 0xf000
12; SI-NEXT:    s_mov_b32 s6, 0
13; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
14; SI-NEXT:    v_mov_b32_e32 v1, 0
15; SI-NEXT:    s_waitcnt lgkmcnt(0)
16; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
17; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
18; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
19; SI-NEXT:    s_waitcnt vmcnt(0)
20; SI-NEXT:    v_add_f32_e64 v2, v2, 1.0 clamp
21; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
22; SI-NEXT:    s_endpgm
23;
24; GFX8-LABEL: v_clamp_add_src_f32:
25; GFX8:       ; %bb.0:
26; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
27; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
28; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
29; GFX8-NEXT:    v_mov_b32_e32 v1, s3
30; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
31; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
32; GFX8-NEXT:    flat_load_dword v3, v[0:1]
33; GFX8-NEXT:    v_mov_b32_e32 v1, s1
34; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
35; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
36; GFX8-NEXT:    s_waitcnt vmcnt(0)
37; GFX8-NEXT:    v_add_f32_e64 v2, v3, 1.0 clamp
38; GFX8-NEXT:    flat_store_dword v[0:1], v2
39; GFX8-NEXT:    s_endpgm
40;
41; GFX9-LABEL: v_clamp_add_src_f32:
42; GFX9:       ; %bb.0:
43; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
44; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
45; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
46; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
47; GFX9-NEXT:    s_waitcnt vmcnt(0)
48; GFX9-NEXT:    v_add_f32_e64 v1, v1, 1.0 clamp
49; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
50; GFX9-NEXT:    s_endpgm
51;
52; GFX11-LABEL: v_clamp_add_src_f32:
53; GFX11:       ; %bb.0:
54; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
55; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
56; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
57; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
58; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
59; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
60; GFX11-NEXT:    s_waitcnt vmcnt(0)
61; GFX11-NEXT:    v_add_f32_e64 v1, v1, 1.0 clamp
62; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
63; GFX11-NEXT:    s_endpgm
64  %tid = call i32 @llvm.amdgcn.workitem.id.x()
65  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
66  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
67  %a = load float, ptr addrspace(1) %gep0
68  %add = fadd float %a, 1.0
  ; maxnum(x, 0.0) followed by minnum(.., 1.0) is the clamp-to-[0, 1] pattern
  ; under test; %add is its only input and has no other user here.
69  %max = call float @llvm.maxnum.f32(float %add, float 0.0)
70  %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
71  store float %clamp, ptr addrspace(1) %out.gep
72  ret void
73}
74
; Same clamp pattern as the basic f32 case, but %add has a second, non-debug
; user (a volatile store). The checks expect the add to stay a plain
; v_add_f32_e32 and the clamp to be emitted separately as
; "v_max_f32_e64 x, x clamp", so the unclamped sum remains available.
75define amdgpu_kernel void @v_clamp_multi_use_src_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
76; SI-LABEL: v_clamp_multi_use_src_f32:
77; SI:       ; %bb.0:
78; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
79; SI-NEXT:    s_mov_b32 s6, 0
80; SI-NEXT:    s_mov_b32 s7, 0xf000
81; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
82; SI-NEXT:    v_mov_b32_e32 v1, 0
83; SI-NEXT:    s_waitcnt lgkmcnt(0)
84; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
85; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
86; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
87; SI-NEXT:    s_mov_b32 s6, -1
88; SI-NEXT:    s_waitcnt vmcnt(0)
89; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
90; SI-NEXT:    v_max_f32_e64 v3, v2, v2 clamp
91; SI-NEXT:    buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
92; SI-NEXT:    buffer_store_dword v2, off, s[4:7], 0
93; SI-NEXT:    s_waitcnt vmcnt(0)
94; SI-NEXT:    s_endpgm
95;
96; GFX8-LABEL: v_clamp_multi_use_src_f32:
97; GFX8:       ; %bb.0:
98; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
99; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
100; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
101; GFX8-NEXT:    v_mov_b32_e32 v1, s3
102; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
103; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
104; GFX8-NEXT:    flat_load_dword v3, v[0:1]
105; GFX8-NEXT:    v_mov_b32_e32 v1, s1
106; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
107; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
108; GFX8-NEXT:    s_waitcnt vmcnt(0)
109; GFX8-NEXT:    v_add_f32_e32 v2, 1.0, v3
110; GFX8-NEXT:    v_max_f32_e64 v3, v2, v2 clamp
111; GFX8-NEXT:    flat_store_dword v[0:1], v3
112; GFX8-NEXT:    flat_store_dword v[0:1], v2
113; GFX8-NEXT:    s_waitcnt vmcnt(0)
114; GFX8-NEXT:    s_endpgm
115;
116; GFX9-LABEL: v_clamp_multi_use_src_f32:
117; GFX9:       ; %bb.0:
118; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
119; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
120; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
121; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
122; GFX9-NEXT:    s_waitcnt vmcnt(0)
123; GFX9-NEXT:    v_add_f32_e32 v1, 1.0, v1
124; GFX9-NEXT:    v_max_f32_e64 v2, v1, v1 clamp
125; GFX9-NEXT:    global_store_dword v0, v2, s[0:1]
126; GFX9-NEXT:    global_store_dword v[0:1], v1, off
127; GFX9-NEXT:    s_waitcnt vmcnt(0)
128; GFX9-NEXT:    s_endpgm
129;
130; GFX11-LABEL: v_clamp_multi_use_src_f32:
131; GFX11:       ; %bb.0:
132; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
133; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
134; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
135; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
136; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
137; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
138; GFX11-NEXT:    s_waitcnt vmcnt(0)
139; GFX11-NEXT:    v_add_f32_e32 v1, 1.0, v1
140; GFX11-NEXT:    v_max_f32_e64 v2, v1, v1 clamp
141; GFX11-NEXT:    global_store_b32 v0, v2, s[0:1]
142; GFX11-NEXT:    global_store_b32 v[0:1], v1, off dlc
143; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
144; GFX11-NEXT:    s_endpgm
145  %tid = call i32 @llvm.amdgcn.workitem.id.x()
146  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
147  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
148  %a = load float, ptr addrspace(1) %gep0
149  %add = fadd float %a, 1.0
150  %max = call float @llvm.maxnum.f32(float %add, float 0.0)
151  %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
152  store float %clamp, ptr addrspace(1) %out.gep
  ; Second use of the unclamped %add. The store is volatile so it is kept, and
  ; the destination address is irrelevant to the test (undef pointer).
153  store volatile float %add, ptr addrspace(1) undef
154  ret void
155}
156
; Same clamp pattern as the basic f32 case, except %add is also referenced by
; an llvm.dbg.value call. The checks expect exactly the same code as the basic
; case (a single v_add_f32_e64 with clamp), i.e. the debug-only use must not
; count as an extra user of %add.
157define amdgpu_kernel void @v_clamp_dbg_use_src_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
158; SI-LABEL: v_clamp_dbg_use_src_f32:
159; SI:       ; %bb.0:
160; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
161; SI-NEXT:    s_mov_b32 s7, 0xf000
162; SI-NEXT:    s_mov_b32 s6, 0
163; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
164; SI-NEXT:    v_mov_b32_e32 v1, 0
165; SI-NEXT:    s_waitcnt lgkmcnt(0)
166; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
167; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
168; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
169; SI-NEXT:    s_waitcnt vmcnt(0)
170; SI-NEXT:    v_add_f32_e64 v2, v2, 1.0 clamp
171; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
172; SI-NEXT:    s_endpgm
173;
174; GFX8-LABEL: v_clamp_dbg_use_src_f32:
175; GFX8:       ; %bb.0:
176; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
177; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
178; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
179; GFX8-NEXT:    v_mov_b32_e32 v1, s3
180; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
181; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
182; GFX8-NEXT:    flat_load_dword v3, v[0:1]
183; GFX8-NEXT:    v_mov_b32_e32 v1, s1
184; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
185; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
186; GFX8-NEXT:    s_waitcnt vmcnt(0)
187; GFX8-NEXT:    v_add_f32_e64 v2, v3, 1.0 clamp
188; GFX8-NEXT:    flat_store_dword v[0:1], v2
189; GFX8-NEXT:    s_endpgm
190;
191; GFX9-LABEL: v_clamp_dbg_use_src_f32:
192; GFX9:       ; %bb.0:
193; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
194; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
195; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
196; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
197; GFX9-NEXT:    s_waitcnt vmcnt(0)
198; GFX9-NEXT:    v_add_f32_e64 v1, v1, 1.0 clamp
199; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
200; GFX9-NEXT:    s_endpgm
201;
202; GFX11-LABEL: v_clamp_dbg_use_src_f32:
203; GFX11:       ; %bb.0:
204; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
205; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
206; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
207; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
208; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
209; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
210; GFX11-NEXT:    s_waitcnt vmcnt(0)
211; GFX11-NEXT:    v_add_f32_e64 v1, v1, 1.0 clamp
212; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
213; GFX11-NEXT:    s_endpgm
214  %tid = call i32 @llvm.amdgcn.workitem.id.x()
215  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
216  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
217  %a = load float, ptr addrspace(1) %gep0
218  %add = fadd float %a, 1.0
  ; Debug-only reference to %add; the metadata nodes !4, !9 and !10 are defined
  ; outside this function.
219  call void @llvm.dbg.value(metadata float %add, i64 0, metadata !4, metadata !9), !dbg !10
220  %max = call float @llvm.maxnum.f32(float %add, float 0.0)
221  %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
222  store float %clamp, ptr addrspace(1) %out.gep
223  ret void
224}
225
; Clamp applied to a negated value: out[tid] = clamp(-floor(aptr[tid])).
; Despite the "add" in the name, the IR below uses llvm.floor, not fadd.
; The checks expect v_floor_f32_e32 to stay unmodified and the clamp to be
; emitted as a separate v_max_f32_e64 whose two sources both carry the
; negation ("-x, -x clamp").
226define amdgpu_kernel void @v_clamp_add_neg_src_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
227; SI-LABEL: v_clamp_add_neg_src_f32:
228; SI:       ; %bb.0:
229; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
230; SI-NEXT:    s_mov_b32 s7, 0xf000
231; SI-NEXT:    s_mov_b32 s6, 0
232; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
233; SI-NEXT:    v_mov_b32_e32 v1, 0
234; SI-NEXT:    s_waitcnt lgkmcnt(0)
235; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
236; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
237; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
238; SI-NEXT:    s_waitcnt vmcnt(0)
239; SI-NEXT:    v_floor_f32_e32 v2, v2
240; SI-NEXT:    v_max_f32_e64 v2, -v2, -v2 clamp
241; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
242; SI-NEXT:    s_endpgm
243;
244; GFX8-LABEL: v_clamp_add_neg_src_f32:
245; GFX8:       ; %bb.0:
246; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
247; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
248; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
249; GFX8-NEXT:    v_mov_b32_e32 v1, s3
250; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
251; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
252; GFX8-NEXT:    flat_load_dword v3, v[0:1]
253; GFX8-NEXT:    v_mov_b32_e32 v1, s1
254; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
255; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
256; GFX8-NEXT:    s_waitcnt vmcnt(0)
257; GFX8-NEXT:    v_floor_f32_e32 v2, v3
258; GFX8-NEXT:    v_max_f32_e64 v2, -v2, -v2 clamp
259; GFX8-NEXT:    flat_store_dword v[0:1], v2
260; GFX8-NEXT:    s_endpgm
261;
262; GFX9-LABEL: v_clamp_add_neg_src_f32:
263; GFX9:       ; %bb.0:
264; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
265; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
266; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
267; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
268; GFX9-NEXT:    s_waitcnt vmcnt(0)
269; GFX9-NEXT:    v_floor_f32_e32 v1, v1
270; GFX9-NEXT:    v_max_f32_e64 v1, -v1, -v1 clamp
271; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
272; GFX9-NEXT:    s_endpgm
273;
274; GFX11-LABEL: v_clamp_add_neg_src_f32:
275; GFX11:       ; %bb.0:
276; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
277; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
278; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
279; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
280; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
281; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
282; GFX11-NEXT:    s_waitcnt vmcnt(0)
283; GFX11-NEXT:    v_floor_f32_e32 v1, v1
284; GFX11-NEXT:    v_max_f32_e64 v1, -v1, -v1 clamp
285; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
286; GFX11-NEXT:    s_endpgm
287  %tid = call i32 @llvm.amdgcn.workitem.id.x()
288  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
289  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
290  %a = load float, ptr addrspace(1) %gep0
291  %floor = call float @llvm.floor.f32(float %a)
  ; The fneg sits between the defining instruction and the clamp pattern.
292  %neg.floor = fneg float %floor
293  %max = call float @llvm.maxnum.f32(float %neg.floor, float 0.0)
294  %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
295  store float %clamp, ptr addrspace(1) %out.gep
296  ret void
297}
298
; Negative test: only maxnum(x, 0.0) is applied, with no minnum against 1.0,
; so this is not a clamp to [0, 1]. The checks expect a plain v_add_f32_e32
; followed by a plain v_max_f32_e32 against 0, with no clamp modifier anywhere.
299define amdgpu_kernel void @v_non_clamp_max_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
300; SI-LABEL: v_non_clamp_max_f32:
301; SI:       ; %bb.0:
302; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
303; SI-NEXT:    s_mov_b32 s7, 0xf000
304; SI-NEXT:    s_mov_b32 s6, 0
305; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
306; SI-NEXT:    v_mov_b32_e32 v1, 0
307; SI-NEXT:    s_waitcnt lgkmcnt(0)
308; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
309; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
310; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
311; SI-NEXT:    s_waitcnt vmcnt(0)
312; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
313; SI-NEXT:    v_max_f32_e32 v2, 0, v2
314; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
315; SI-NEXT:    s_endpgm
316;
317; GFX8-LABEL: v_non_clamp_max_f32:
318; GFX8:       ; %bb.0:
319; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
320; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
321; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
322; GFX8-NEXT:    v_mov_b32_e32 v1, s3
323; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
324; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
325; GFX8-NEXT:    flat_load_dword v3, v[0:1]
326; GFX8-NEXT:    v_mov_b32_e32 v1, s1
327; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
328; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
329; GFX8-NEXT:    s_waitcnt vmcnt(0)
330; GFX8-NEXT:    v_add_f32_e32 v2, 1.0, v3
331; GFX8-NEXT:    v_max_f32_e32 v2, 0, v2
332; GFX8-NEXT:    flat_store_dword v[0:1], v2
333; GFX8-NEXT:    s_endpgm
334;
335; GFX9-LABEL: v_non_clamp_max_f32:
336; GFX9:       ; %bb.0:
337; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
338; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
339; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
340; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
341; GFX9-NEXT:    s_waitcnt vmcnt(0)
342; GFX9-NEXT:    v_add_f32_e32 v1, 1.0, v1
343; GFX9-NEXT:    v_max_f32_e32 v1, 0, v1
344; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
345; GFX9-NEXT:    s_endpgm
346;
347; GFX11-LABEL: v_non_clamp_max_f32:
348; GFX11:       ; %bb.0:
349; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
350; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
351; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
352; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
353; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
354; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
355; GFX11-NEXT:    s_waitcnt vmcnt(0)
356; GFX11-NEXT:    v_add_f32_e32 v1, 1.0, v1
357; GFX11-NEXT:    v_max_f32_e32 v1, 0, v1
358; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
359; GFX11-NEXT:    s_endpgm
360  %tid = call i32 @llvm.amdgcn.workitem.id.x()
361  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
362  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
363  %a = load float, ptr addrspace(1) %gep0
364  %add = fadd float %a, 1.0
  ; Lower bound only; the result of maxnum is stored directly.
365  %max = call float @llvm.maxnum.f32(float %add, float 0.0)
366  store float %max, ptr addrspace(1) %out.gep
367  ret void
368}
369
; Same IR as the basic f32 clamp case, but compiled with attribute group #2
; instead of #0. NOTE(review): #2 is defined outside this view; going by the
; function name it presumably enables f32 denormals - confirm against the
; attribute definitions at the end of the file.
; The checks expect the same folded "v_add_f32_e64 ... clamp" as the basic case.
370define amdgpu_kernel void @v_clamp_add_src_f32_denormals(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #2 {
371; SI-LABEL: v_clamp_add_src_f32_denormals:
372; SI:       ; %bb.0:
373; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
374; SI-NEXT:    s_mov_b32 s7, 0xf000
375; SI-NEXT:    s_mov_b32 s6, 0
376; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
377; SI-NEXT:    v_mov_b32_e32 v1, 0
378; SI-NEXT:    s_waitcnt lgkmcnt(0)
379; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
380; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
381; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
382; SI-NEXT:    s_waitcnt vmcnt(0)
383; SI-NEXT:    v_add_f32_e64 v2, v2, 1.0 clamp
384; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
385; SI-NEXT:    s_endpgm
386;
387; GFX8-LABEL: v_clamp_add_src_f32_denormals:
388; GFX8:       ; %bb.0:
389; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
390; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
391; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
392; GFX8-NEXT:    v_mov_b32_e32 v1, s3
393; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
394; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
395; GFX8-NEXT:    flat_load_dword v3, v[0:1]
396; GFX8-NEXT:    v_mov_b32_e32 v1, s1
397; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
398; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
399; GFX8-NEXT:    s_waitcnt vmcnt(0)
400; GFX8-NEXT:    v_add_f32_e64 v2, v3, 1.0 clamp
401; GFX8-NEXT:    flat_store_dword v[0:1], v2
402; GFX8-NEXT:    s_endpgm
403;
404; GFX9-LABEL: v_clamp_add_src_f32_denormals:
405; GFX9:       ; %bb.0:
406; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
407; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
408; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
409; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
410; GFX9-NEXT:    s_waitcnt vmcnt(0)
411; GFX9-NEXT:    v_add_f32_e64 v1, v1, 1.0 clamp
412; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
413; GFX9-NEXT:    s_endpgm
414;
415; GFX11-LABEL: v_clamp_add_src_f32_denormals:
416; GFX11:       ; %bb.0:
417; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
418; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
419; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
420; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
421; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
422; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
423; GFX11-NEXT:    s_waitcnt vmcnt(0)
424; GFX11-NEXT:    v_add_f32_e64 v1, v1, 1.0 clamp
425; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
426; GFX11-NEXT:    s_endpgm
427  %tid = call i32 @llvm.amdgcn.workitem.id.x()
428  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
429  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
430  %a = load float, ptr addrspace(1) %gep0
431  %add = fadd float %a, 1.0
432  %max = call float @llvm.maxnum.f32(float %add, float 0.0)
433  %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
434  store float %clamp, ptr addrspace(1) %out.gep
435  ret void
436}
437
; Half-precision variant of the basic clamp case, using attribute group #0.
; The tahiti checks expect the value to be converted to f32, clamped via
; "v_add_f32_e64 ... clamp", and converted back to f16; the fiji, gfx900 and
; gfx1100 checks expect a single native "v_add_f16_e64 ... clamp".
438define amdgpu_kernel void @v_clamp_add_src_f16_denorm(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
439; SI-LABEL: v_clamp_add_src_f16_denorm:
440; SI:       ; %bb.0:
441; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
442; SI-NEXT:    s_mov_b32 s7, 0xf000
443; SI-NEXT:    s_mov_b32 s6, 0
444; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
445; SI-NEXT:    v_mov_b32_e32 v1, 0
446; SI-NEXT:    s_waitcnt lgkmcnt(0)
447; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
448; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64
449; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
450; SI-NEXT:    s_waitcnt vmcnt(0)
451; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
452; SI-NEXT:    v_add_f32_e64 v2, v2, 1.0 clamp
453; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
454; SI-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
455; SI-NEXT:    s_endpgm
456;
457; GFX8-LABEL: v_clamp_add_src_f16_denorm:
458; GFX8:       ; %bb.0:
459; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
460; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
461; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
462; GFX8-NEXT:    v_mov_b32_e32 v1, s3
463; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
464; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
465; GFX8-NEXT:    flat_load_ushort v3, v[0:1]
466; GFX8-NEXT:    v_mov_b32_e32 v1, s1
467; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
468; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
469; GFX8-NEXT:    s_waitcnt vmcnt(0)
470; GFX8-NEXT:    v_add_f16_e64 v2, v3, 1.0 clamp
471; GFX8-NEXT:    flat_store_short v[0:1], v2
472; GFX8-NEXT:    s_endpgm
473;
474; GFX9-LABEL: v_clamp_add_src_f16_denorm:
475; GFX9:       ; %bb.0:
476; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
477; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
478; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
479; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3]
480; GFX9-NEXT:    s_waitcnt vmcnt(0)
481; GFX9-NEXT:    v_add_f16_e64 v1, v1, 1.0 clamp
482; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
483; GFX9-NEXT:    s_endpgm
484;
485; GFX11-LABEL: v_clamp_add_src_f16_denorm:
486; GFX11:       ; %bb.0:
487; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
488; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
489; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
490; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
491; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
492; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3]
493; GFX11-NEXT:    s_waitcnt vmcnt(0)
494; GFX11-NEXT:    v_add_f16_e64 v1, v1, 1.0 clamp
495; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
496; GFX11-NEXT:    s_endpgm
497  %tid = call i32 @llvm.amdgcn.workitem.id.x()
498  %gep0 = getelementptr half, ptr addrspace(1) %aptr, i32 %tid
499  %out.gep = getelementptr half, ptr addrspace(1) %out, i32 %tid
500  %a = load half, ptr addrspace(1) %gep0
501  %add = fadd half %a, 1.0
  ; Same clamp-to-[0, 1] pattern as the f32 tests, on half operands.
502  %max = call half @llvm.maxnum.f16(half %add, half 0.0)
503  %clamp = call half @llvm.minnum.f16(half %max, half 1.0)
504  store half %clamp, ptr addrspace(1) %out.gep
505  ret void
506}
507
; Same IR as the f16 denorm clamp test, but compiled with attribute group #3.
; NOTE(review): #3 is defined outside this view; going by the function name it
; presumably disables f16 denormals - confirm against the attribute
; definitions at the end of the file.
; The expected code matches the f16 denorm test: f32 round-trip with a clamped
; v_add_f32_e64 on tahiti, native "v_add_f16_e64 ... clamp" on the other targets.
508define amdgpu_kernel void @v_clamp_add_src_f16_no_denormals(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #3 {
509; SI-LABEL: v_clamp_add_src_f16_no_denormals:
510; SI:       ; %bb.0:
511; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
512; SI-NEXT:    s_mov_b32 s7, 0xf000
513; SI-NEXT:    s_mov_b32 s6, 0
514; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
515; SI-NEXT:    v_mov_b32_e32 v1, 0
516; SI-NEXT:    s_waitcnt lgkmcnt(0)
517; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
518; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64
519; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
520; SI-NEXT:    s_waitcnt vmcnt(0)
521; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
522; SI-NEXT:    v_add_f32_e64 v2, v2, 1.0 clamp
523; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
524; SI-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
525; SI-NEXT:    s_endpgm
526;
527; GFX8-LABEL: v_clamp_add_src_f16_no_denormals:
528; GFX8:       ; %bb.0:
529; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
530; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
531; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
532; GFX8-NEXT:    v_mov_b32_e32 v1, s3
533; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
534; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
535; GFX8-NEXT:    flat_load_ushort v3, v[0:1]
536; GFX8-NEXT:    v_mov_b32_e32 v1, s1
537; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
538; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
539; GFX8-NEXT:    s_waitcnt vmcnt(0)
540; GFX8-NEXT:    v_add_f16_e64 v2, v3, 1.0 clamp
541; GFX8-NEXT:    flat_store_short v[0:1], v2
542; GFX8-NEXT:    s_endpgm
543;
544; GFX9-LABEL: v_clamp_add_src_f16_no_denormals:
545; GFX9:       ; %bb.0:
546; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
547; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
548; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
549; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3]
550; GFX9-NEXT:    s_waitcnt vmcnt(0)
551; GFX9-NEXT:    v_add_f16_e64 v1, v1, 1.0 clamp
552; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
553; GFX9-NEXT:    s_endpgm
554;
555; GFX11-LABEL: v_clamp_add_src_f16_no_denormals:
556; GFX11:       ; %bb.0:
557; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
558; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
559; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
560; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
561; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
562; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3]
563; GFX11-NEXT:    s_waitcnt vmcnt(0)
564; GFX11-NEXT:    v_add_f16_e64 v1, v1, 1.0 clamp
565; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
566; GFX11-NEXT:    s_endpgm
567  %tid = call i32 @llvm.amdgcn.workitem.id.x()
568  %gep0 = getelementptr half, ptr addrspace(1) %aptr, i32 %tid
569  %out.gep = getelementptr half, ptr addrspace(1) %out, i32 %tid
570  %a = load half, ptr addrspace(1) %gep0
571  %add = fadd half %a, 1.0
572  %max = call half @llvm.maxnum.f16(half %add, half 0.0)
573  %clamp = call half @llvm.minnum.f16(half %max, half 1.0)
574  store half %clamp, ptr addrspace(1) %out.gep
575  ret void
576}
577
; Vector variant: the clamp pattern is applied to a <2 x float> value using the
; v2f32 maxnum/minnum intrinsics with splat bounds. The checks expect the
; operation to be split per element, with one "v_add_f32_e64 ... clamp" for
; each of the two lanes and a single 64-bit load and store.
578define amdgpu_kernel void @v_clamp_add_src_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
579; SI-LABEL: v_clamp_add_src_v2f32:
580; SI:       ; %bb.0:
581; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
582; SI-NEXT:    s_mov_b32 s7, 0xf000
583; SI-NEXT:    s_mov_b32 s6, 0
584; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
585; SI-NEXT:    v_mov_b32_e32 v1, 0
586; SI-NEXT:    s_waitcnt lgkmcnt(0)
587; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
588; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
589; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
590; SI-NEXT:    s_waitcnt vmcnt(0)
591; SI-NEXT:    v_add_f32_e64 v2, v2, 1.0 clamp
592; SI-NEXT:    v_add_f32_e64 v3, v3, 1.0 clamp
593; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
594; SI-NEXT:    s_endpgm
595;
596; GFX8-LABEL: v_clamp_add_src_v2f32:
597; GFX8:       ; %bb.0:
598; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
599; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
600; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
601; GFX8-NEXT:    v_mov_b32_e32 v1, s3
602; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
603; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
604; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
605; GFX8-NEXT:    v_mov_b32_e32 v3, s1
606; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
607; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
608; GFX8-NEXT:    s_waitcnt vmcnt(0)
609; GFX8-NEXT:    v_add_f32_e64 v0, v0, 1.0 clamp
610; GFX8-NEXT:    v_add_f32_e64 v1, v1, 1.0 clamp
611; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
612; GFX8-NEXT:    s_endpgm
613;
614; GFX9-LABEL: v_clamp_add_src_v2f32:
615; GFX9:       ; %bb.0:
616; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
617; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
618; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
619; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
620; GFX9-NEXT:    s_waitcnt vmcnt(0)
621; GFX9-NEXT:    v_add_f32_e64 v0, v0, 1.0 clamp
622; GFX9-NEXT:    v_add_f32_e64 v1, v1, 1.0 clamp
623; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
624; GFX9-NEXT:    s_endpgm
625;
626; GFX11-LABEL: v_clamp_add_src_v2f32:
627; GFX11:       ; %bb.0:
628; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
629; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
630; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
631; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
632; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
633; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
634; GFX11-NEXT:    s_waitcnt vmcnt(0)
635; GFX11-NEXT:    v_add_f32_e64 v0, v0, 1.0 clamp
636; GFX11-NEXT:    v_add_f32_e64 v1, v1, 1.0 clamp
637; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
638; GFX11-NEXT:    s_endpgm
639  %tid = call i32 @llvm.amdgcn.workitem.id.x()
640  %gep0 = getelementptr <2 x float>, ptr addrspace(1) %aptr, i32 %tid
641  %out.gep = getelementptr <2 x float>, ptr addrspace(1) %out, i32 %tid
642  %a = load <2 x float>, ptr addrspace(1) %gep0
643  %add = fadd <2 x float> %a, <float 1.0, float 1.0>
  ; Lower bound is a zero splat, upper bound is a 1.0 splat.
644  %max = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %add, <2 x float> zeroinitializer)
645  %clamp = call <2 x float> @llvm.minnum.v2f32(<2 x float> %max, <2 x float> <float 1.0, float 1.0>)
646  store <2 x float> %clamp, ptr addrspace(1) %out.gep
647  ret void
648}
649
; Double-precision variant of the basic clamp case. Every check prefix expects
; the max/min pair to be folded into a single "v_add_f64 ... clamp" operating
; on a 64-bit register pair.
650define amdgpu_kernel void @v_clamp_add_src_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
651; SI-LABEL: v_clamp_add_src_f64:
652; SI:       ; %bb.0:
653; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
654; SI-NEXT:    s_mov_b32 s7, 0xf000
655; SI-NEXT:    s_mov_b32 s6, 0
656; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
657; SI-NEXT:    v_mov_b32_e32 v1, 0
658; SI-NEXT:    s_waitcnt lgkmcnt(0)
659; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
660; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
661; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
662; SI-NEXT:    s_waitcnt vmcnt(0)
663; SI-NEXT:    v_add_f64 v[2:3], v[2:3], 1.0 clamp
664; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
665; SI-NEXT:    s_endpgm
666;
667; GFX8-LABEL: v_clamp_add_src_f64:
668; GFX8:       ; %bb.0:
669; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
670; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
671; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
672; GFX8-NEXT:    v_mov_b32_e32 v1, s3
673; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
674; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
675; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
676; GFX8-NEXT:    v_mov_b32_e32 v3, s1
677; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
678; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
679; GFX8-NEXT:    s_waitcnt vmcnt(0)
680; GFX8-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0 clamp
681; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
682; GFX8-NEXT:    s_endpgm
683;
684; GFX9-LABEL: v_clamp_add_src_f64:
685; GFX9:       ; %bb.0:
686; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
687; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
688; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
689; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
690; GFX9-NEXT:    s_waitcnt vmcnt(0)
691; GFX9-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0 clamp
692; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
693; GFX9-NEXT:    s_endpgm
694;
695; GFX11-LABEL: v_clamp_add_src_f64:
696; GFX11:       ; %bb.0:
697; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
698; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
699; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
700; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
701; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
702; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
703; GFX11-NEXT:    s_waitcnt vmcnt(0)
704; GFX11-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0 clamp
705; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
706; GFX11-NEXT:    s_endpgm
707  %tid = call i32 @llvm.amdgcn.workitem.id.x()
708  %gep0 = getelementptr double, ptr addrspace(1) %aptr, i32 %tid
709  %out.gep = getelementptr double, ptr addrspace(1) %out, i32 %tid
710  %a = load double, ptr addrspace(1) %gep0
711  %add = fadd double %a, 1.0
  ; Same clamp-to-[0, 1] pattern as the f32 tests, on double operands.
712  %max = call double @llvm.maxnum.f64(double %add, double 0.0)
713  %clamp = call double @llvm.minnum.f64(double %max, double 1.0)
714  store double %clamp, ptr addrspace(1) %out.gep
715  ret void
716}
717
; Kernel computes clamp(%a * %a + %b, 0.0, 1.0) + %b, where %b is loaded from
; %aptr[tid] and the clamp is spelled maxnum(x, 0.0) followed by minnum(x, 1.0).
; %b is used twice, once as the addend inside the clamp and once after it.
; The SI, GFX8 and GFX9 checks expect a v_mad_f32 carrying the clamp modifier;
; the GFX11 checks expect v_mul_f32 followed by v_add_f32 with clamp.
718define amdgpu_kernel void @v_clamp_mac_to_mad(ptr addrspace(1) %out, ptr addrspace(1) %aptr, float %a) #0 {
719; SI-LABEL: v_clamp_mac_to_mad:
720; SI:       ; %bb.0:
721; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
722; SI-NEXT:    s_load_dword s8, s[4:5], 0xd
723; SI-NEXT:    s_mov_b32 s7, 0xf000
724; SI-NEXT:    s_mov_b32 s6, 0
725; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
726; SI-NEXT:    s_waitcnt lgkmcnt(0)
727; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
728; SI-NEXT:    v_mov_b32_e32 v1, 0
729; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
730; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
731; SI-NEXT:    s_waitcnt vmcnt(0)
732; SI-NEXT:    v_mad_f32 v3, s8, s8, v2 clamp
733; SI-NEXT:    v_add_f32_e32 v2, v3, v2
734; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
735; SI-NEXT:    s_endpgm
736;
737; GFX8-LABEL: v_clamp_mac_to_mad:
738; GFX8:       ; %bb.0:
739; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
740; GFX8-NEXT:    s_load_dword s4, s[4:5], 0x34
741; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
742; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
743; GFX8-NEXT:    v_mov_b32_e32 v1, s3
744; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
745; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
746; GFX8-NEXT:    flat_load_dword v3, v[0:1]
747; GFX8-NEXT:    v_mov_b32_e32 v1, s1
748; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
749; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
750; GFX8-NEXT:    s_waitcnt vmcnt(0)
751; GFX8-NEXT:    v_mad_f32 v2, s4, s4, v3 clamp
752; GFX8-NEXT:    v_add_f32_e32 v2, v2, v3
753; GFX8-NEXT:    flat_store_dword v[0:1], v2
754; GFX8-NEXT:    s_endpgm
755;
756; GFX9-LABEL: v_clamp_mac_to_mad:
757; GFX9:       ; %bb.0:
758; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
759; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x34
760; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
761; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
762; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
763; GFX9-NEXT:    s_waitcnt vmcnt(0)
764; GFX9-NEXT:    v_mad_f32 v2, s6, s6, v1 clamp
765; GFX9-NEXT:    v_add_f32_e32 v1, v2, v1
766; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
767; GFX9-NEXT:    s_endpgm
768;
769; GFX11-LABEL: v_clamp_mac_to_mad:
770; GFX11:       ; %bb.0:
771; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
772; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
773; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x34
774; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
775; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
776; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
777; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
778; GFX11-NEXT:    v_mul_f32_e64 v2, s4, s4
779; GFX11-NEXT:    s_waitcnt vmcnt(0)
780; GFX11-NEXT:    v_add_f32_e64 v2, v2, v1 clamp
781; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
782; GFX11-NEXT:    v_add_f32_e32 v1, v2, v1
783; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
784; GFX11-NEXT:    s_endpgm
; Per-workitem element addresses: input %aptr[tid], output %out[tid].
785  %tid = call i32 @llvm.amdgcn.workitem.id.x()
786  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
787  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
788  %b = load float, ptr addrspace(1) %gep0
789
; Multiply-add whose result feeds the clamp.
790  %mul = fmul float %a, %a
791  %add = fadd float %mul, %b
; Clamp to [0.0, 1.0]: maxnum against 0.0, then minnum against 1.0.
792  %max = call float @llvm.maxnum.f32(float %add, float 0.0)
793  %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
; Second use of %b, outside the clamp.
794  %res = fadd float %clamp, %b
795  store float %res, ptr addrspace(1) %out.gep
796  ret void
797}
798
; Kernel loads a <2 x half> from %aptr[tid], adds 1.0 to both lanes and clamps
; the sum to [0.0, 1.0] (maxnum with zero, then minnum with 1.0).
; The checks expect the clamp to be folded into the add on every target:
;   - SI checks convert each half to f32 and use v_add_f32 with clamp per lane;
;   - GFX8 checks use v_add_f16 with clamp for the low half and an SDWA
;     v_add_f16 with clamp for the high half;
;   - GFX9 and GFX11 checks use a single v_pk_add_f16 with clamp.
; NOTE(review): attribute group #0 is defined outside this chunk; presumably it
; selects the denormal mode that the "denorm" suffix refers to - confirm.
799define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
800; SI-LABEL: v_clamp_add_src_v2f16_denorm:
801; SI:       ; %bb.0:
802; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
803; SI-NEXT:    s_mov_b32 s7, 0xf000
804; SI-NEXT:    s_mov_b32 s6, 0
805; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
806; SI-NEXT:    v_mov_b32_e32 v1, 0
807; SI-NEXT:    s_waitcnt lgkmcnt(0)
808; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
809; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
810; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
811; SI-NEXT:    s_waitcnt vmcnt(0)
812; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
813; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
814; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
815; SI-NEXT:    v_add_f32_e64 v3, v3, 1.0 clamp
816; SI-NEXT:    v_add_f32_e64 v2, v2, 1.0 clamp
817; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
818; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
819; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
820; SI-NEXT:    v_or_b32_e32 v2, v2, v3
821; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
822; SI-NEXT:    s_endpgm
823;
824; GFX8-LABEL: v_clamp_add_src_v2f16_denorm:
825; GFX8:       ; %bb.0:
826; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
827; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
828; GFX8-NEXT:    v_mov_b32_e32 v4, 0x3c00
829; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
830; GFX8-NEXT:    v_mov_b32_e32 v1, s3
831; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
832; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
833; GFX8-NEXT:    flat_load_dword v3, v[0:1]
834; GFX8-NEXT:    v_mov_b32_e32 v1, s1
835; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
836; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
837; GFX8-NEXT:    s_waitcnt vmcnt(0)
838; GFX8-NEXT:    v_add_f16_e64 v2, v3, 1.0 clamp
839; GFX8-NEXT:    v_add_f16_sdwa v3, v3, v4 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
840; GFX8-NEXT:    v_or_b32_e32 v2, v2, v3
841; GFX8-NEXT:    flat_store_dword v[0:1], v2
842; GFX8-NEXT:    s_endpgm
843;
844; GFX9-LABEL: v_clamp_add_src_v2f16_denorm:
845; GFX9:       ; %bb.0:
846; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
847; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
848; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
849; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
850; GFX9-NEXT:    s_waitcnt vmcnt(0)
851; GFX9-NEXT:    v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] clamp
852; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
853; GFX9-NEXT:    s_endpgm
854;
855; GFX11-LABEL: v_clamp_add_src_v2f16_denorm:
856; GFX11:       ; %bb.0:
857; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
858; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
859; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
860; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
861; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
862; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
863; GFX11-NEXT:    s_waitcnt vmcnt(0)
864; GFX11-NEXT:    v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] clamp
865; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
866; GFX11-NEXT:    s_endpgm
; Per-workitem element addresses: input %aptr[tid], output %out[tid].
867  %tid = call i32 @llvm.amdgcn.workitem.id.x()
868  %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid
869  %out.gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid
870  %a = load <2 x half>, ptr addrspace(1) %gep0
; The add is the clamp's only source operand.
871  %add = fadd <2 x half> %a, <half 1.0, half 1.0>
; Clamp both lanes to [0.0, 1.0].
872  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %add, <2 x half> zeroinitializer)
873  %clamp = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
874  store <2 x half> %clamp, ptr addrspace(1) %out.gep
875  ret void
876}
877
; Same IR body as @v_clamp_add_src_v2f16_denorm (load <2 x half>, add 1.0 to
; both lanes, clamp to [0.0, 1.0]) but the kernel uses attribute group #3
; instead of #0.
; The checks expect the clamp to be folded into the add on every target:
; v_add_f32 with clamp per lane on SI, v_add_f16 with clamp (plain and SDWA)
; on GFX8, and a single v_pk_add_f16 with clamp on GFX9 and GFX11.
; NOTE(review): attribute group #3 is defined outside this chunk; presumably it
; disables denormals as the function name suggests - confirm.
878define amdgpu_kernel void @v_clamp_add_src_v2f16_no_denormals(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #3 {
879; SI-LABEL: v_clamp_add_src_v2f16_no_denormals:
880; SI:       ; %bb.0:
881; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
882; SI-NEXT:    s_mov_b32 s7, 0xf000
883; SI-NEXT:    s_mov_b32 s6, 0
884; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
885; SI-NEXT:    v_mov_b32_e32 v1, 0
886; SI-NEXT:    s_waitcnt lgkmcnt(0)
887; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
888; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
889; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
890; SI-NEXT:    s_waitcnt vmcnt(0)
891; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
892; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
893; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
894; SI-NEXT:    v_add_f32_e64 v3, v3, 1.0 clamp
895; SI-NEXT:    v_add_f32_e64 v2, v2, 1.0 clamp
896; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
897; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
898; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
899; SI-NEXT:    v_or_b32_e32 v2, v2, v3
900; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
901; SI-NEXT:    s_endpgm
902;
903; GFX8-LABEL: v_clamp_add_src_v2f16_no_denormals:
904; GFX8:       ; %bb.0:
905; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
906; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
907; GFX8-NEXT:    v_mov_b32_e32 v4, 0x3c00
908; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
909; GFX8-NEXT:    v_mov_b32_e32 v1, s3
910; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
911; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
912; GFX8-NEXT:    flat_load_dword v3, v[0:1]
913; GFX8-NEXT:    v_mov_b32_e32 v1, s1
914; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
915; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
916; GFX8-NEXT:    s_waitcnt vmcnt(0)
917; GFX8-NEXT:    v_add_f16_e64 v2, v3, 1.0 clamp
918; GFX8-NEXT:    v_add_f16_sdwa v3, v3, v4 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
919; GFX8-NEXT:    v_or_b32_e32 v2, v2, v3
920; GFX8-NEXT:    flat_store_dword v[0:1], v2
921; GFX8-NEXT:    s_endpgm
922;
923; GFX9-LABEL: v_clamp_add_src_v2f16_no_denormals:
924; GFX9:       ; %bb.0:
925; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
926; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
927; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
928; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
929; GFX9-NEXT:    s_waitcnt vmcnt(0)
930; GFX9-NEXT:    v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] clamp
931; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
932; GFX9-NEXT:    s_endpgm
933;
934; GFX11-LABEL: v_clamp_add_src_v2f16_no_denormals:
935; GFX11:       ; %bb.0:
936; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
937; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
938; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
939; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
940; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
941; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
942; GFX11-NEXT:    s_waitcnt vmcnt(0)
943; GFX11-NEXT:    v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] clamp
944; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
945; GFX11-NEXT:    s_endpgm
; Per-workitem element addresses: input %aptr[tid], output %out[tid].
946  %tid = call i32 @llvm.amdgcn.workitem.id.x()
947  %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid
948  %out.gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid
949  %a = load <2 x half>, ptr addrspace(1) %gep0
; The add is the clamp's only source operand.
950  %add = fadd <2 x half> %a, <half 1.0, half 1.0>
; Clamp both lanes to [0.0, 1.0].
951  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %add, <2 x half> zeroinitializer)
952  %clamp = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
953  store <2 x half> %clamp, ptr addrspace(1) %out.gep
954  ret void
955}
956
; Kernel loads a <2 x half>, adds 1.0 to both lanes, negates the whole vector
; (fsub from -0.0) and clamps the negated value to [0.0, 1.0].
; Because the negation sits between the add and the clamp, the checks expect
; the add to be emitted WITHOUT the clamp modifier, followed by a separate
; clamped instruction that consumes the negated value:
;   - SI checks do the f32 adds, repack to f16, flip both sign bits with
;     v_xor_b32 0x80008000, then clamp via v_cvt_f32_f16 with clamp per lane;
;   - GFX8 checks use v_max_f16 with negated sources and clamp (SDWA form for
;     the high half);
;   - GFX9 and GFX11 checks use v_pk_add_f16 followed by v_pk_max_f16 with
;     neg_lo:[1,1] neg_hi:[1,1] and clamp.
957define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
958; SI-LABEL: v_clamp_add_src_v2f16_denorm_neg:
959; SI:       ; %bb.0:
960; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
961; SI-NEXT:    s_mov_b32 s7, 0xf000
962; SI-NEXT:    s_mov_b32 s6, 0
963; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
964; SI-NEXT:    v_mov_b32_e32 v1, 0
965; SI-NEXT:    s_waitcnt lgkmcnt(0)
966; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
967; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
968; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
969; SI-NEXT:    s_waitcnt vmcnt(0)
970; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
971; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
972; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
973; SI-NEXT:    v_add_f32_e32 v3, 1.0, v3
974; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
975; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
976; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
977; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
978; SI-NEXT:    v_or_b32_e32 v2, v2, v3
979; SI-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
980; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
981; SI-NEXT:    v_cvt_f32_f16_e64 v3, v3 clamp
982; SI-NEXT:    v_cvt_f32_f16_e64 v2, v2 clamp
983; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
984; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
985; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
986; SI-NEXT:    v_or_b32_e32 v2, v2, v3
987; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
988; SI-NEXT:    s_endpgm
989;
990; GFX8-LABEL: v_clamp_add_src_v2f16_denorm_neg:
991; GFX8:       ; %bb.0:
992; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
993; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
994; GFX8-NEXT:    v_mov_b32_e32 v4, 0x3c00
995; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
996; GFX8-NEXT:    v_mov_b32_e32 v1, s3
997; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
998; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
999; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1000; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1001; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1002; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1003; GFX8-NEXT:    s_waitcnt vmcnt(0)
1004; GFX8-NEXT:    v_add_f16_e32 v2, 1.0, v3
1005; GFX8-NEXT:    v_add_f16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1006; GFX8-NEXT:    v_max_f16_sdwa v3, -v3, -v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1007; GFX8-NEXT:    v_max_f16_e64 v2, -v2, -v2 clamp
1008; GFX8-NEXT:    v_or_b32_e32 v2, v2, v3
1009; GFX8-NEXT:    flat_store_dword v[0:1], v2
1010; GFX8-NEXT:    s_endpgm
1011;
1012; GFX9-LABEL: v_clamp_add_src_v2f16_denorm_neg:
1013; GFX9:       ; %bb.0:
1014; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1015; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1016; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1017; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
1018; GFX9-NEXT:    s_waitcnt vmcnt(0)
1019; GFX9-NEXT:    v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
1020; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
1021; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1022; GFX9-NEXT:    s_endpgm
1023;
1024; GFX11-LABEL: v_clamp_add_src_v2f16_denorm_neg:
1025; GFX11:       ; %bb.0:
1026; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1027; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1028; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
1029; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1030; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1031; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
1032; GFX11-NEXT:    s_waitcnt vmcnt(0)
1033; GFX11-NEXT:    v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
1034; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
1035; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1036; GFX11-NEXT:    s_endpgm
; Per-workitem element addresses: input %aptr[tid], output %out[tid].
1037  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1038  %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid
1039  %out.gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid
1040  %a = load <2 x half>, ptr addrspace(1) %gep0
1041  %add = fadd <2 x half> %a, <half 1.0, half 1.0>
; Negate both lanes (written as -0.0 - x) before clamping.
1042  %neg.add = fsub <2 x half> <half -0.0, half -0.0>, %add
; Clamp the negated value to [0.0, 1.0].
1043  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %neg.add, <2 x half> zeroinitializer)
1044  %clamp = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
1045  store <2 x half> %clamp, ptr addrspace(1) %out.gep
1046  ret void
1047}
1048
; Kernel loads a <2 x half>, adds 1.0 to both lanes, negates ONLY element 0
; of the sum (extract, fsub from -0.0, insert) and clamps both lanes to
; [0.0, 1.0].
; Expected codegen per the checks:
;   - SI checks fold the clamp into the f32 add for the high lane, while the
;     low lane gets a plain add followed by v_max_f32 with negated sources
;     and clamp;
;   - GFX8 checks fold the clamp into the SDWA add for the high half, while
;     the low half gets a plain v_add_f16 followed by v_max_f16 with negated
;     sources and clamp;
;   - GFX9 and GFX11 checks use an unclamped v_pk_add_f16 followed by
;     v_pk_max_f16 with neg_lo:[1,1] and clamp.
1049define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_lo(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
1050; SI-LABEL: v_clamp_add_src_v2f16_denorm_neg_lo:
1051; SI:       ; %bb.0:
1052; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1053; SI-NEXT:    s_mov_b32 s7, 0xf000
1054; SI-NEXT:    s_mov_b32 s6, 0
1055; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1056; SI-NEXT:    v_mov_b32_e32 v1, 0
1057; SI-NEXT:    s_waitcnt lgkmcnt(0)
1058; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
1059; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1060; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
1061; SI-NEXT:    s_waitcnt vmcnt(0)
1062; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
1063; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
1064; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
1065; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
1066; SI-NEXT:    v_add_f32_e64 v3, v3, 1.0 clamp
1067; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
1068; SI-NEXT:    v_max_f32_e64 v2, -v2, -v2 clamp
1069; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
1070; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
1071; SI-NEXT:    v_or_b32_e32 v2, v2, v3
1072; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1073; SI-NEXT:    s_endpgm
1074;
1075; GFX8-LABEL: v_clamp_add_src_v2f16_denorm_neg_lo:
1076; GFX8:       ; %bb.0:
1077; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1078; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1079; GFX8-NEXT:    v_mov_b32_e32 v4, 0x3c00
1080; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1081; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1082; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1083; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1084; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1085; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1086; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1087; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1088; GFX8-NEXT:    s_waitcnt vmcnt(0)
1089; GFX8-NEXT:    v_add_f16_e32 v2, 1.0, v3
1090; GFX8-NEXT:    v_add_f16_sdwa v3, v3, v4 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1091; GFX8-NEXT:    v_max_f16_e64 v2, -v2, -v2 clamp
1092; GFX8-NEXT:    v_or_b32_e32 v2, v2, v3
1093; GFX8-NEXT:    flat_store_dword v[0:1], v2
1094; GFX8-NEXT:    s_endpgm
1095;
1096; GFX9-LABEL: v_clamp_add_src_v2f16_denorm_neg_lo:
1097; GFX9:       ; %bb.0:
1098; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1099; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1100; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1101; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
1102; GFX9-NEXT:    s_waitcnt vmcnt(0)
1103; GFX9-NEXT:    v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
1104; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] clamp
1105; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1106; GFX9-NEXT:    s_endpgm
1107;
1108; GFX11-LABEL: v_clamp_add_src_v2f16_denorm_neg_lo:
1109; GFX11:       ; %bb.0:
1110; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1111; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1112; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
1113; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1114; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1115; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
1116; GFX11-NEXT:    s_waitcnt vmcnt(0)
1117; GFX11-NEXT:    v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
1118; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] clamp
1119; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1120; GFX11-NEXT:    s_endpgm
; Per-workitem element addresses: input %aptr[tid], output %out[tid].
1121  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1122  %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid
1123  %out.gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid
1124  %a = load <2 x half>, ptr addrspace(1) %gep0
1125  %add = fadd <2 x half> %a, <half 1.0, half 1.0>
; Negate element 0 only; element 1 of %add is left untouched.
1126  %lo = extractelement <2 x half> %add, i32 0
1127  %neg.lo = fsub half -0.0, %lo
1128  %neg.lo.add = insertelement <2 x half> %add, half %neg.lo, i32 0
; Clamp both lanes to [0.0, 1.0].
1129  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %neg.lo.add, <2 x half> zeroinitializer)
1130  %clamp = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
1131  store <2 x half> %clamp, ptr addrspace(1) %out.gep
1132  ret void
1133}
1134
; Kernel loads a <2 x half>, adds 1.0 to both lanes, negates ONLY element 1
; of the sum (extract, fsub from -0.0, insert) and clamps both lanes to
; [0.0, 1.0].
; Expected codegen per the checks:
;   - SI checks fold the clamp into the f32 add for the low lane, while the
;     high lane gets a plain add followed by v_max_f32 with negated sources
;     and clamp;
;   - GFX8 checks fold the clamp into v_add_f16 for the low half, while the
;     high half gets an unclamped SDWA add followed by an SDWA v_max_f16
;     with negated sources and clamp;
;   - GFX9 and GFX11 checks use an unclamped v_pk_add_f16 followed by
;     v_pk_max_f16 with neg_hi:[1,1] and clamp.
1135define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_hi(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
1136; SI-LABEL: v_clamp_add_src_v2f16_denorm_neg_hi:
1137; SI:       ; %bb.0:
1138; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1139; SI-NEXT:    s_mov_b32 s7, 0xf000
1140; SI-NEXT:    s_mov_b32 s6, 0
1141; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1142; SI-NEXT:    v_mov_b32_e32 v1, 0
1143; SI-NEXT:    s_waitcnt lgkmcnt(0)
1144; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
1145; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1146; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
1147; SI-NEXT:    s_waitcnt vmcnt(0)
1148; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
1149; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
1150; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
1151; SI-NEXT:    v_add_f32_e32 v3, 1.0, v3
1152; SI-NEXT:    v_max_f32_e64 v3, -v3, -v3 clamp
1153; SI-NEXT:    v_add_f32_e64 v2, v2, 1.0 clamp
1154; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
1155; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
1156; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
1157; SI-NEXT:    v_or_b32_e32 v2, v2, v3
1158; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1159; SI-NEXT:    s_endpgm
1160;
1161; GFX8-LABEL: v_clamp_add_src_v2f16_denorm_neg_hi:
1162; GFX8:       ; %bb.0:
1163; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1164; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1165; GFX8-NEXT:    v_mov_b32_e32 v4, 0x3c00
1166; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1167; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1168; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1169; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1170; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1171; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1172; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1173; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1174; GFX8-NEXT:    s_waitcnt vmcnt(0)
1175; GFX8-NEXT:    v_add_f16_e64 v2, v3, 1.0 clamp
1176; GFX8-NEXT:    v_add_f16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1177; GFX8-NEXT:    v_max_f16_sdwa v3, -v3, -v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1178; GFX8-NEXT:    v_or_b32_e32 v2, v2, v3
1179; GFX8-NEXT:    flat_store_dword v[0:1], v2
1180; GFX8-NEXT:    s_endpgm
1181;
1182; GFX9-LABEL: v_clamp_add_src_v2f16_denorm_neg_hi:
1183; GFX9:       ; %bb.0:
1184; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1185; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1186; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1187; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
1188; GFX9-NEXT:    s_waitcnt vmcnt(0)
1189; GFX9-NEXT:    v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
1190; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1 neg_hi:[1,1] clamp
1191; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1192; GFX9-NEXT:    s_endpgm
1193;
1194; GFX11-LABEL: v_clamp_add_src_v2f16_denorm_neg_hi:
1195; GFX11:       ; %bb.0:
1196; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1197; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1198; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
1199; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1200; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1201; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
1202; GFX11-NEXT:    s_waitcnt vmcnt(0)
1203; GFX11-NEXT:    v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
1204; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1 neg_hi:[1,1] clamp
1205; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1206; GFX11-NEXT:    s_endpgm
; Per-workitem element addresses: input %aptr[tid], output %out[tid].
1207  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1208  %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid
1209  %out.gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid
1210  %a = load <2 x half>, ptr addrspace(1) %gep0
1211  %add = fadd <2 x half> %a, <half 1.0, half 1.0>
; Negate element 1 only; element 0 of %add is left untouched.
1212  %hi = extractelement <2 x half> %add, i32 1
1213  %neg.hi = fsub half -0.0, %hi
1214  %neg.hi.add = insertelement <2 x half> %add, half %neg.hi, i32 1
; Clamp both lanes to [0.0, 1.0].
1215  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %neg.hi.add, <2 x half> zeroinitializer)
1216  %clamp = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
1217  store <2 x half> %clamp, ptr addrspace(1) %out.gep
1218  ret void
1219}
1220
; Kernel loads a <2 x half>, adds 1.0 to both lanes, swaps the two lanes with
; a shufflevector (mask <1, 0>) and clamps the swapped vector to [0.0, 1.0].
; Expected codegen per the checks:
;   - SI checks fold the clamp into the per-lane f32 adds and pack the lanes
;     in swapped order;
;   - GFX8 checks fold the clamp into two SDWA v_add_f16 instructions whose
;     source/destination selects perform the swap;
;   - GFX9 and GFX11 checks use an unclamped v_pk_add_f16 followed by
;     v_pk_max_f16 with op_sel:[1,1] op_sel_hi:[0,0] and clamp.
; The second shufflevector operand is never selected by the mask, so it is
; written as poison rather than the deprecated undef placeholder.
1221define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_shuf(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
1222; SI-LABEL: v_clamp_add_src_v2f16_denorm_shuf:
1223; SI:       ; %bb.0:
1224; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1225; SI-NEXT:    s_mov_b32 s7, 0xf000
1226; SI-NEXT:    s_mov_b32 s6, 0
1227; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1228; SI-NEXT:    v_mov_b32_e32 v1, 0
1229; SI-NEXT:    s_waitcnt lgkmcnt(0)
1230; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
1231; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1232; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
1233; SI-NEXT:    s_waitcnt vmcnt(0)
1234; SI-NEXT:    v_cvt_f32_f16_e32 v3, v2
1235; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
1236; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
1237; SI-NEXT:    v_add_f32_e64 v3, v3, 1.0 clamp
1238; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
1239; SI-NEXT:    v_add_f32_e64 v2, v2, 1.0 clamp
1240; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
1241; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
1242; SI-NEXT:    v_or_b32_e32 v2, v2, v3
1243; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1244; SI-NEXT:    s_endpgm
1245;
1246; GFX8-LABEL: v_clamp_add_src_v2f16_denorm_shuf:
1247; GFX8:       ; %bb.0:
1248; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1249; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1250; GFX8-NEXT:    v_mov_b32_e32 v4, 0x3c00
1251; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1252; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1253; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1254; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1255; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1256; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1257; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1258; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1259; GFX8-NEXT:    s_waitcnt vmcnt(0)
1260; GFX8-NEXT:    v_add_f16_sdwa v2, v3, v4 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1261; GFX8-NEXT:    v_add_f16_sdwa v3, v3, v4 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1262; GFX8-NEXT:    v_or_b32_e32 v2, v2, v3
1263; GFX8-NEXT:    flat_store_dword v[0:1], v2
1264; GFX8-NEXT:    s_endpgm
1265;
1266; GFX9-LABEL: v_clamp_add_src_v2f16_denorm_shuf:
1267; GFX9:       ; %bb.0:
1268; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1269; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1270; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1271; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
1272; GFX9-NEXT:    s_waitcnt vmcnt(0)
1273; GFX9-NEXT:    v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
1274; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1 op_sel:[1,1] op_sel_hi:[0,0] clamp
1275; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1276; GFX9-NEXT:    s_endpgm
1277;
1278; GFX11-LABEL: v_clamp_add_src_v2f16_denorm_shuf:
1279; GFX11:       ; %bb.0:
1280; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1281; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1282; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
1283; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1284; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1285; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
1286; GFX11-NEXT:    s_waitcnt vmcnt(0)
1287; GFX11-NEXT:    v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
1288; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1 op_sel:[1,1] op_sel_hi:[0,0] clamp
1289; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1290; GFX11-NEXT:    s_endpgm
; Per-workitem element addresses: input %aptr[tid], output %out[tid].
1291  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1292  %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid
1293  %out.gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid
1294  %a = load <2 x half>, ptr addrspace(1) %gep0
1295  %add = fadd <2 x half> %a, <half 1.0, half 1.0>
; Swap the lanes; mask indices 1 and 0 both refer to the first operand.
1296  %shuf = shufflevector <2 x half> %add, <2 x half> poison, <2 x i32> <i32 1, i32 0>
1297
; Clamp both (swapped) lanes to [0.0, 1.0].
1298  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %shuf, <2 x half> zeroinitializer)
1299  %clamp = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
1300  store <2 x half> %clamp, ptr addrspace(1) %out.gep
1301  ret void
1302}
1303
; Negative test: the value being clamped as <2 x half> is produced by an f32
; add. The loaded <2 x half> is bitcast to float, 1.0 is added as a float,
; and the result is bitcast back to <2 x half> before the clamp.
; The checks expect the f32 add to be emitted WITHOUT the clamp modifier and
; the clamp to be done by a separate f16 instruction:
;   - SI checks use v_cvt_f32_f16 with clamp on each half;
;   - GFX8 checks use v_max_f16 with clamp (SDWA form for the high half);
;   - GFX9 and GFX11 checks use v_pk_max_f16 with clamp.
1304define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f32_src(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
1305; SI-LABEL: v_no_clamp_add_src_v2f16_f32_src:
1306; SI:       ; %bb.0:
1307; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1308; SI-NEXT:    s_mov_b32 s7, 0xf000
1309; SI-NEXT:    s_mov_b32 s6, 0
1310; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1311; SI-NEXT:    v_mov_b32_e32 v1, 0
1312; SI-NEXT:    s_waitcnt lgkmcnt(0)
1313; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
1314; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1315; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
1316; SI-NEXT:    s_waitcnt vmcnt(0)
1317; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
1318; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
1319; SI-NEXT:    v_cvt_f32_f16_e64 v3, v3 clamp
1320; SI-NEXT:    v_cvt_f32_f16_e64 v2, v2 clamp
1321; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
1322; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
1323; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
1324; SI-NEXT:    v_or_b32_e32 v2, v2, v3
1325; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1326; SI-NEXT:    s_endpgm
1327;
1328; GFX8-LABEL: v_no_clamp_add_src_v2f16_f32_src:
1329; GFX8:       ; %bb.0:
1330; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1331; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1332; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1333; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1334; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1335; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1336; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1337; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1338; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1339; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1340; GFX8-NEXT:    s_waitcnt vmcnt(0)
1341; GFX8-NEXT:    v_add_f32_e32 v2, 1.0, v3
1342; GFX8-NEXT:    v_max_f16_sdwa v3, v2, v2 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1343; GFX8-NEXT:    v_max_f16_e64 v2, v2, v2 clamp
1344; GFX8-NEXT:    v_or_b32_e32 v2, v2, v3
1345; GFX8-NEXT:    flat_store_dword v[0:1], v2
1346; GFX8-NEXT:    s_endpgm
1347;
1348; GFX9-LABEL: v_no_clamp_add_src_v2f16_f32_src:
1349; GFX9:       ; %bb.0:
1350; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1351; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1352; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1353; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
1354; GFX9-NEXT:    s_waitcnt vmcnt(0)
1355; GFX9-NEXT:    v_add_f32_e32 v1, 1.0, v1
1356; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1 clamp
1357; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1358; GFX9-NEXT:    s_endpgm
1359;
1360; GFX11-LABEL: v_no_clamp_add_src_v2f16_f32_src:
1361; GFX11:       ; %bb.0:
1362; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1363; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1364; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
1365; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1366; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1367; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
1368; GFX11-NEXT:    s_waitcnt vmcnt(0)
1369; GFX11-NEXT:    v_add_f32_e32 v1, 1.0, v1
1370; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1 clamp
1371; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1372; GFX11-NEXT:    s_endpgm
; Per-workitem element addresses: input %aptr[tid], output %out[tid].
1373  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1374  %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid
1375  %out.gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid
1376  %a = load <2 x half>, ptr addrspace(1) %gep0
; Reinterpret the packed halves as one float, add in f32, reinterpret back.
1377  %bc = bitcast <2 x half> %a to float
1378  %f32.op = fadd float %bc, 1.0
1379  %f32.op.cast = bitcast float %f32.op to <2 x half>
; Clamp the reinterpreted value as two half lanes to [0.0, 1.0].
1380  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %f32.op.cast, <2 x half> zeroinitializer)
1381  %clamp = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
1382  store <2 x half> %clamp, ptr addrspace(1) %out.gep
1383  ret void
1384}
1385
; Packed f16 add whose 32-bit result is reinterpreted as a single float and
; then clamped to [0.0, 1.0] with the f32 maxnum/minnum pair. In the check
; lines for all four targets the clamp stays on a separate v_max_f32_e64 that
; follows the add; it is not attached to the f16 add instructions themselves.
1386define amdgpu_kernel void @v_no_clamp_add_packed_src_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
1387; SI-LABEL: v_no_clamp_add_packed_src_f32:
1388; SI:       ; %bb.0:
1389; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1390; SI-NEXT:    s_mov_b32 s7, 0xf000
1391; SI-NEXT:    s_mov_b32 s6, 0
1392; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1393; SI-NEXT:    v_mov_b32_e32 v1, 0
1394; SI-NEXT:    s_waitcnt lgkmcnt(0)
1395; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
1396; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1397; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
1398; SI-NEXT:    s_waitcnt vmcnt(0)
1399; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
1400; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
1401; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
1402; SI-NEXT:    v_add_f32_e32 v3, 1.0, v3
1403; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
1404; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
1405; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
1406; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
1407; SI-NEXT:    v_or_b32_e32 v2, v2, v3
1408; SI-NEXT:    v_max_f32_e64 v2, v2, v2 clamp
1409; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1410; SI-NEXT:    s_endpgm
1411;
1412; GFX8-LABEL: v_no_clamp_add_packed_src_f32:
1413; GFX8:       ; %bb.0:
1414; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1415; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1416; GFX8-NEXT:    v_mov_b32_e32 v4, 0x3c00
1417; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1418; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1419; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1420; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1421; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1422; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1423; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1424; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1425; GFX8-NEXT:    s_waitcnt vmcnt(0)
1426; GFX8-NEXT:    v_add_f16_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1427; GFX8-NEXT:    v_add_f16_e32 v3, 1.0, v3
1428; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
1429; GFX8-NEXT:    v_max_f32_e64 v2, v2, v2 clamp
1430; GFX8-NEXT:    flat_store_dword v[0:1], v2
1431; GFX8-NEXT:    s_endpgm
1432;
1433; GFX9-LABEL: v_no_clamp_add_packed_src_f32:
1434; GFX9:       ; %bb.0:
1435; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1436; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1437; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1438; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
1439; GFX9-NEXT:    s_waitcnt vmcnt(0)
1440; GFX9-NEXT:    v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
1441; GFX9-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
1442; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1443; GFX9-NEXT:    s_endpgm
1444;
1445; GFX11-LABEL: v_no_clamp_add_packed_src_f32:
1446; GFX11:       ; %bb.0:
1447; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1448; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1449; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
1450; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1451; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1452; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
1453; GFX11-NEXT:    s_waitcnt vmcnt(0)
1454; GFX11-NEXT:    v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
1455; GFX11-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
1456; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1457; GFX11-NEXT:    s_endpgm
1458  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1459  %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid
1460  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
1461  %a = load <2 x half>, ptr addrspace(1) %gep0
1462  %add = fadd <2 x half> %a, <half 1.0, half 1.0>
; Reinterpret both f16 sums as the bits of one f32 value.
1463  %bc.add = bitcast <2 x half> %add to float
; f32 clamp pattern: max with 0.0, then min with 1.0.
1464  %max = call float @llvm.maxnum.f32(float %bc.add, float 0.0)
1465  %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
1466  store float %clamp, ptr addrspace(1) %out.gep
1467  ret void
1468}
1469
1470; Since the high bits are zeroed, it probably would be OK in this case
1471; to use clamp.
; Scalar f16 add whose 16-bit result is zero-extended to 32 bits and viewed as
; <2 x half>, followed by the v2f16 maxnum/minnum clamp pattern. In the gfx900
; and gfx1100 check lines the clamp is carried by a separate v_pk_max_f16 after
; the v_add_f16; in the fiji check lines it appears directly on v_add_f16_e64,
; and in the tahiti check lines on a v_cvt_f32_f16_e64.
1472define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f16_src(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
1473; SI-LABEL: v_no_clamp_add_src_v2f16_f16_src:
1474; SI:       ; %bb.0:
1475; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1476; SI-NEXT:    s_mov_b32 s7, 0xf000
1477; SI-NEXT:    s_mov_b32 s6, 0
1478; SI-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
1479; SI-NEXT:    v_mov_b32_e32 v2, 0
1480; SI-NEXT:    s_waitcnt lgkmcnt(0)
1481; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
1482; SI-NEXT:    buffer_load_ushort v1, v[1:2], s[4:7], 0 addr64
1483; SI-NEXT:    v_cvt_f16_f32_e32 v3, 0
1484; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
1485; SI-NEXT:    s_waitcnt vmcnt(0)
1486; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
1487; SI-NEXT:    v_add_f32_e32 v1, 1.0, v1
1488; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
1489; SI-NEXT:    v_cvt_f32_f16_e64 v1, v1 clamp
1490; SI-NEXT:    v_cvt_f16_f32_e32 v4, v1
1491; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
1492; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
1493; SI-NEXT:    v_or_b32_e32 v0, v4, v0
1494; SI-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
1495; SI-NEXT:    s_endpgm
1496;
1497; GFX8-LABEL: v_no_clamp_add_src_v2f16_f16_src:
1498; GFX8:       ; %bb.0:
1499; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1500; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
1501; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1502; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1503; GFX8-NEXT:    v_mov_b32_e32 v2, s3
1504; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s2, v1
1505; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
1506; GFX8-NEXT:    flat_load_ushort v2, v[1:2]
1507; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1508; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
1509; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1510; GFX8-NEXT:    s_waitcnt vmcnt(0)
1511; GFX8-NEXT:    v_add_f16_e64 v2, v2, 1.0 clamp
1512; GFX8-NEXT:    flat_store_dword v[0:1], v2
1513; GFX8-NEXT:    s_endpgm
1514;
1515; GFX9-LABEL: v_no_clamp_add_src_v2f16_f16_src:
1516; GFX9:       ; %bb.0:
1517; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1518; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
1519; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1520; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1521; GFX9-NEXT:    global_load_ushort v1, v1, s[2:3]
1522; GFX9-NEXT:    s_waitcnt vmcnt(0)
1523; GFX9-NEXT:    v_add_f16_e32 v1, 1.0, v1
1524; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1 clamp
1525; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1526; GFX9-NEXT:    s_endpgm
1527;
1528; GFX11-LABEL: v_no_clamp_add_src_v2f16_f16_src:
1529; GFX11:       ; %bb.0:
1530; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1531; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1532; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1533; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
1534; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1535; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1536; GFX11-NEXT:    global_load_u16 v1, v1, s[2:3]
1537; GFX11-NEXT:    s_waitcnt vmcnt(0)
1538; GFX11-NEXT:    v_add_f16_e32 v1, 1.0, v1
1539; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1540; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
1541; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1 clamp
1542; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1543; GFX11-NEXT:    s_endpgm
1544  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1545  %gep0 = getelementptr half, ptr addrspace(1) %aptr, i32 %tid
1546  %out.gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid
1547  %a = load half, ptr addrspace(1) %gep0
1548  %add = fadd half %a, 1.0
; Widen the scalar result to 32 bits with zero upper bits, then view those
; 32 bits as a <2 x half> vector.
1549  %bc = bitcast half %add to i16
1550  %zext = zext i16 %bc to i32
1551  %v2f16 = bitcast i32 %zext to <2 x half>
; v2f16 clamp pattern: max with 0.0, then min with 1.0, per element.
1552  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %v2f16, <2 x half> zeroinitializer)
1553  %clamp = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
1554  store <2 x half> %clamp, ptr addrspace(1) %out.gep
1555  ret void
1556}
1557
1558; FIXME: Worse code pre-gfx9
1559
; v2f16 clamp (maxnum with 0.0, then minnum with 1.0) applied to the result of
; llvm.amdgcn.cvt.pkrtz. The gfx900 and gfx1100 check lines show the clamp
; modifier on the conversion instruction itself; the tahiti and fiji check
; lines clamp each half with separate instructions after the conversion.
; NOTE(review): attribute group #0 only sets the f32 denormal mode; presumably
; the _denorm suffix refers to f16 denormals being left enabled - confirm.
1560define <2 x half> @v_clamp_cvt_pkrtz_src_v2f16_denorm(float %a, float %b) #0 {
1561; SI-LABEL: v_clamp_cvt_pkrtz_src_v2f16_denorm:
1562; SI:       ; %bb.0:
1563; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1564; SI-NEXT:    v_cvt_pkrtz_f16_f32_e32 v0, v0, v1
1565; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
1566; SI-NEXT:    v_cvt_f32_f16_e64 v0, v0 clamp
1567; SI-NEXT:    v_cvt_f32_f16_e64 v1, v1 clamp
1568; SI-NEXT:    s_setpc_b64 s[30:31]
1569;
1570; GFX8-LABEL: v_clamp_cvt_pkrtz_src_v2f16_denorm:
1571; GFX8:       ; %bb.0:
1572; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1573; GFX8-NEXT:    v_cvt_pkrtz_f16_f32 v0, v0, v1
1574; GFX8-NEXT:    v_max_f16_sdwa v1, v0, v0 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1575; GFX8-NEXT:    v_max_f16_e64 v0, v0, v0 clamp
1576; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
1577; GFX8-NEXT:    s_setpc_b64 s[30:31]
1578;
1579; GFX9-LABEL: v_clamp_cvt_pkrtz_src_v2f16_denorm:
1580; GFX9:       ; %bb.0:
1581; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1582; GFX9-NEXT:    v_cvt_pkrtz_f16_f32 v0, v0, v1 clamp
1583; GFX9-NEXT:    s_setpc_b64 s[30:31]
1584;
1585; GFX11-LABEL: v_clamp_cvt_pkrtz_src_v2f16_denorm:
1586; GFX11:       ; %bb.0:
1587; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1588; GFX11-NEXT:    v_cvt_pk_rtz_f16_f32_e64 v0, v0, v1 clamp
1589; GFX11-NEXT:    s_setpc_b64 s[30:31]
1590  %add = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float %b)
1591  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %add, <2 x half> zeroinitializer)
1592  %clamp = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
1593  ret <2 x half> %clamp
1594}
1595
; Intrinsic declarations. All of them carry attribute group #1
; (nounwind readnone). They are grouped below by operand type: the work-item
; id query, f32, f64, f16, then the vector forms and the packed f32->f16
; conversion.
1596declare i32 @llvm.amdgcn.workitem.id.x() #1
1597declare float @llvm.fabs.f32(float) #1
1598declare float @llvm.floor.f32(float) #1
1599declare float @llvm.minnum.f32(float, float) #1
1600declare float @llvm.maxnum.f32(float, float) #1
1601declare float @llvm.amdgcn.fmed3.f32(float, float, float) #1
1602declare double @llvm.fabs.f64(double) #1
1603declare double @llvm.minnum.f64(double, double) #1
1604declare double @llvm.maxnum.f64(double, double) #1
1605declare half @llvm.fabs.f16(half) #1
1606declare half @llvm.minnum.f16(half, half) #1
1607declare half @llvm.maxnum.f16(half, half) #1
1608declare <2 x half> @llvm.minnum.v2f16(<2 x half>, <2 x half>) #1
1609declare <2 x half> @llvm.maxnum.v2f16(<2 x half>, <2 x half>) #1
1610declare <2 x float> @llvm.minnum.v2f32(<2 x float>, <2 x float>) #1
1611declare <2 x float> @llvm.maxnum.v2f32(<2 x float>, <2 x float>) #1
1612declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1
1613
1614
; Debug-value intrinsic, declared here with four operands
; (metadata, i64, metadata, metadata).
; NOTE(review): this looks like the older signature that still has an i64
; offset operand - confirm it is intentional.
1615declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
1616
; Attribute groups referenced by number from the functions and declarations
; in this file. The denormal-fp-math strings are "<output>,<input>" mode pairs;
; the -f32 variant applies to f32 only.
; #0: nounwind, f32 denormal mode preserve-sign for both output and input.
1617attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
; #1: nounwind readnone, used on the intrinsic declarations.
1618attributes #1 = { nounwind readnone }
; #2: nounwind, f32 denormal mode ieee for both output and input.
1619attributes #2 = { nounwind "denormal-fp-math-f32"="ieee,ieee" }
; #3: nounwind, f32 denormal mode ieee, with the general denormal-fp-math mode
; set to preserve-sign.
1620attributes #3 = { nounwind "denormal-fp-math-f32"="ieee,ieee" "denormal-fp-math"="preserve-sign,preserve-sign" }
1621
; Debug-info metadata: one compile unit (emissionKind NoDebug), the module
; flags for DWARF and debug-info versions, a subprogram named "foo" taking one
; float, a local variable "add" bound to its first argument, an empty
; expression, and a source location.
; NOTE(review): !4, !9 and !10 are presumably referenced by an llvm.dbg.value
; call outside this part of the file - confirm before removing any of them.
1622!llvm.dbg.cu = !{!0}
1623!llvm.module.flags = !{!2, !3}
1624
1625!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug)
1626!1 = !DIFile(filename: "/tmp/foo.cl", directory: "/dev/null")
1627!2 = !{i32 2, !"Dwarf Version", i32 4}
1628!3 = !{i32 2, !"Debug Info Version", i32 3}
1629!4 = !DILocalVariable(name: "add", arg: 1, scope: !5, file: !1, line: 1)
1630!5 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, unit: !0)
1631!6 = !DISubroutineType(types: !7)
1632!7 = !{null, !8}
1633!8 = !DIBasicType(name: "float", size: 32, align: 32)
1634!9 = !DIExpression()
1635!10 = !DILocation(line: 1, column: 42, scope: !5)
1636