xref: /llvm-project/llvm/test/CodeGen/AMDGPU/clamp.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6 %s
3; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
4; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
5; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
6; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
7
; Baseline f32 clamp test: each workitem loads one float from %aptr, bounds it
; with maxnum(a, 0.0) followed by minnum(.., 1.0), and stores it to %out.
; The assertions for every RUN line expect the max/min pair to be emitted as a
; single v_max_f32 (v_max_num_f32 for gfx1200) of the value with itself,
; carrying the "clamp" output modifier.
8define amdgpu_kernel void @v_clamp_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
9; GFX6-LABEL: v_clamp_f32:
10; GFX6:       ; %bb.0:
11; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
12; GFX6-NEXT:    s_mov_b32 s7, 0xf000
13; GFX6-NEXT:    s_mov_b32 s6, 0
14; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
15; GFX6-NEXT:    v_mov_b32_e32 v1, 0
16; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
17; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
18; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
19; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
20; GFX6-NEXT:    s_waitcnt vmcnt(0)
21; GFX6-NEXT:    v_max_f32_e64 v2, v2, v2 clamp
22; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
23; GFX6-NEXT:    s_endpgm
24;
25; GFX8-LABEL: v_clamp_f32:
26; GFX8:       ; %bb.0:
27; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
28; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
29; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
30; GFX8-NEXT:    v_mov_b32_e32 v1, s3
31; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
32; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
33; GFX8-NEXT:    flat_load_dword v3, v[0:1]
34; GFX8-NEXT:    v_mov_b32_e32 v1, s1
35; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
36; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
37; GFX8-NEXT:    s_waitcnt vmcnt(0)
38; GFX8-NEXT:    v_max_f32_e64 v2, v3, v3 clamp
39; GFX8-NEXT:    flat_store_dword v[0:1], v2
40; GFX8-NEXT:    s_endpgm
41;
42; GFX9-LABEL: v_clamp_f32:
43; GFX9:       ; %bb.0:
44; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
45; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
46; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
47; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
48; GFX9-NEXT:    s_waitcnt vmcnt(0)
49; GFX9-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
50; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
51; GFX9-NEXT:    s_endpgm
52;
53; GFX11-LABEL: v_clamp_f32:
54; GFX11:       ; %bb.0:
55; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
56; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
57; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
58; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
59; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
60; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
61; GFX11-NEXT:    s_waitcnt vmcnt(0)
62; GFX11-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
63; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
64; GFX11-NEXT:    s_endpgm
65;
66; GFX12-LABEL: v_clamp_f32:
67; GFX12:       ; %bb.0:
68; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
69; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
70; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
71; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
72; GFX12-NEXT:    s_wait_kmcnt 0x0
73; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
74; GFX12-NEXT:    s_wait_loadcnt 0x0
75; GFX12-NEXT:    v_max_num_f32_e64 v1, v1, v1 clamp
76; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
77; GFX12-NEXT:    s_endpgm
; The workitem id indexes both the input and the output float arrays.
78  %tid = call i32 @llvm.amdgcn.workitem.id.x()
79  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
80  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
81  %a = load float, ptr addrspace(1) %gep0
; Lower bound 0.0 first, then upper bound 1.0: the pattern under test.
82  %max = call float @llvm.maxnum.f32(float %a, float 0.0)
83  %med = call float @llvm.minnum.f32(float %max, float 1.0)
84
85  store float %med, ptr addrspace(1) %out.gep
86  ret void
87}
88
; f32 clamp of a negated input: the loaded value goes through fneg before the
; maxnum(.., 0.0) / minnum(.., 1.0) pair.
; The assertions expect the fneg to be absorbed as a source negation modifier
; on both operands of the clamping v_max_f32 (v_max_num_f32 for gfx1200),
; i.e. "-v, -v clamp", with no separate negate instruction.
89define amdgpu_kernel void @v_clamp_neg_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
90; GFX6-LABEL: v_clamp_neg_f32:
91; GFX6:       ; %bb.0:
92; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
93; GFX6-NEXT:    s_mov_b32 s7, 0xf000
94; GFX6-NEXT:    s_mov_b32 s6, 0
95; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
96; GFX6-NEXT:    v_mov_b32_e32 v1, 0
97; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
98; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
99; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
100; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
101; GFX6-NEXT:    s_waitcnt vmcnt(0)
102; GFX6-NEXT:    v_max_f32_e64 v2, -v2, -v2 clamp
103; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
104; GFX6-NEXT:    s_endpgm
105;
106; GFX8-LABEL: v_clamp_neg_f32:
107; GFX8:       ; %bb.0:
108; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
109; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
110; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
111; GFX8-NEXT:    v_mov_b32_e32 v1, s3
112; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
113; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
114; GFX8-NEXT:    flat_load_dword v3, v[0:1]
115; GFX8-NEXT:    v_mov_b32_e32 v1, s1
116; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
117; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
118; GFX8-NEXT:    s_waitcnt vmcnt(0)
119; GFX8-NEXT:    v_max_f32_e64 v2, -v3, -v3 clamp
120; GFX8-NEXT:    flat_store_dword v[0:1], v2
121; GFX8-NEXT:    s_endpgm
122;
123; GFX9-LABEL: v_clamp_neg_f32:
124; GFX9:       ; %bb.0:
125; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
126; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
127; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
128; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
129; GFX9-NEXT:    s_waitcnt vmcnt(0)
130; GFX9-NEXT:    v_max_f32_e64 v1, -v1, -v1 clamp
131; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
132; GFX9-NEXT:    s_endpgm
133;
134; GFX11-LABEL: v_clamp_neg_f32:
135; GFX11:       ; %bb.0:
136; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
137; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
138; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
139; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
140; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
141; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
142; GFX11-NEXT:    s_waitcnt vmcnt(0)
143; GFX11-NEXT:    v_max_f32_e64 v1, -v1, -v1 clamp
144; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
145; GFX11-NEXT:    s_endpgm
146;
147; GFX12-LABEL: v_clamp_neg_f32:
148; GFX12:       ; %bb.0:
149; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
150; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
151; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
152; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
153; GFX12-NEXT:    s_wait_kmcnt 0x0
154; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
155; GFX12-NEXT:    s_wait_loadcnt 0x0
156; GFX12-NEXT:    v_max_num_f32_e64 v1, -v1, -v1 clamp
157; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
158; GFX12-NEXT:    s_endpgm
159  %tid = call i32 @llvm.amdgcn.workitem.id.x()
160  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
161  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
162  %a = load float, ptr addrspace(1) %gep0
; The negation feeding the clamp; expected to become a source modifier.
163  %fneg.a = fneg float %a
164  %max = call float @llvm.maxnum.f32(float %fneg.a, float 0.0)
165  %med = call float @llvm.minnum.f32(float %max, float 1.0)
166
167  store float %med, ptr addrspace(1) %out.gep
168  ret void
169}
170
; f32 clamp of -|a|: the loaded value goes through llvm.fabs and then fneg
; before the maxnum(.., 0.0) / minnum(.., 1.0) pair.
; The assertions expect both operations to be absorbed as combined source
; modifiers on the clamping v_max_f32 (v_max_num_f32 for gfx1200),
; i.e. "-|v|, -|v| clamp".
171define amdgpu_kernel void @v_clamp_negabs_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
172; GFX6-LABEL: v_clamp_negabs_f32:
173; GFX6:       ; %bb.0:
174; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
175; GFX6-NEXT:    s_mov_b32 s7, 0xf000
176; GFX6-NEXT:    s_mov_b32 s6, 0
177; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
178; GFX6-NEXT:    v_mov_b32_e32 v1, 0
179; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
180; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
181; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
182; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
183; GFX6-NEXT:    s_waitcnt vmcnt(0)
184; GFX6-NEXT:    v_max_f32_e64 v2, -|v2|, -|v2| clamp
185; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
186; GFX6-NEXT:    s_endpgm
187;
188; GFX8-LABEL: v_clamp_negabs_f32:
189; GFX8:       ; %bb.0:
190; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
191; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
192; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
193; GFX8-NEXT:    v_mov_b32_e32 v1, s3
194; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
195; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
196; GFX8-NEXT:    flat_load_dword v3, v[0:1]
197; GFX8-NEXT:    v_mov_b32_e32 v1, s1
198; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
199; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
200; GFX8-NEXT:    s_waitcnt vmcnt(0)
201; GFX8-NEXT:    v_max_f32_e64 v2, -|v3|, -|v3| clamp
202; GFX8-NEXT:    flat_store_dword v[0:1], v2
203; GFX8-NEXT:    s_endpgm
204;
205; GFX9-LABEL: v_clamp_negabs_f32:
206; GFX9:       ; %bb.0:
207; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
208; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
209; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
210; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
211; GFX9-NEXT:    s_waitcnt vmcnt(0)
212; GFX9-NEXT:    v_max_f32_e64 v1, -|v1|, -|v1| clamp
213; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
214; GFX9-NEXT:    s_endpgm
215;
216; GFX11-LABEL: v_clamp_negabs_f32:
217; GFX11:       ; %bb.0:
218; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
219; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
220; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
221; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
222; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
223; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
224; GFX11-NEXT:    s_waitcnt vmcnt(0)
225; GFX11-NEXT:    v_max_f32_e64 v1, -|v1|, -|v1| clamp
226; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
227; GFX11-NEXT:    s_endpgm
228;
229; GFX12-LABEL: v_clamp_negabs_f32:
230; GFX12:       ; %bb.0:
231; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
232; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
233; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
234; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
235; GFX12-NEXT:    s_wait_kmcnt 0x0
236; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
237; GFX12-NEXT:    s_wait_loadcnt 0x0
238; GFX12-NEXT:    v_max_num_f32_e64 v1, -|v1|, -|v1| clamp
239; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
240; GFX12-NEXT:    s_endpgm
241  %tid = call i32 @llvm.amdgcn.workitem.id.x()
242  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
243  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
244  %a = load float, ptr addrspace(1) %gep0
; Build -|a|; both steps are expected to become source modifiers.
245  %fabs.a = call float @llvm.fabs.f32(float %a)
246  %fneg.fabs.a = fneg float %fabs.a
247
248  %max = call float @llvm.maxnum.f32(float %fneg.fabs.a, float 0.0)
249  %med = call float @llvm.minnum.f32(float %max, float 1.0)
250
251  store float %med, ptr addrspace(1) %out.gep
252  ret void
253}
254
; Clamp-like pattern whose lower bound is -0.0 instead of +0.0. The bounded
; value is produced by an "fadd nnan" of the loaded float and 0.5.
; The assertions expect NO clamp modifier here: tahiti, fiji and gfx900 keep a
; separate v_max_f32 against the literal 0x80000000 (the bit pattern of -0.0)
; followed by v_min_f32 with 1.0, while gfx1100 and gfx1200 use a single
; three-operand v_maxmin instruction with the same two constants.
255define amdgpu_kernel void @v_clamp_negzero_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
256; GFX6-LABEL: v_clamp_negzero_f32:
257; GFX6:       ; %bb.0:
258; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
259; GFX6-NEXT:    s_mov_b32 s7, 0xf000
260; GFX6-NEXT:    s_mov_b32 s6, 0
261; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
262; GFX6-NEXT:    v_mov_b32_e32 v1, 0
263; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
264; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
265; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
266; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
267; GFX6-NEXT:    s_waitcnt vmcnt(0)
268; GFX6-NEXT:    v_add_f32_e32 v2, 0.5, v2
269; GFX6-NEXT:    v_max_f32_e32 v2, 0x80000000, v2
270; GFX6-NEXT:    v_min_f32_e32 v2, 1.0, v2
271; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
272; GFX6-NEXT:    s_endpgm
273;
274; GFX8-LABEL: v_clamp_negzero_f32:
275; GFX8:       ; %bb.0:
276; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
277; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
278; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
279; GFX8-NEXT:    v_mov_b32_e32 v1, s3
280; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
281; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
282; GFX8-NEXT:    flat_load_dword v3, v[0:1]
283; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
284; GFX8-NEXT:    v_mov_b32_e32 v1, s1
285; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
286; GFX8-NEXT:    s_waitcnt vmcnt(0)
287; GFX8-NEXT:    v_add_f32_e32 v2, 0.5, v3
288; GFX8-NEXT:    v_max_f32_e32 v2, 0x80000000, v2
289; GFX8-NEXT:    v_min_f32_e32 v2, 1.0, v2
290; GFX8-NEXT:    flat_store_dword v[0:1], v2
291; GFX8-NEXT:    s_endpgm
292;
293; GFX9-LABEL: v_clamp_negzero_f32:
294; GFX9:       ; %bb.0:
295; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
296; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
297; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
298; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
299; GFX9-NEXT:    s_waitcnt vmcnt(0)
300; GFX9-NEXT:    v_add_f32_e32 v1, 0.5, v1
301; GFX9-NEXT:    v_max_f32_e32 v1, 0x80000000, v1
302; GFX9-NEXT:    v_min_f32_e32 v1, 1.0, v1
303; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
304; GFX9-NEXT:    s_endpgm
305;
306; GFX11-LABEL: v_clamp_negzero_f32:
307; GFX11:       ; %bb.0:
308; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
309; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
310; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
311; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
312; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
313; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
314; GFX11-NEXT:    s_waitcnt vmcnt(0)
315; GFX11-NEXT:    v_add_f32_e32 v1, 0.5, v1
316; GFX11-NEXT:    v_maxmin_f32 v1, v1, 0x80000000, 1.0
317; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
318; GFX11-NEXT:    s_endpgm
319;
320; GFX12-LABEL: v_clamp_negzero_f32:
321; GFX12:       ; %bb.0:
322; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
323; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
324; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
325; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
326; GFX12-NEXT:    s_wait_kmcnt 0x0
327; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
328; GFX12-NEXT:    s_wait_loadcnt 0x0
329; GFX12-NEXT:    v_add_f32_e32 v1, 0.5, v1
330; GFX12-NEXT:    v_maxmin_num_f32 v1, v1, 0x80000000, 1.0
331; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
332; GFX12-NEXT:    s_endpgm
333  %tid = call i32 @llvm.amdgcn.workitem.id.x()
334  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
335  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
336  %a = load float, ptr addrspace(1) %gep0
; The nnan flag on this fadd marks the bounded value as not-NaN.
337  %add = fadd nnan float %a, 0.5
; Note the -0.0 lower bound; this is what distinguishes the test from v_clamp_f32.
338  %max = call float @llvm.maxnum.f32(float %add, float -0.0)
339  %med = call float @llvm.minnum.f32(float %max, float 1.0)
340
341  store float %med, ptr addrspace(1) %out.gep
342  ret void
343}
344
345; FIXME: Weird inconsistency in how -0.0 is treated. Accepted if clamp
346; matched through med3, not if directly. Is this correct?
; Same -0.0 lower bound as v_clamp_negzero_f32, but the bounded value is the
; loaded float itself, with no nnan-flagged operation in front of it.
; The assertions expect one extra instruction on the loaded value before the
; max/min: v_mul_f32 by 1.0 on tahiti and fiji, and a v_max of the value with
; itself on gfx900, gfx1100 and gfx1200.
; NOTE(review): going by the function name this extra instruction presumably
; quiets a possible signaling NaN -- confirm against the backend.
; As in v_clamp_negzero_f32, no clamp modifier is expected: the bound against
; 0x80000000 and 1.0 stays as v_max/v_min or a single v_maxmin.
347define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
348; GFX6-LABEL: v_clamp_negzero_maybe_snan_f32:
349; GFX6:       ; %bb.0:
350; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
351; GFX6-NEXT:    s_mov_b32 s7, 0xf000
352; GFX6-NEXT:    s_mov_b32 s6, 0
353; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
354; GFX6-NEXT:    v_mov_b32_e32 v1, 0
355; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
356; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
357; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
358; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
359; GFX6-NEXT:    s_waitcnt vmcnt(0)
360; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
361; GFX6-NEXT:    v_max_f32_e32 v2, 0x80000000, v2
362; GFX6-NEXT:    v_min_f32_e32 v2, 1.0, v2
363; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
364; GFX6-NEXT:    s_endpgm
365;
366; GFX8-LABEL: v_clamp_negzero_maybe_snan_f32:
367; GFX8:       ; %bb.0:
368; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
369; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
370; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
371; GFX8-NEXT:    v_mov_b32_e32 v1, s3
372; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
373; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
374; GFX8-NEXT:    flat_load_dword v3, v[0:1]
375; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
376; GFX8-NEXT:    v_mov_b32_e32 v1, s1
377; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
378; GFX8-NEXT:    s_waitcnt vmcnt(0)
379; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v3
380; GFX8-NEXT:    v_max_f32_e32 v2, 0x80000000, v2
381; GFX8-NEXT:    v_min_f32_e32 v2, 1.0, v2
382; GFX8-NEXT:    flat_store_dword v[0:1], v2
383; GFX8-NEXT:    s_endpgm
384;
385; GFX9-LABEL: v_clamp_negzero_maybe_snan_f32:
386; GFX9:       ; %bb.0:
387; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
388; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
389; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
390; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
391; GFX9-NEXT:    s_waitcnt vmcnt(0)
392; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
393; GFX9-NEXT:    v_max_f32_e32 v1, 0x80000000, v1
394; GFX9-NEXT:    v_min_f32_e32 v1, 1.0, v1
395; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
396; GFX9-NEXT:    s_endpgm
397;
398; GFX11-LABEL: v_clamp_negzero_maybe_snan_f32:
399; GFX11:       ; %bb.0:
400; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
401; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
402; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
403; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
404; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
405; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
406; GFX11-NEXT:    s_waitcnt vmcnt(0)
407; GFX11-NEXT:    v_max_f32_e32 v1, v1, v1
408; GFX11-NEXT:    v_maxmin_f32 v1, v1, 0x80000000, 1.0
409; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
410; GFX11-NEXT:    s_endpgm
411;
412; GFX12-LABEL: v_clamp_negzero_maybe_snan_f32:
413; GFX12:       ; %bb.0:
414; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
415; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
416; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
417; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
418; GFX12-NEXT:    s_wait_kmcnt 0x0
419; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
420; GFX12-NEXT:    s_wait_loadcnt 0x0
421; GFX12-NEXT:    v_max_num_f32_e32 v1, v1, v1
422; GFX12-NEXT:    v_maxmin_num_f32 v1, v1, 0x80000000, 1.0
423; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
424; GFX12-NEXT:    s_endpgm
425  %tid = call i32 @llvm.amdgcn.workitem.id.x()
426  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
427  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
428  %a = load float, ptr addrspace(1) %gep0
; The raw loaded value is bounded directly, with -0.0 as the lower bound.
429  %max = call float @llvm.maxnum.f32(float %a, float -0.0)
430  %med = call float @llvm.minnum.f32(float %max, float 1.0)
431
432  store float %med, ptr addrspace(1) %out.gep
433  ret void
434}
435
; Clamp pattern where the intermediate maxnum(a, 0.0) result has a second use:
; besides feeding minnum(.., 1.0) it is also written out by a volatile store.
; The assertions expect no clamp modifier on any target. The max against 0 and
; the min against 1.0 stay separate instructions writing different registers,
; and both results are stored (the clamped value to %out, the max result via
; the volatile store).
436define amdgpu_kernel void @v_clamp_multi_use_max_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
437; GFX6-LABEL: v_clamp_multi_use_max_f32:
438; GFX6:       ; %bb.0:
439; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
440; GFX6-NEXT:    s_mov_b32 s6, 0
441; GFX6-NEXT:    s_mov_b32 s7, 0xf000
442; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
443; GFX6-NEXT:    v_mov_b32_e32 v1, 0
444; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
445; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
446; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
447; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
448; GFX6-NEXT:    s_mov_b32 s6, -1
449; GFX6-NEXT:    s_waitcnt vmcnt(0)
450; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
451; GFX6-NEXT:    v_max_f32_e32 v2, 0, v2
452; GFX6-NEXT:    v_min_f32_e32 v3, 1.0, v2
453; GFX6-NEXT:    buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
454; GFX6-NEXT:    buffer_store_dword v2, off, s[4:7], 0
455; GFX6-NEXT:    s_waitcnt vmcnt(0)
456; GFX6-NEXT:    s_endpgm
457;
458; GFX8-LABEL: v_clamp_multi_use_max_f32:
459; GFX8:       ; %bb.0:
460; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
461; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
462; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
463; GFX8-NEXT:    v_mov_b32_e32 v1, s3
464; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
465; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
466; GFX8-NEXT:    flat_load_dword v3, v[0:1]
467; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
468; GFX8-NEXT:    v_mov_b32_e32 v1, s1
469; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
470; GFX8-NEXT:    s_waitcnt vmcnt(0)
471; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v3
472; GFX8-NEXT:    v_max_f32_e32 v2, 0, v2
473; GFX8-NEXT:    v_min_f32_e32 v3, 1.0, v2
474; GFX8-NEXT:    flat_store_dword v[0:1], v3
475; GFX8-NEXT:    flat_store_dword v[0:1], v2
476; GFX8-NEXT:    s_waitcnt vmcnt(0)
477; GFX8-NEXT:    s_endpgm
478;
479; GFX9-LABEL: v_clamp_multi_use_max_f32:
480; GFX9:       ; %bb.0:
481; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
482; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
483; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
484; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
485; GFX9-NEXT:    s_waitcnt vmcnt(0)
486; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
487; GFX9-NEXT:    v_max_f32_e32 v1, 0, v1
488; GFX9-NEXT:    v_min_f32_e32 v2, 1.0, v1
489; GFX9-NEXT:    global_store_dword v0, v2, s[0:1]
490; GFX9-NEXT:    global_store_dword v[0:1], v1, off
491; GFX9-NEXT:    s_waitcnt vmcnt(0)
492; GFX9-NEXT:    s_endpgm
493;
494; GFX11-LABEL: v_clamp_multi_use_max_f32:
495; GFX11:       ; %bb.0:
496; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
497; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
498; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
499; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
500; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
501; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
502; GFX11-NEXT:    s_waitcnt vmcnt(0)
503; GFX11-NEXT:    v_max_f32_e32 v1, v1, v1
504; GFX11-NEXT:    v_max_f32_e32 v1, 0, v1
505; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
506; GFX11-NEXT:    v_min_f32_e32 v2, 1.0, v1
507; GFX11-NEXT:    global_store_b32 v0, v2, s[0:1]
508; GFX11-NEXT:    global_store_b32 v[0:1], v1, off dlc
509; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
510; GFX11-NEXT:    s_endpgm
511;
512; GFX12-LABEL: v_clamp_multi_use_max_f32:
513; GFX12:       ; %bb.0:
514; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
515; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
516; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
517; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
518; GFX12-NEXT:    s_wait_kmcnt 0x0
519; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
520; GFX12-NEXT:    s_wait_loadcnt 0x0
521; GFX12-NEXT:    v_max_num_f32_e32 v1, v1, v1
522; GFX12-NEXT:    v_max_num_f32_e32 v1, 0, v1
523; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
524; GFX12-NEXT:    v_min_num_f32_e32 v2, 1.0, v1
525; GFX12-NEXT:    global_store_b32 v0, v2, s[0:1]
526; GFX12-NEXT:    s_wait_storecnt 0x0
527; GFX12-NEXT:    global_store_b32 v[0:1], v1, off scope:SCOPE_SYS
528; GFX12-NEXT:    s_wait_storecnt 0x0
529; GFX12-NEXT:    s_endpgm
530  %tid = call i32 @llvm.amdgcn.workitem.id.x()
531  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
532  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
533  %a = load float, ptr addrspace(1) %gep0
534  %max = call float @llvm.maxnum.f32(float %a, float 0.0)
535  %med = call float @llvm.minnum.f32(float %max, float 1.0)
536
537  store float %med, ptr addrspace(1) %out.gep
; Second use of %max: the volatile store keeps the intermediate value live.
; The destination pointer is undef; only the extra use matters to this test.
538  store volatile float %max, ptr addrspace(1) undef
539  ret void
540}
541
; Half-precision counterpart of v_clamp_f32: each workitem loads one half,
; bounds it with maxnum(a, 0.0) then minnum(.., 1.0), and stores the result.
; Expected output differs by target. The tahiti assertions put the clamp
; modifier on the v_cvt_f32_f16 conversion and convert back with
; v_cvt_f16_f32; fiji, gfx900 and gfx1100 use a clamping v_max_f16 of the
; value with itself, and gfx1200 uses v_max_num_f16 with clamp.
542define amdgpu_kernel void @v_clamp_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
543; GFX6-LABEL: v_clamp_f16:
544; GFX6:       ; %bb.0:
545; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
546; GFX6-NEXT:    s_mov_b32 s7, 0xf000
547; GFX6-NEXT:    s_mov_b32 s6, 0
548; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
549; GFX6-NEXT:    v_mov_b32_e32 v1, 0
550; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
551; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
552; GFX6-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64
553; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
554; GFX6-NEXT:    s_waitcnt vmcnt(0)
555; GFX6-NEXT:    v_cvt_f32_f16_e64 v2, v2 clamp
556; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
557; GFX6-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
558; GFX6-NEXT:    s_endpgm
559;
560; GFX8-LABEL: v_clamp_f16:
561; GFX8:       ; %bb.0:
562; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
563; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
564; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
565; GFX8-NEXT:    v_mov_b32_e32 v1, s3
566; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
567; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
568; GFX8-NEXT:    flat_load_ushort v3, v[0:1]
569; GFX8-NEXT:    v_mov_b32_e32 v1, s1
570; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
571; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
572; GFX8-NEXT:    s_waitcnt vmcnt(0)
573; GFX8-NEXT:    v_max_f16_e64 v2, v3, v3 clamp
574; GFX8-NEXT:    flat_store_short v[0:1], v2
575; GFX8-NEXT:    s_endpgm
576;
577; GFX9-LABEL: v_clamp_f16:
578; GFX9:       ; %bb.0:
579; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
580; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
581; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
582; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3]
583; GFX9-NEXT:    s_waitcnt vmcnt(0)
584; GFX9-NEXT:    v_max_f16_e64 v1, v1, v1 clamp
585; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
586; GFX9-NEXT:    s_endpgm
587;
588; GFX11-LABEL: v_clamp_f16:
589; GFX11:       ; %bb.0:
590; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
591; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
592; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
593; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
594; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
595; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3]
596; GFX11-NEXT:    s_waitcnt vmcnt(0)
597; GFX11-NEXT:    v_max_f16_e64 v1, v1, v1 clamp
598; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
599; GFX11-NEXT:    s_endpgm
600;
601; GFX12-LABEL: v_clamp_f16:
602; GFX12:       ; %bb.0:
603; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
604; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
605; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
606; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
607; GFX12-NEXT:    s_wait_kmcnt 0x0
608; GFX12-NEXT:    global_load_u16 v1, v0, s[2:3]
609; GFX12-NEXT:    s_wait_loadcnt 0x0
610; GFX12-NEXT:    v_max_num_f16_e64 v1, v1, v1 clamp
611; GFX12-NEXT:    global_store_b16 v0, v1, s[0:1]
612; GFX12-NEXT:    s_endpgm
; The workitem id indexes both the input and the output half arrays.
613  %tid = call i32 @llvm.amdgcn.workitem.id.x()
614  %gep0 = getelementptr half, ptr addrspace(1) %aptr, i32 %tid
615  %out.gep = getelementptr half, ptr addrspace(1) %out, i32 %tid
616  %a = load half, ptr addrspace(1) %gep0
; Lower bound 0.0 first, then upper bound 1.0: the pattern under test.
617  %max = call half @llvm.maxnum.f16(half %a, half 0.0)
618  %med = call half @llvm.minnum.f16(half %max, half 1.0)
619
620  store half %med, ptr addrspace(1) %out.gep
621  ret void
622}
623
; Half-precision clamp of a negated input: the loaded half goes through fneg
; before the maxnum(.., 0.0) / minnum(.., 1.0) pair.
; The assertions expect the negation to be absorbed as a source modifier. On
; tahiti it sits on the clamping v_cvt_f32_f16 ("-v clamp"); on the other
; targets it sits on both operands of the clamping v_max_f16
; (v_max_num_f16 for gfx1200).
624define amdgpu_kernel void @v_clamp_neg_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
625; GFX6-LABEL: v_clamp_neg_f16:
626; GFX6:       ; %bb.0:
627; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
628; GFX6-NEXT:    s_mov_b32 s7, 0xf000
629; GFX6-NEXT:    s_mov_b32 s6, 0
630; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
631; GFX6-NEXT:    v_mov_b32_e32 v1, 0
632; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
633; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
634; GFX6-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64
635; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
636; GFX6-NEXT:    s_waitcnt vmcnt(0)
637; GFX6-NEXT:    v_cvt_f32_f16_e64 v2, -v2 clamp
638; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
639; GFX6-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
640; GFX6-NEXT:    s_endpgm
641;
642; GFX8-LABEL: v_clamp_neg_f16:
643; GFX8:       ; %bb.0:
644; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
645; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
646; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
647; GFX8-NEXT:    v_mov_b32_e32 v1, s3
648; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
649; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
650; GFX8-NEXT:    flat_load_ushort v3, v[0:1]
651; GFX8-NEXT:    v_mov_b32_e32 v1, s1
652; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
653; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
654; GFX8-NEXT:    s_waitcnt vmcnt(0)
655; GFX8-NEXT:    v_max_f16_e64 v2, -v3, -v3 clamp
656; GFX8-NEXT:    flat_store_short v[0:1], v2
657; GFX8-NEXT:    s_endpgm
658;
659; GFX9-LABEL: v_clamp_neg_f16:
660; GFX9:       ; %bb.0:
661; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
662; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
663; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
664; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3]
665; GFX9-NEXT:    s_waitcnt vmcnt(0)
666; GFX9-NEXT:    v_max_f16_e64 v1, -v1, -v1 clamp
667; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
668; GFX9-NEXT:    s_endpgm
669;
670; GFX11-LABEL: v_clamp_neg_f16:
671; GFX11:       ; %bb.0:
672; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
673; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
674; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
675; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
676; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
677; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3]
678; GFX11-NEXT:    s_waitcnt vmcnt(0)
679; GFX11-NEXT:    v_max_f16_e64 v1, -v1, -v1 clamp
680; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
681; GFX11-NEXT:    s_endpgm
682;
683; GFX12-LABEL: v_clamp_neg_f16:
684; GFX12:       ; %bb.0:
685; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
686; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
687; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
688; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
689; GFX12-NEXT:    s_wait_kmcnt 0x0
690; GFX12-NEXT:    global_load_u16 v1, v0, s[2:3]
691; GFX12-NEXT:    s_wait_loadcnt 0x0
692; GFX12-NEXT:    v_max_num_f16_e64 v1, -v1, -v1 clamp
693; GFX12-NEXT:    global_store_b16 v0, v1, s[0:1]
694; GFX12-NEXT:    s_endpgm
695  %tid = call i32 @llvm.amdgcn.workitem.id.x()
696  %gep0 = getelementptr half, ptr addrspace(1) %aptr, i32 %tid
697  %out.gep = getelementptr half, ptr addrspace(1) %out, i32 %tid
698  %a = load half, ptr addrspace(1) %gep0
; The negation feeding the clamp; expected to become a source modifier.
699  %fneg.a = fneg half %a
700  %max = call half @llvm.maxnum.f16(half %fneg.a, half 0.0)
701  %med = call half @llvm.minnum.f16(half %max, half 1.0)
702
703  store half %med, ptr addrspace(1) %out.gep
704  ret void
705}
706
; Half-precision clamp of -|a|: the loaded half goes through llvm.fabs and
; then fneg before the maxnum(.., 0.0) / minnum(.., 1.0) pair.
; The assertions expect both operations to be absorbed as combined "-|v|"
; source modifiers. On tahiti they sit on the clamping v_cvt_f32_f16; on the
; other targets they sit on both operands of the clamping v_max_f16
; (v_max_num_f16 for gfx1200).
707define amdgpu_kernel void @v_clamp_negabs_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
708; GFX6-LABEL: v_clamp_negabs_f16:
709; GFX6:       ; %bb.0:
710; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
711; GFX6-NEXT:    s_mov_b32 s7, 0xf000
712; GFX6-NEXT:    s_mov_b32 s6, 0
713; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
714; GFX6-NEXT:    v_mov_b32_e32 v1, 0
715; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
716; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
717; GFX6-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64
718; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
719; GFX6-NEXT:    s_waitcnt vmcnt(0)
720; GFX6-NEXT:    v_cvt_f32_f16_e64 v2, -|v2| clamp
721; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
722; GFX6-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
723; GFX6-NEXT:    s_endpgm
724;
725; GFX8-LABEL: v_clamp_negabs_f16:
726; GFX8:       ; %bb.0:
727; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
728; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
729; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
730; GFX8-NEXT:    v_mov_b32_e32 v1, s3
731; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
732; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
733; GFX8-NEXT:    flat_load_ushort v3, v[0:1]
734; GFX8-NEXT:    v_mov_b32_e32 v1, s1
735; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
736; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
737; GFX8-NEXT:    s_waitcnt vmcnt(0)
738; GFX8-NEXT:    v_max_f16_e64 v2, -|v3|, -|v3| clamp
739; GFX8-NEXT:    flat_store_short v[0:1], v2
740; GFX8-NEXT:    s_endpgm
741;
742; GFX9-LABEL: v_clamp_negabs_f16:
743; GFX9:       ; %bb.0:
744; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
745; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
746; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
747; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3]
748; GFX9-NEXT:    s_waitcnt vmcnt(0)
749; GFX9-NEXT:    v_max_f16_e64 v1, -|v1|, -|v1| clamp
750; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
751; GFX9-NEXT:    s_endpgm
752;
753; GFX11-LABEL: v_clamp_negabs_f16:
754; GFX11:       ; %bb.0:
755; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
756; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
757; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
758; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
759; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
760; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3]
761; GFX11-NEXT:    s_waitcnt vmcnt(0)
762; GFX11-NEXT:    v_max_f16_e64 v1, -|v1|, -|v1| clamp
763; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
764; GFX11-NEXT:    s_endpgm
765;
766; GFX12-LABEL: v_clamp_negabs_f16:
767; GFX12:       ; %bb.0:
768; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
769; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
770; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
771; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
772; GFX12-NEXT:    s_wait_kmcnt 0x0
773; GFX12-NEXT:    global_load_u16 v1, v0, s[2:3]
774; GFX12-NEXT:    s_wait_loadcnt 0x0
775; GFX12-NEXT:    v_max_num_f16_e64 v1, -|v1|, -|v1| clamp
776; GFX12-NEXT:    global_store_b16 v0, v1, s[0:1]
777; GFX12-NEXT:    s_endpgm
778  %tid = call i32 @llvm.amdgcn.workitem.id.x()
779  %gep0 = getelementptr half, ptr addrspace(1) %aptr, i32 %tid
780  %out.gep = getelementptr half, ptr addrspace(1) %out, i32 %tid
781  %a = load half, ptr addrspace(1) %gep0
; Build -|a|; both steps are expected to become source modifiers.
782  %fabs.a = call half @llvm.fabs.f16(half %a)
783  %fneg.fabs.a = fneg half %fabs.a
784
785  %max = call half @llvm.maxnum.f16(half %fneg.fabs.a, half 0.0)
786  %med = call half @llvm.minnum.f16(half %max, half 1.0)
787
788  store half %med, ptr addrspace(1) %out.gep
789  ret void
790}
791
; Basic f64 clamp: out[tid] = minnum(maxnum(a[tid], 0.0), 1.0).
; Every target's expected output contains one v_max_f64 of the loaded value with
; itself plus the clamp modifier (spelled v_max_num_f64 on gfx1200); no separate
; min/max instructions appear.
; The assertion lines below are autogenerated (see the NOTE at the top of the file).
792define amdgpu_kernel void @v_clamp_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
793; GFX6-LABEL: v_clamp_f64:
794; GFX6:       ; %bb.0:
795; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
796; GFX6-NEXT:    s_mov_b32 s7, 0xf000
797; GFX6-NEXT:    s_mov_b32 s6, 0
798; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
799; GFX6-NEXT:    v_mov_b32_e32 v1, 0
800; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
801; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
802; GFX6-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
803; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
804; GFX6-NEXT:    s_waitcnt vmcnt(0)
805; GFX6-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3] clamp
806; GFX6-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
807; GFX6-NEXT:    s_endpgm
808;
809; GFX8-LABEL: v_clamp_f64:
810; GFX8:       ; %bb.0:
811; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
812; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
813; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
814; GFX8-NEXT:    v_mov_b32_e32 v1, s3
815; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
816; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
817; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
818; GFX8-NEXT:    v_mov_b32_e32 v3, s1
819; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
820; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
821; GFX8-NEXT:    s_waitcnt vmcnt(0)
822; GFX8-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1] clamp
823; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
824; GFX8-NEXT:    s_endpgm
825;
826; GFX9-LABEL: v_clamp_f64:
827; GFX9:       ; %bb.0:
828; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
829; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
830; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
831; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
832; GFX9-NEXT:    s_waitcnt vmcnt(0)
833; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1] clamp
834; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
835; GFX9-NEXT:    s_endpgm
836;
837; GFX11-LABEL: v_clamp_f64:
838; GFX11:       ; %bb.0:
839; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
840; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
841; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
842; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
843; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
844; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
845; GFX11-NEXT:    s_waitcnt vmcnt(0)
846; GFX11-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1] clamp
847; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
848; GFX11-NEXT:    s_endpgm
849;
850; GFX12-LABEL: v_clamp_f64:
851; GFX12:       ; %bb.0:
852; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
853; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
854; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
855; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
856; GFX12-NEXT:    s_wait_kmcnt 0x0
857; GFX12-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
858; GFX12-NEXT:    s_wait_loadcnt 0x0
859; GFX12-NEXT:    v_max_num_f64_e64 v[0:1], v[0:1], v[0:1] clamp
860; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
861; GFX12-NEXT:    s_endpgm
862  %tid = call i32 @llvm.amdgcn.workitem.id.x()
863  %gep0 = getelementptr double, ptr addrspace(1) %aptr, i32 %tid
864  %out.gep = getelementptr double, ptr addrspace(1) %out, i32 %tid
865  %a = load double, ptr addrspace(1) %gep0
  ; maxnum against 0.0 followed by minnum against 1.0 is the clamp pattern under test.
866  %max = call double @llvm.maxnum.f64(double %a, double 0.0)
867  %med = call double @llvm.minnum.f64(double %max, double 1.0)
868
869  store double %med, ptr addrspace(1) %out.gep
870  ret void
871}
872
; f64 clamp of a negated input: out[tid] = minnum(maxnum(-a[tid], 0.0), 1.0).
; The expected output keeps a single v_max_f64 ... clamp (v_max_num_f64 on
; gfx1200) and carries the fneg as a negate source modifier on both operands,
; so no standalone negate instruction is emitted.
; The assertion lines below are autogenerated (see the NOTE at the top of the file).
873define amdgpu_kernel void @v_clamp_neg_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
874; GFX6-LABEL: v_clamp_neg_f64:
875; GFX6:       ; %bb.0:
876; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
877; GFX6-NEXT:    s_mov_b32 s7, 0xf000
878; GFX6-NEXT:    s_mov_b32 s6, 0
879; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
880; GFX6-NEXT:    v_mov_b32_e32 v1, 0
881; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
882; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
883; GFX6-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
884; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
885; GFX6-NEXT:    s_waitcnt vmcnt(0)
886; GFX6-NEXT:    v_max_f64 v[2:3], -v[2:3], -v[2:3] clamp
887; GFX6-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
888; GFX6-NEXT:    s_endpgm
889;
890; GFX8-LABEL: v_clamp_neg_f64:
891; GFX8:       ; %bb.0:
892; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
893; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
894; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
895; GFX8-NEXT:    v_mov_b32_e32 v1, s3
896; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
897; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
898; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
899; GFX8-NEXT:    v_mov_b32_e32 v3, s1
900; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
901; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
902; GFX8-NEXT:    s_waitcnt vmcnt(0)
903; GFX8-NEXT:    v_max_f64 v[0:1], -v[0:1], -v[0:1] clamp
904; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
905; GFX8-NEXT:    s_endpgm
906;
907; GFX9-LABEL: v_clamp_neg_f64:
908; GFX9:       ; %bb.0:
909; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
910; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
911; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
912; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
913; GFX9-NEXT:    s_waitcnt vmcnt(0)
914; GFX9-NEXT:    v_max_f64 v[0:1], -v[0:1], -v[0:1] clamp
915; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
916; GFX9-NEXT:    s_endpgm
917;
918; GFX11-LABEL: v_clamp_neg_f64:
919; GFX11:       ; %bb.0:
920; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
921; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
922; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
923; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
924; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
925; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
926; GFX11-NEXT:    s_waitcnt vmcnt(0)
927; GFX11-NEXT:    v_max_f64 v[0:1], -v[0:1], -v[0:1] clamp
928; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
929; GFX11-NEXT:    s_endpgm
930;
931; GFX12-LABEL: v_clamp_neg_f64:
932; GFX12:       ; %bb.0:
933; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
934; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
935; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
936; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
937; GFX12-NEXT:    s_wait_kmcnt 0x0
938; GFX12-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
939; GFX12-NEXT:    s_wait_loadcnt 0x0
940; GFX12-NEXT:    v_max_num_f64_e64 v[0:1], -v[0:1], -v[0:1] clamp
941; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
942; GFX12-NEXT:    s_endpgm
943  %tid = call i32 @llvm.amdgcn.workitem.id.x()
944  %gep0 = getelementptr double, ptr addrspace(1) %aptr, i32 %tid
945  %out.gep = getelementptr double, ptr addrspace(1) %out, i32 %tid
946  %a = load double, ptr addrspace(1) %gep0
947  %fneg.a = fneg double %a
  ; maxnum against 0.0 followed by minnum against 1.0 is the clamp pattern under test.
948  %max = call double @llvm.maxnum.f64(double %fneg.a, double 0.0)
949  %med = call double @llvm.minnum.f64(double %max, double 1.0)
950
951  store double %med, ptr addrspace(1) %out.gep
952  ret void
953}
954
; f64 clamp of a negated absolute value:
;   out[tid] = minnum(maxnum(-|a[tid]|, 0.0), 1.0)
; The expected output is a single v_max_f64 ... clamp (v_max_num_f64 on gfx1200)
; with the fabs and fneg both expressed as -|x| source modifiers on its operands.
; The assertion lines below are autogenerated (see the NOTE at the top of the file).
955define amdgpu_kernel void @v_clamp_negabs_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
956; GFX6-LABEL: v_clamp_negabs_f64:
957; GFX6:       ; %bb.0:
958; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
959; GFX6-NEXT:    s_mov_b32 s7, 0xf000
960; GFX6-NEXT:    s_mov_b32 s6, 0
961; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
962; GFX6-NEXT:    v_mov_b32_e32 v1, 0
963; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
964; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
965; GFX6-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
966; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
967; GFX6-NEXT:    s_waitcnt vmcnt(0)
968; GFX6-NEXT:    v_max_f64 v[2:3], -|v[2:3]|, -|v[2:3]| clamp
969; GFX6-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
970; GFX6-NEXT:    s_endpgm
971;
972; GFX8-LABEL: v_clamp_negabs_f64:
973; GFX8:       ; %bb.0:
974; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
975; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
976; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
977; GFX8-NEXT:    v_mov_b32_e32 v1, s3
978; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
979; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
980; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
981; GFX8-NEXT:    v_mov_b32_e32 v3, s1
982; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
983; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
984; GFX8-NEXT:    s_waitcnt vmcnt(0)
985; GFX8-NEXT:    v_max_f64 v[0:1], -|v[0:1]|, -|v[0:1]| clamp
986; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
987; GFX8-NEXT:    s_endpgm
988;
989; GFX9-LABEL: v_clamp_negabs_f64:
990; GFX9:       ; %bb.0:
991; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
992; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
993; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
994; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
995; GFX9-NEXT:    s_waitcnt vmcnt(0)
996; GFX9-NEXT:    v_max_f64 v[0:1], -|v[0:1]|, -|v[0:1]| clamp
997; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
998; GFX9-NEXT:    s_endpgm
999;
1000; GFX11-LABEL: v_clamp_negabs_f64:
1001; GFX11:       ; %bb.0:
1002; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1003; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1004; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1005; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1006; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1007; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
1008; GFX11-NEXT:    s_waitcnt vmcnt(0)
1009; GFX11-NEXT:    v_max_f64 v[0:1], -|v[0:1]|, -|v[0:1]| clamp
1010; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
1011; GFX11-NEXT:    s_endpgm
1012;
1013; GFX12-LABEL: v_clamp_negabs_f64:
1014; GFX12:       ; %bb.0:
1015; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1016; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1017; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1018; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1019; GFX12-NEXT:    s_wait_kmcnt 0x0
1020; GFX12-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
1021; GFX12-NEXT:    s_wait_loadcnt 0x0
1022; GFX12-NEXT:    v_max_num_f64_e64 v[0:1], -|v[0:1]|, -|v[0:1]| clamp
1023; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
1024; GFX12-NEXT:    s_endpgm
1025  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1026  %gep0 = getelementptr double, ptr addrspace(1) %aptr, i32 %tid
1027  %out.gep = getelementptr double, ptr addrspace(1) %out, i32 %tid
1028  %a = load double, ptr addrspace(1) %gep0
1029  %fabs.a = call double @llvm.fabs.f64(double %a)
1030  %fneg.fabs.a = fneg double %fabs.a
1031
  ; maxnum against 0.0 followed by minnum against 1.0 is the clamp pattern under test.
1032  %max = call double @llvm.maxnum.f64(double %fneg.fabs.a, double 0.0)
1033  %med = call double @llvm.minnum.f64(double %max, double 1.0)
1034
1035  store double %med, ptr addrspace(1) %out.gep
1036  ret void
1037}
1038
; Negative case: out[tid] = fmed3(-0.0, 1.0, a[tid]).
; With -0.0 (rather than +0.0) as the lower constant, none of the expected
; outputs uses the clamp modifier; a real v_med3_f32 (v_med3_num_f32 on gfx1200)
; is kept. The -0.0 operand shows up as the literal 0x80000000 on gfx1100 and
; gfx1200, and is placed in an SGPR via s_brev_b32 of 1 on the older targets.
; The assertion lines below are autogenerated (see the NOTE at the top of the file).
1039define amdgpu_kernel void @v_clamp_med3_aby_negzero_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
1040; GFX6-LABEL: v_clamp_med3_aby_negzero_f32:
1041; GFX6:       ; %bb.0:
1042; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1043; GFX6-NEXT:    s_mov_b32 s7, 0xf000
1044; GFX6-NEXT:    s_mov_b32 s6, 0
1045; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1046; GFX6-NEXT:    v_mov_b32_e32 v1, 0
1047; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1048; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
1049; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1050; GFX6-NEXT:    s_brev_b32 s4, 1
1051; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
1052; GFX6-NEXT:    s_waitcnt vmcnt(0)
1053; GFX6-NEXT:    v_med3_f32 v2, s4, 1.0, v2
1054; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1055; GFX6-NEXT:    s_endpgm
1056;
1057; GFX8-LABEL: v_clamp_med3_aby_negzero_f32:
1058; GFX8:       ; %bb.0:
1059; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1060; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1061; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1062; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1063; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1064; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1065; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1066; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1067; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1068; GFX8-NEXT:    s_brev_b32 s0, 1
1069; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1070; GFX8-NEXT:    s_waitcnt vmcnt(0)
1071; GFX8-NEXT:    v_med3_f32 v2, s0, 1.0, v3
1072; GFX8-NEXT:    flat_store_dword v[0:1], v2
1073; GFX8-NEXT:    s_endpgm
1074;
1075; GFX9-LABEL: v_clamp_med3_aby_negzero_f32:
1076; GFX9:       ; %bb.0:
1077; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1078; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1079; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1080; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
1081; GFX9-NEXT:    s_brev_b32 s2, 1
1082; GFX9-NEXT:    s_waitcnt vmcnt(0)
1083; GFX9-NEXT:    v_med3_f32 v1, s2, 1.0, v1
1084; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1085; GFX9-NEXT:    s_endpgm
1086;
1087; GFX11-LABEL: v_clamp_med3_aby_negzero_f32:
1088; GFX11:       ; %bb.0:
1089; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1090; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1091; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1092; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1093; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1094; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
1095; GFX11-NEXT:    s_waitcnt vmcnt(0)
1096; GFX11-NEXT:    v_med3_f32 v1, 0x80000000, 1.0, v1
1097; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1098; GFX11-NEXT:    s_endpgm
1099;
1100; GFX12-LABEL: v_clamp_med3_aby_negzero_f32:
1101; GFX12:       ; %bb.0:
1102; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1103; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1104; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1105; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1106; GFX12-NEXT:    s_wait_kmcnt 0x0
1107; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
1108; GFX12-NEXT:    s_wait_loadcnt 0x0
1109; GFX12-NEXT:    v_med3_num_f32 v1, 0x80000000, 1.0, v1
1110; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
1111; GFX12-NEXT:    s_endpgm
1112  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1113  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
1114  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
1115  %a = load float, ptr addrspace(1) %gep0
  ; The first constant is -0.0, not +0.0; this is the only difference from a plain 0.0/1.0 clamp.
1116  %med = call float @llvm.amdgcn.fmed3.f32(float -0.0, float 1.0, float %a)
1117  store float %med, ptr addrspace(1) %out.gep
1118  ret void
1119}
1120
; out[tid] = fmed3(0.0, 1.0, a[tid]), i.e. the median with the constants 0.0 and
; 1.0 in the first two operand slots and the loaded value last.
; Every target's expected output replaces the med3 with a v_max_f32 of the value
; with itself plus the clamp modifier (v_max_num_f32 on gfx1200).
; The assertion lines below are autogenerated (see the NOTE at the top of the file).
1121define amdgpu_kernel void @v_clamp_med3_aby_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
1122; GFX6-LABEL: v_clamp_med3_aby_f32:
1123; GFX6:       ; %bb.0:
1124; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1125; GFX6-NEXT:    s_mov_b32 s7, 0xf000
1126; GFX6-NEXT:    s_mov_b32 s6, 0
1127; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1128; GFX6-NEXT:    v_mov_b32_e32 v1, 0
1129; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1130; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
1131; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1132; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
1133; GFX6-NEXT:    s_waitcnt vmcnt(0)
1134; GFX6-NEXT:    v_max_f32_e64 v2, v2, v2 clamp
1135; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1136; GFX6-NEXT:    s_endpgm
1137;
1138; GFX8-LABEL: v_clamp_med3_aby_f32:
1139; GFX8:       ; %bb.0:
1140; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1141; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1142; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1143; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1144; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1145; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1146; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1147; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1148; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1149; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1150; GFX8-NEXT:    s_waitcnt vmcnt(0)
1151; GFX8-NEXT:    v_max_f32_e64 v2, v3, v3 clamp
1152; GFX8-NEXT:    flat_store_dword v[0:1], v2
1153; GFX8-NEXT:    s_endpgm
1154;
1155; GFX9-LABEL: v_clamp_med3_aby_f32:
1156; GFX9:       ; %bb.0:
1157; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1158; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1159; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1160; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
1161; GFX9-NEXT:    s_waitcnt vmcnt(0)
1162; GFX9-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
1163; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1164; GFX9-NEXT:    s_endpgm
1165;
1166; GFX11-LABEL: v_clamp_med3_aby_f32:
1167; GFX11:       ; %bb.0:
1168; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1169; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1170; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1171; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1172; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1173; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
1174; GFX11-NEXT:    s_waitcnt vmcnt(0)
1175; GFX11-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
1176; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1177; GFX11-NEXT:    s_endpgm
1178;
1179; GFX12-LABEL: v_clamp_med3_aby_f32:
1180; GFX12:       ; %bb.0:
1181; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1182; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1183; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1184; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1185; GFX12-NEXT:    s_wait_kmcnt 0x0
1186; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
1187; GFX12-NEXT:    s_wait_loadcnt 0x0
1188; GFX12-NEXT:    v_max_num_f32_e64 v1, v1, v1 clamp
1189; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
1190; GFX12-NEXT:    s_endpgm
1191  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1192  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
1193  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
1194  %a = load float, ptr addrspace(1) %gep0
  ; Operand order under test: (0.0, 1.0, value).
1195  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float %a)
1196  store float %med, ptr addrspace(1) %out.gep
1197  ret void
1198}
1199
; out[tid] = fmed3(1.0, 0.0, a[tid]): the two constants come first, with 1.0
; ahead of 0.0, and the loaded value is the last operand.
; Every target's expected output replaces the med3 with a v_max_f32 of the value
; with itself plus the clamp modifier (v_max_num_f32 on gfx1200).
; The assertion lines below are autogenerated (see the NOTE at the top of the file).
1200define amdgpu_kernel void @v_clamp_med3_bay_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
1201; GFX6-LABEL: v_clamp_med3_bay_f32:
1202; GFX6:       ; %bb.0:
1203; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1204; GFX6-NEXT:    s_mov_b32 s7, 0xf000
1205; GFX6-NEXT:    s_mov_b32 s6, 0
1206; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1207; GFX6-NEXT:    v_mov_b32_e32 v1, 0
1208; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1209; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
1210; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1211; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
1212; GFX6-NEXT:    s_waitcnt vmcnt(0)
1213; GFX6-NEXT:    v_max_f32_e64 v2, v2, v2 clamp
1214; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1215; GFX6-NEXT:    s_endpgm
1216;
1217; GFX8-LABEL: v_clamp_med3_bay_f32:
1218; GFX8:       ; %bb.0:
1219; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1220; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1221; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1222; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1223; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1224; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1225; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1226; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1227; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1228; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1229; GFX8-NEXT:    s_waitcnt vmcnt(0)
1230; GFX8-NEXT:    v_max_f32_e64 v2, v3, v3 clamp
1231; GFX8-NEXT:    flat_store_dword v[0:1], v2
1232; GFX8-NEXT:    s_endpgm
1233;
1234; GFX9-LABEL: v_clamp_med3_bay_f32:
1235; GFX9:       ; %bb.0:
1236; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1237; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1238; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1239; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
1240; GFX9-NEXT:    s_waitcnt vmcnt(0)
1241; GFX9-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
1242; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1243; GFX9-NEXT:    s_endpgm
1244;
1245; GFX11-LABEL: v_clamp_med3_bay_f32:
1246; GFX11:       ; %bb.0:
1247; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1248; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1249; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1250; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1251; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1252; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
1253; GFX11-NEXT:    s_waitcnt vmcnt(0)
1254; GFX11-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
1255; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1256; GFX11-NEXT:    s_endpgm
1257;
1258; GFX12-LABEL: v_clamp_med3_bay_f32:
1259; GFX12:       ; %bb.0:
1260; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1261; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1262; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1263; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1264; GFX12-NEXT:    s_wait_kmcnt 0x0
1265; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
1266; GFX12-NEXT:    s_wait_loadcnt 0x0
1267; GFX12-NEXT:    v_max_num_f32_e64 v1, v1, v1 clamp
1268; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
1269; GFX12-NEXT:    s_endpgm
1270  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1271  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
1272  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
1273  %a = load float, ptr addrspace(1) %gep0
  ; Operand order under test: (1.0, 0.0, value).
1274  %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float 0.0, float %a)
1275  store float %med, ptr addrspace(1) %out.gep
1276  ret void
1277}
1278
; out[tid] = fmed3(a[tid], 0.0, 1.0): the loaded value is the first operand,
; followed by the constants 0.0 and 1.0.
; Every target's expected output replaces the med3 with a v_max_f32 of the value
; with itself plus the clamp modifier (v_max_num_f32 on gfx1200).
; The assertion lines below are autogenerated (see the NOTE at the top of the file).
1279define amdgpu_kernel void @v_clamp_med3_yab_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
1280; GFX6-LABEL: v_clamp_med3_yab_f32:
1281; GFX6:       ; %bb.0:
1282; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1283; GFX6-NEXT:    s_mov_b32 s7, 0xf000
1284; GFX6-NEXT:    s_mov_b32 s6, 0
1285; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1286; GFX6-NEXT:    v_mov_b32_e32 v1, 0
1287; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1288; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
1289; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1290; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
1291; GFX6-NEXT:    s_waitcnt vmcnt(0)
1292; GFX6-NEXT:    v_max_f32_e64 v2, v2, v2 clamp
1293; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1294; GFX6-NEXT:    s_endpgm
1295;
1296; GFX8-LABEL: v_clamp_med3_yab_f32:
1297; GFX8:       ; %bb.0:
1298; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1299; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1300; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1301; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1302; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1303; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1304; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1305; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1306; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1307; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1308; GFX8-NEXT:    s_waitcnt vmcnt(0)
1309; GFX8-NEXT:    v_max_f32_e64 v2, v3, v3 clamp
1310; GFX8-NEXT:    flat_store_dword v[0:1], v2
1311; GFX8-NEXT:    s_endpgm
1312;
1313; GFX9-LABEL: v_clamp_med3_yab_f32:
1314; GFX9:       ; %bb.0:
1315; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1316; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1317; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1318; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
1319; GFX9-NEXT:    s_waitcnt vmcnt(0)
1320; GFX9-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
1321; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1322; GFX9-NEXT:    s_endpgm
1323;
1324; GFX11-LABEL: v_clamp_med3_yab_f32:
1325; GFX11:       ; %bb.0:
1326; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1327; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1328; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1329; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1330; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1331; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
1332; GFX11-NEXT:    s_waitcnt vmcnt(0)
1333; GFX11-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
1334; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1335; GFX11-NEXT:    s_endpgm
1336;
1337; GFX12-LABEL: v_clamp_med3_yab_f32:
1338; GFX12:       ; %bb.0:
1339; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1340; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1341; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1342; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1343; GFX12-NEXT:    s_wait_kmcnt 0x0
1344; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
1345; GFX12-NEXT:    s_wait_loadcnt 0x0
1346; GFX12-NEXT:    v_max_num_f32_e64 v1, v1, v1 clamp
1347; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
1348; GFX12-NEXT:    s_endpgm
1349  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1350  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
1351  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
1352  %a = load float, ptr addrspace(1) %gep0
  ; Operand order under test: (value, 0.0, 1.0).
1353  %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 0.0, float 1.0)
1354  store float %med, ptr addrspace(1) %out.gep
1355  ret void
1356}
1357
; out[tid] = fmed3(a[tid], 1.0, 0.0): the loaded value is the first operand,
; followed by the constants with 1.0 ahead of 0.0.
; Every target's expected output replaces the med3 with a v_max_f32 of the value
; with itself plus the clamp modifier (v_max_num_f32 on gfx1200).
; The assertion lines below are autogenerated (see the NOTE at the top of the file).
1358define amdgpu_kernel void @v_clamp_med3_yba_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
1359; GFX6-LABEL: v_clamp_med3_yba_f32:
1360; GFX6:       ; %bb.0:
1361; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1362; GFX6-NEXT:    s_mov_b32 s7, 0xf000
1363; GFX6-NEXT:    s_mov_b32 s6, 0
1364; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1365; GFX6-NEXT:    v_mov_b32_e32 v1, 0
1366; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1367; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
1368; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1369; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
1370; GFX6-NEXT:    s_waitcnt vmcnt(0)
1371; GFX6-NEXT:    v_max_f32_e64 v2, v2, v2 clamp
1372; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1373; GFX6-NEXT:    s_endpgm
1374;
1375; GFX8-LABEL: v_clamp_med3_yba_f32:
1376; GFX8:       ; %bb.0:
1377; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1378; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1379; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1380; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1381; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1382; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1383; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1384; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1385; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1386; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1387; GFX8-NEXT:    s_waitcnt vmcnt(0)
1388; GFX8-NEXT:    v_max_f32_e64 v2, v3, v3 clamp
1389; GFX8-NEXT:    flat_store_dword v[0:1], v2
1390; GFX8-NEXT:    s_endpgm
1391;
1392; GFX9-LABEL: v_clamp_med3_yba_f32:
1393; GFX9:       ; %bb.0:
1394; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1395; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1396; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1397; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
1398; GFX9-NEXT:    s_waitcnt vmcnt(0)
1399; GFX9-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
1400; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1401; GFX9-NEXT:    s_endpgm
1402;
1403; GFX11-LABEL: v_clamp_med3_yba_f32:
1404; GFX11:       ; %bb.0:
1405; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1406; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1407; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1408; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1409; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1410; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
1411; GFX11-NEXT:    s_waitcnt vmcnt(0)
1412; GFX11-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
1413; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1414; GFX11-NEXT:    s_endpgm
1415;
1416; GFX12-LABEL: v_clamp_med3_yba_f32:
1417; GFX12:       ; %bb.0:
1418; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1419; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1420; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1421; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1422; GFX12-NEXT:    s_wait_kmcnt 0x0
1423; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
1424; GFX12-NEXT:    s_wait_loadcnt 0x0
1425; GFX12-NEXT:    v_max_num_f32_e64 v1, v1, v1 clamp
1426; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
1427; GFX12-NEXT:    s_endpgm
1428  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1429  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
1430  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
1431  %a = load float, ptr addrspace(1) %gep0
  ; Operand order under test: (value, 1.0, 0.0).
1432  %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 1.0, float 0.0)
1433  store float %med, ptr addrspace(1) %out.gep
1434  ret void
1435}
1436
; out[tid] = fmed3(0.0, a[tid], 1.0): the loaded value sits in the middle
; operand slot, between the constants 0.0 and 1.0.
; Every target's expected output replaces the med3 with a v_max_f32 of the value
; with itself plus the clamp modifier (v_max_num_f32 on gfx1200).
; The assertion lines below are autogenerated (see the NOTE at the top of the file).
1437define amdgpu_kernel void @v_clamp_med3_ayb_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
1438; GFX6-LABEL: v_clamp_med3_ayb_f32:
1439; GFX6:       ; %bb.0:
1440; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1441; GFX6-NEXT:    s_mov_b32 s7, 0xf000
1442; GFX6-NEXT:    s_mov_b32 s6, 0
1443; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1444; GFX6-NEXT:    v_mov_b32_e32 v1, 0
1445; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1446; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
1447; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1448; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
1449; GFX6-NEXT:    s_waitcnt vmcnt(0)
1450; GFX6-NEXT:    v_max_f32_e64 v2, v2, v2 clamp
1451; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1452; GFX6-NEXT:    s_endpgm
1453;
1454; GFX8-LABEL: v_clamp_med3_ayb_f32:
1455; GFX8:       ; %bb.0:
1456; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1457; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1458; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1459; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1460; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1461; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1462; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1463; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1464; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1465; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1466; GFX8-NEXT:    s_waitcnt vmcnt(0)
1467; GFX8-NEXT:    v_max_f32_e64 v2, v3, v3 clamp
1468; GFX8-NEXT:    flat_store_dword v[0:1], v2
1469; GFX8-NEXT:    s_endpgm
1470;
1471; GFX9-LABEL: v_clamp_med3_ayb_f32:
1472; GFX9:       ; %bb.0:
1473; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1474; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1475; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1476; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
1477; GFX9-NEXT:    s_waitcnt vmcnt(0)
1478; GFX9-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
1479; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1480; GFX9-NEXT:    s_endpgm
1481;
1482; GFX11-LABEL: v_clamp_med3_ayb_f32:
1483; GFX11:       ; %bb.0:
1484; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1485; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1486; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1487; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1488; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1489; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
1490; GFX11-NEXT:    s_waitcnt vmcnt(0)
1491; GFX11-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
1492; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1493; GFX11-NEXT:    s_endpgm
1494;
1495; GFX12-LABEL: v_clamp_med3_ayb_f32:
1496; GFX12:       ; %bb.0:
1497; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1498; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1499; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1500; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1501; GFX12-NEXT:    s_wait_kmcnt 0x0
1502; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
1503; GFX12-NEXT:    s_wait_loadcnt 0x0
1504; GFX12-NEXT:    v_max_num_f32_e64 v1, v1, v1 clamp
1505; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
1506; GFX12-NEXT:    s_endpgm
1507  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1508  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
1509  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
1510  %a = load float, ptr addrspace(1) %gep0
  ; Operand order under test: (0.0, value, 1.0).
1511  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float %a, float 1.0)
1512  store float %med, ptr addrspace(1) %out.gep
1513  ret void
1514}
1515
; Clamp expressed as llvm.amdgcn.fmed3 with the constant bounds in the outer
; operand slots and in descending order (1.0, %a, 0.0). Each thread loads one
; float from %aptr[tid], takes the median, and stores it to %out[tid].
; The expected output on every tested target is a single max of the value with
; itself carrying the clamp modifier (v_max_f32 on gfx6/gfx8/gfx9/gfx11,
; v_max_num_f32 on gfx12) -- no v_med3_f32 is expected to remain.
; NOTE(review): attribute group #0 is defined outside this excerpt -- confirm
; which FP-mode settings it applies.
1516define amdgpu_kernel void @v_clamp_med3_bya_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
1517; GFX6-LABEL: v_clamp_med3_bya_f32:
1518; GFX6:       ; %bb.0:
1519; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1520; GFX6-NEXT:    s_mov_b32 s7, 0xf000
1521; GFX6-NEXT:    s_mov_b32 s6, 0
1522; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1523; GFX6-NEXT:    v_mov_b32_e32 v1, 0
1524; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1525; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
1526; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1527; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
1528; GFX6-NEXT:    s_waitcnt vmcnt(0)
1529; GFX6-NEXT:    v_max_f32_e64 v2, v2, v2 clamp
1530; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1531; GFX6-NEXT:    s_endpgm
1532;
1533; GFX8-LABEL: v_clamp_med3_bya_f32:
1534; GFX8:       ; %bb.0:
1535; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1536; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1537; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1538; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1539; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1540; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1541; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1542; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1543; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1544; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1545; GFX8-NEXT:    s_waitcnt vmcnt(0)
1546; GFX8-NEXT:    v_max_f32_e64 v2, v3, v3 clamp
1547; GFX8-NEXT:    flat_store_dword v[0:1], v2
1548; GFX8-NEXT:    s_endpgm
1549;
1550; GFX9-LABEL: v_clamp_med3_bya_f32:
1551; GFX9:       ; %bb.0:
1552; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1553; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1554; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1555; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
1556; GFX9-NEXT:    s_waitcnt vmcnt(0)
1557; GFX9-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
1558; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1559; GFX9-NEXT:    s_endpgm
1560;
1561; GFX11-LABEL: v_clamp_med3_bya_f32:
1562; GFX11:       ; %bb.0:
1563; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1564; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1565; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1566; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1567; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1568; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
1569; GFX11-NEXT:    s_waitcnt vmcnt(0)
1570; GFX11-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
1571; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1572; GFX11-NEXT:    s_endpgm
1573;
1574; GFX12-LABEL: v_clamp_med3_bya_f32:
1575; GFX12:       ; %bb.0:
1576; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1577; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1578; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1579; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1580; GFX12-NEXT:    s_wait_kmcnt 0x0
1581; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
1582; GFX12-NEXT:    s_wait_loadcnt 0x0
1583; GFX12-NEXT:    v_max_num_f32_e64 v1, v1, v1 clamp
1584; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
1585; GFX12-NEXT:    s_endpgm
1586  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1587  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
1588  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
1589  %a = load float, ptr addrspace(1) %gep0
  ; Bounds are passed upper-first (1.0 ... 0.0); the variable is the middle operand.
1590  %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float %a, float 0.0)
1591  store float %med, ptr addrspace(1) %out.gep
1592  ret void
1593}
1594
; Constant folding of a clamp whose input is above the upper bound:
; fmed3(0.0, 1.0, 4.0). The expected output on every tested target contains no
; clamp/med3 instruction at all; the literal 1.0 is materialized with a move
; and stored to %out[tid].
1595define amdgpu_kernel void @v_clamp_constants_to_one_f32(ptr addrspace(1) %out) #0 {
1596; GFX6-LABEL: v_clamp_constants_to_one_f32:
1597; GFX6:       ; %bb.0:
1598; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1599; GFX6-NEXT:    s_mov_b32 s3, 0xf000
1600; GFX6-NEXT:    s_mov_b32 s2, 0
1601; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1602; GFX6-NEXT:    v_mov_b32_e32 v1, 0
1603; GFX6-NEXT:    v_mov_b32_e32 v2, 1.0
1604; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1605; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1606; GFX6-NEXT:    s_endpgm
1607;
1608; GFX8-LABEL: v_clamp_constants_to_one_f32:
1609; GFX8:       ; %bb.0:
1610; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1611; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1612; GFX8-NEXT:    v_mov_b32_e32 v2, 1.0
1613; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1614; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1615; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
1616; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1617; GFX8-NEXT:    flat_store_dword v[0:1], v2
1618; GFX8-NEXT:    s_endpgm
1619;
1620; GFX9-LABEL: v_clamp_constants_to_one_f32:
1621; GFX9:       ; %bb.0:
1622; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1623; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1624; GFX9-NEXT:    v_mov_b32_e32 v1, 1.0
1625; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1626; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1627; GFX9-NEXT:    s_endpgm
1628;
1629; GFX11-LABEL: v_clamp_constants_to_one_f32:
1630; GFX11:       ; %bb.0:
1631; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1632; GFX11-NEXT:    v_dual_mov_b32 v1, 1.0 :: v_dual_and_b32 v0, 0x3ff, v0
1633; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1634; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1635; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1636; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1637; GFX11-NEXT:    s_endpgm
1638;
1639; GFX12-LABEL: v_clamp_constants_to_one_f32:
1640; GFX12:       ; %bb.0:
1641; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1642; GFX12-NEXT:    v_dual_mov_b32 v1, 1.0 :: v_dual_and_b32 v0, 0x3ff, v0
1643; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1644; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1645; GFX12-NEXT:    s_wait_kmcnt 0x0
1646; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
1647; GFX12-NEXT:    s_endpgm
1648  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1649  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
  ; All three operands are constants; the median of {0.0, 1.0, 4.0} is 1.0.
1650  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 4.0)
1651  store float %med, ptr addrspace(1) %out.gep
1652  ret void
1653}
1654
; Constant folding of a clamp whose input is below the lower bound:
; fmed3(0.0, 1.0, -4.0). The expected output on every tested target stores a
; register initialized with the literal 0; no clamp/med3 instruction remains.
1655define amdgpu_kernel void @v_clamp_constants_to_zero_f32(ptr addrspace(1) %out) #0 {
1656; GFX6-LABEL: v_clamp_constants_to_zero_f32:
1657; GFX6:       ; %bb.0:
1658; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1659; GFX6-NEXT:    s_mov_b32 s3, 0xf000
1660; GFX6-NEXT:    s_mov_b32 s2, 0
1661; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1662; GFX6-NEXT:    v_mov_b32_e32 v1, 0
1663; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1664; GFX6-NEXT:    buffer_store_dword v1, v[0:1], s[0:3], 0 addr64
1665; GFX6-NEXT:    s_endpgm
1666;
1667; GFX8-LABEL: v_clamp_constants_to_zero_f32:
1668; GFX8:       ; %bb.0:
1669; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1670; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1671; GFX8-NEXT:    v_mov_b32_e32 v2, 0
1672; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1673; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1674; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
1675; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1676; GFX8-NEXT:    flat_store_dword v[0:1], v2
1677; GFX8-NEXT:    s_endpgm
1678;
1679; GFX9-LABEL: v_clamp_constants_to_zero_f32:
1680; GFX9:       ; %bb.0:
1681; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1682; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1683; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1684; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1685; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1686; GFX9-NEXT:    s_endpgm
1687;
1688; GFX11-LABEL: v_clamp_constants_to_zero_f32:
1689; GFX11:       ; %bb.0:
1690; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1691; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
1692; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1693; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1694; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1695; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1696; GFX11-NEXT:    s_endpgm
1697;
1698; GFX12-LABEL: v_clamp_constants_to_zero_f32:
1699; GFX12:       ; %bb.0:
1700; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1701; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
1702; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1703; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1704; GFX12-NEXT:    s_wait_kmcnt 0x0
1705; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
1706; GFX12-NEXT:    s_endpgm
1707  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1708  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
  ; All three operands are constants; the median of {0.0, 1.0, -4.0} is 0.0.
1709  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float -4.0)
1710  store float %med, ptr addrspace(1) %out.gep
1711  ret void
1712}
1713
; Constant folding of a clamp whose input already lies inside [0.0, 1.0]:
; fmed3(0.0, 1.0, 0.5). The expected output on every tested target stores the
; literal 0.5 unchanged; no clamp/med3 instruction remains.
1714define amdgpu_kernel void @v_clamp_constant_preserve_f32(ptr addrspace(1) %out) #0 {
1715; GFX6-LABEL: v_clamp_constant_preserve_f32:
1716; GFX6:       ; %bb.0:
1717; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1718; GFX6-NEXT:    s_mov_b32 s3, 0xf000
1719; GFX6-NEXT:    s_mov_b32 s2, 0
1720; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1721; GFX6-NEXT:    v_mov_b32_e32 v1, 0
1722; GFX6-NEXT:    v_mov_b32_e32 v2, 0.5
1723; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1724; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1725; GFX6-NEXT:    s_endpgm
1726;
1727; GFX8-LABEL: v_clamp_constant_preserve_f32:
1728; GFX8:       ; %bb.0:
1729; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1730; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1731; GFX8-NEXT:    v_mov_b32_e32 v2, 0.5
1732; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1733; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1734; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
1735; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1736; GFX8-NEXT:    flat_store_dword v[0:1], v2
1737; GFX8-NEXT:    s_endpgm
1738;
1739; GFX9-LABEL: v_clamp_constant_preserve_f32:
1740; GFX9:       ; %bb.0:
1741; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1742; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1743; GFX9-NEXT:    v_mov_b32_e32 v1, 0.5
1744; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1745; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1746; GFX9-NEXT:    s_endpgm
1747;
1748; GFX11-LABEL: v_clamp_constant_preserve_f32:
1749; GFX11:       ; %bb.0:
1750; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1751; GFX11-NEXT:    v_dual_mov_b32 v1, 0.5 :: v_dual_and_b32 v0, 0x3ff, v0
1752; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1753; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1754; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1755; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1756; GFX11-NEXT:    s_endpgm
1757;
1758; GFX12-LABEL: v_clamp_constant_preserve_f32:
1759; GFX12:       ; %bb.0:
1760; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1761; GFX12-NEXT:    v_dual_mov_b32 v1, 0.5 :: v_dual_and_b32 v0, 0x3ff, v0
1762; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1763; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1764; GFX12-NEXT:    s_wait_kmcnt 0x0
1765; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
1766; GFX12-NEXT:    s_endpgm
1767  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1768  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
  ; All three operands are constants; 0.5 is already the median of {0.0, 1.0, 0.5}.
1769  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 0.5)
1770  store float %med, ptr addrspace(1) %out.gep
1771  ret void
1772}
1773
; Constant folding of a clamp whose input is a positive denormal constant:
; i32 8388607 is the bit pattern 0x7fffff (zero exponent field, all mantissa
; bits set). The expected output on every tested target stores that same bit
; pattern, i.e. the fold keeps the denormal instead of replacing it with 0.
; NOTE(review): whether this depends on the denormal mode selected by attribute
; group #0 cannot be seen from this excerpt -- confirm against the attributes.
1774define amdgpu_kernel void @v_clamp_constant_preserve_denorm_f32(ptr addrspace(1) %out) #0 {
1775; GFX6-LABEL: v_clamp_constant_preserve_denorm_f32:
1776; GFX6:       ; %bb.0:
1777; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1778; GFX6-NEXT:    s_mov_b32 s3, 0xf000
1779; GFX6-NEXT:    s_mov_b32 s2, 0
1780; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1781; GFX6-NEXT:    v_mov_b32_e32 v1, 0
1782; GFX6-NEXT:    v_mov_b32_e32 v2, 0x7fffff
1783; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1784; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1785; GFX6-NEXT:    s_endpgm
1786;
1787; GFX8-LABEL: v_clamp_constant_preserve_denorm_f32:
1788; GFX8:       ; %bb.0:
1789; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1790; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1791; GFX8-NEXT:    v_mov_b32_e32 v2, 0x7fffff
1792; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1793; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1794; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
1795; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1796; GFX8-NEXT:    flat_store_dword v[0:1], v2
1797; GFX8-NEXT:    s_endpgm
1798;
1799; GFX9-LABEL: v_clamp_constant_preserve_denorm_f32:
1800; GFX9:       ; %bb.0:
1801; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1802; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1803; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7fffff
1804; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1805; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1806; GFX9-NEXT:    s_endpgm
1807;
1808; GFX11-LABEL: v_clamp_constant_preserve_denorm_f32:
1809; GFX11:       ; %bb.0:
1810; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1811; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1812; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1813; GFX11-NEXT:    v_dual_mov_b32 v1, 0x7fffff :: v_dual_lshlrev_b32 v0, 2, v0
1814; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1815; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1816; GFX11-NEXT:    s_endpgm
1817;
1818; GFX12-LABEL: v_clamp_constant_preserve_denorm_f32:
1819; GFX12:       ; %bb.0:
1820; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1821; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1822; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1823; GFX12-NEXT:    v_dual_mov_b32 v1, 0x7fffff :: v_dual_lshlrev_b32 v0, 2, v0
1824; GFX12-NEXT:    s_wait_kmcnt 0x0
1825; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
1826; GFX12-NEXT:    s_endpgm
1827  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1828  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
  ; 8388607 == 0x7fffff: a denormal lying between the 0.0 and 1.0 bounds.
1829  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float bitcast (i32 8388607 to float))
1830  store float %med, ptr addrspace(1) %out.gep
1831  ret void
1832}
1833
; Constant folding of a clamp whose input is a quiet NaN constant
; (0x7FF8000000000000 in LLVM's double-precision hex notation for float
; literals). The expected output on every tested target stores the literal 0,
; so the NaN input folds to zero; no clamp/med3 instruction remains.
; NOTE(review): NaN-to-zero presumably reflects the dx10_clamp behavior
; configured via attribute group #0 -- confirm against the attribute list.
1834define amdgpu_kernel void @v_clamp_constant_qnan_f32(ptr addrspace(1) %out) #0 {
1835; GFX6-LABEL: v_clamp_constant_qnan_f32:
1836; GFX6:       ; %bb.0:
1837; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1838; GFX6-NEXT:    s_mov_b32 s3, 0xf000
1839; GFX6-NEXT:    s_mov_b32 s2, 0
1840; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1841; GFX6-NEXT:    v_mov_b32_e32 v1, 0
1842; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1843; GFX6-NEXT:    buffer_store_dword v1, v[0:1], s[0:3], 0 addr64
1844; GFX6-NEXT:    s_endpgm
1845;
1846; GFX8-LABEL: v_clamp_constant_qnan_f32:
1847; GFX8:       ; %bb.0:
1848; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1849; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1850; GFX8-NEXT:    v_mov_b32_e32 v2, 0
1851; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1852; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1853; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
1854; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1855; GFX8-NEXT:    flat_store_dword v[0:1], v2
1856; GFX8-NEXT:    s_endpgm
1857;
1858; GFX9-LABEL: v_clamp_constant_qnan_f32:
1859; GFX9:       ; %bb.0:
1860; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1861; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1862; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1863; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1864; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1865; GFX9-NEXT:    s_endpgm
1866;
1867; GFX11-LABEL: v_clamp_constant_qnan_f32:
1868; GFX11:       ; %bb.0:
1869; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1870; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
1871; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1872; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1873; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1874; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1875; GFX11-NEXT:    s_endpgm
1876;
1877; GFX12-LABEL: v_clamp_constant_qnan_f32:
1878; GFX12:       ; %bb.0:
1879; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1880; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
1881; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1882; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1883; GFX12-NEXT:    s_wait_kmcnt 0x0
1884; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
1885; GFX12-NEXT:    s_endpgm
1886  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1887  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
  ; Third operand is a quiet NaN literal.
1888  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 0x7FF8000000000000)
1889  store float %med, ptr addrspace(1) %out.gep
1890  ret void
1891}
1892
; Constant folding of a clamp whose input is a signaling NaN constant:
; i32 2139095041 is the bit pattern 0x7F800001 (exponent all ones, quiet bit
; clear, non-zero mantissa). As in the quiet-NaN case, the expected output on
; every tested target stores the literal 0; no clamp/med3 instruction remains.
; NOTE(review): NaN-to-zero presumably reflects the dx10_clamp behavior
; configured via attribute group #0 -- confirm against the attribute list.
1893define amdgpu_kernel void @v_clamp_constant_snan_f32(ptr addrspace(1) %out) #0 {
1894; GFX6-LABEL: v_clamp_constant_snan_f32:
1895; GFX6:       ; %bb.0:
1896; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1897; GFX6-NEXT:    s_mov_b32 s3, 0xf000
1898; GFX6-NEXT:    s_mov_b32 s2, 0
1899; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1900; GFX6-NEXT:    v_mov_b32_e32 v1, 0
1901; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1902; GFX6-NEXT:    buffer_store_dword v1, v[0:1], s[0:3], 0 addr64
1903; GFX6-NEXT:    s_endpgm
1904;
1905; GFX8-LABEL: v_clamp_constant_snan_f32:
1906; GFX8:       ; %bb.0:
1907; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1908; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1909; GFX8-NEXT:    v_mov_b32_e32 v2, 0
1910; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1911; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1912; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
1913; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1914; GFX8-NEXT:    flat_store_dword v[0:1], v2
1915; GFX8-NEXT:    s_endpgm
1916;
1917; GFX9-LABEL: v_clamp_constant_snan_f32:
1918; GFX9:       ; %bb.0:
1919; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1920; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1921; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1922; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1923; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1924; GFX9-NEXT:    s_endpgm
1925;
1926; GFX11-LABEL: v_clamp_constant_snan_f32:
1927; GFX11:       ; %bb.0:
1928; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1929; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
1930; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1931; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1932; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1933; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1934; GFX11-NEXT:    s_endpgm
1935;
1936; GFX12-LABEL: v_clamp_constant_snan_f32:
1937; GFX12:       ; %bb.0:
1938; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1939; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
1940; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1941; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1942; GFX12-NEXT:    s_wait_kmcnt 0x0
1943; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
1944; GFX12-NEXT:    s_endpgm
1945  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1946  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
  ; 2139095041 == 0x7F800001: a signaling NaN bit pattern.
1947  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float bitcast (i32 2139095041 to float))
1948  store float %med, ptr addrspace(1) %out.gep
1949  ret void
1950}
1951
1952; ---------------------------------------------------------------------
1953; Test non-default behaviors enabling snans and disabling dx10_clamp
1954; ---------------------------------------------------------------------
1955
; Clamp written as minnum(maxnum(x, 0.0), 1.0) where x comes from an
; `fadd nnan` (so the clamp source is flagged as never NaN), compiled under
; attribute group #2.
; Expected output differs by target:
;   * gfx6/gfx8/gfx9/gfx11 keep a separate v_add_f32 followed by
;     v_med3_f32 x, 0, 1.0 -- the clamp modifier is not used.
;   * gfx12 folds everything into a single v_add_f32_e64 with the clamp
;     modifier.
; NOTE(review): #2 is defined outside this excerpt; per the section banner and
; the function name it presumably disables dx10_clamp -- confirm. The reason
; gfx12 still uses the clamp modifier is not visible here either.
1956define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #2 {
1957; GFX6-LABEL: v_clamp_f32_no_dx10_clamp:
1958; GFX6:       ; %bb.0:
1959; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1960; GFX6-NEXT:    s_mov_b32 s7, 0xf000
1961; GFX6-NEXT:    s_mov_b32 s6, 0
1962; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1963; GFX6-NEXT:    v_mov_b32_e32 v1, 0
1964; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1965; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
1966; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1967; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
1968; GFX6-NEXT:    s_waitcnt vmcnt(0)
1969; GFX6-NEXT:    v_add_f32_e32 v2, 0.5, v2
1970; GFX6-NEXT:    v_med3_f32 v2, v2, 0, 1.0
1971; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1972; GFX6-NEXT:    s_endpgm
1973;
1974; GFX8-LABEL: v_clamp_f32_no_dx10_clamp:
1975; GFX8:       ; %bb.0:
1976; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1977; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1978; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1979; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1980; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1981; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1982; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1983; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1984; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1985; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1986; GFX8-NEXT:    s_waitcnt vmcnt(0)
1987; GFX8-NEXT:    v_add_f32_e32 v2, 0.5, v3
1988; GFX8-NEXT:    v_med3_f32 v2, v2, 0, 1.0
1989; GFX8-NEXT:    flat_store_dword v[0:1], v2
1990; GFX8-NEXT:    s_endpgm
1991;
1992; GFX9-LABEL: v_clamp_f32_no_dx10_clamp:
1993; GFX9:       ; %bb.0:
1994; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1995; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1996; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1997; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
1998; GFX9-NEXT:    s_waitcnt vmcnt(0)
1999; GFX9-NEXT:    v_add_f32_e32 v1, 0.5, v1
2000; GFX9-NEXT:    v_med3_f32 v1, v1, 0, 1.0
2001; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
2002; GFX9-NEXT:    s_endpgm
2003;
2004; GFX11-LABEL: v_clamp_f32_no_dx10_clamp:
2005; GFX11:       ; %bb.0:
2006; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2007; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2008; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
2009; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2010; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2011; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
2012; GFX11-NEXT:    s_waitcnt vmcnt(0)
2013; GFX11-NEXT:    v_add_f32_e32 v1, 0.5, v1
2014; GFX11-NEXT:    v_med3_f32 v1, v1, 0, 1.0
2015; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
2016; GFX11-NEXT:    s_endpgm
2017;
2018; GFX12-LABEL: v_clamp_f32_no_dx10_clamp:
2019; GFX12:       ; %bb.0:
2020; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2021; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2022; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2023; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2024; GFX12-NEXT:    s_wait_kmcnt 0x0
2025; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
2026; GFX12-NEXT:    s_wait_loadcnt 0x0
2027; GFX12-NEXT:    v_add_f32_e64 v1, v1, 0.5 clamp
2028; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
2029; GFX12-NEXT:    s_endpgm
2030  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2031  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
2032  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
2033  %a = load float, ptr addrspace(1) %gep0
  ; The nnan flag marks the clamp source as never NaN.
2034  %a.nnan = fadd nnan float %a, 0.5
2035  %max = call float @llvm.maxnum.f32(float %a.nnan, float 0.0)
2036  %med = call float @llvm.minnum.f32(float %max, float 1.0)
2037
2038  store float %med, ptr addrspace(1) %out.gep
2039  ret void
2040}
2041
; Clamp written as minnum(maxnum(x, 0.0), 1.0) where x comes from a plain
; `fadd` (no nnan flag), compiled under attribute group #3.
; The expected output on every tested target folds the whole sequence into a
; single v_add_f32_e64 with the clamp modifier; no v_med3_f32 or separate
; max/min remains.
; NOTE(review): #3 is defined outside this excerpt; per the section banner and
; the function name it presumably enables signaling-NaN handling while keeping
; dx10_clamp on -- confirm.
2042define amdgpu_kernel void @v_clamp_f32_snan_dx10clamp(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #3 {
2043; GFX6-LABEL: v_clamp_f32_snan_dx10clamp:
2044; GFX6:       ; %bb.0:
2045; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2046; GFX6-NEXT:    s_mov_b32 s7, 0xf000
2047; GFX6-NEXT:    s_mov_b32 s6, 0
2048; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2049; GFX6-NEXT:    v_mov_b32_e32 v1, 0
2050; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2051; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
2052; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2053; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
2054; GFX6-NEXT:    s_waitcnt vmcnt(0)
2055; GFX6-NEXT:    v_add_f32_e64 v2, v2, 0.5 clamp
2056; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2057; GFX6-NEXT:    s_endpgm
2058;
2059; GFX8-LABEL: v_clamp_f32_snan_dx10clamp:
2060; GFX8:       ; %bb.0:
2061; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2062; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2063; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2064; GFX8-NEXT:    v_mov_b32_e32 v1, s3
2065; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
2066; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2067; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2068; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2069; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
2070; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2071; GFX8-NEXT:    s_waitcnt vmcnt(0)
2072; GFX8-NEXT:    v_add_f32_e64 v2, v3, 0.5 clamp
2073; GFX8-NEXT:    flat_store_dword v[0:1], v2
2074; GFX8-NEXT:    s_endpgm
2075;
2076; GFX9-LABEL: v_clamp_f32_snan_dx10clamp:
2077; GFX9:       ; %bb.0:
2078; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2079; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2080; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2081; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
2082; GFX9-NEXT:    s_waitcnt vmcnt(0)
2083; GFX9-NEXT:    v_add_f32_e64 v1, v1, 0.5 clamp
2084; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
2085; GFX9-NEXT:    s_endpgm
2086;
2087; GFX11-LABEL: v_clamp_f32_snan_dx10clamp:
2088; GFX11:       ; %bb.0:
2089; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2090; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2091; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2092; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2093; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2094; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
2095; GFX11-NEXT:    s_waitcnt vmcnt(0)
2096; GFX11-NEXT:    v_add_f32_e64 v1, v1, 0.5 clamp
2097; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
2098; GFX11-NEXT:    s_endpgm
2099;
2100; GFX12-LABEL: v_clamp_f32_snan_dx10clamp:
2101; GFX12:       ; %bb.0:
2102; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2103; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2104; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2105; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2106; GFX12-NEXT:    s_wait_kmcnt 0x0
2107; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
2108; GFX12-NEXT:    s_wait_loadcnt 0x0
2109; GFX12-NEXT:    v_add_f32_e64 v1, v1, 0.5 clamp
2110; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
2111; GFX12-NEXT:    s_endpgm
2112  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2113  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
2114  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
2115  %a = load float, ptr addrspace(1) %gep0
  ; No fast-math flags here: the add result may be NaN.
2116  %add = fadd float %a, 0.5
2117  %max = call float @llvm.maxnum.f32(float %add, float 0.0)
2118  %med = call float @llvm.minnum.f32(float %max, float 1.0)
2119
2120  store float %med, ptr addrspace(1) %out.gep
2121  ret void
2122}
2123
; Clamp written as minnum(maxnum(%a, 0.0), 1.0) applied directly to a loaded
; value (no arithmetic producer), compiled under attribute group #4.
; Expected output differs by target:
;   * gfx6/gfx8: v_mul_f32 by 1.0 on the loaded value, then
;     v_med3_f32 x, 0, 1.0.
;   * gfx9/gfx11: v_max_f32 of the value with itself, then
;     v_med3_f32 x, 0, 1.0.
;   * gfx12: a single v_max_num_f32_e64 with the clamp modifier.
; NOTE(review): #4 is defined outside this excerpt; per the section banner and
; the function name it presumably enables signaling-NaN handling and disables
; dx10_clamp -- confirm. The extra mul-by-1.0 / self-max presumably quiets a
; possible signaling NaN before the med3 -- confirm against the lowering code.
2124define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #4 {
2125; GFX6-LABEL: v_clamp_f32_snan_no_dx10clamp:
2126; GFX6:       ; %bb.0:
2127; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2128; GFX6-NEXT:    s_mov_b32 s7, 0xf000
2129; GFX6-NEXT:    s_mov_b32 s6, 0
2130; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2131; GFX6-NEXT:    v_mov_b32_e32 v1, 0
2132; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2133; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
2134; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2135; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
2136; GFX6-NEXT:    s_waitcnt vmcnt(0)
2137; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
2138; GFX6-NEXT:    v_med3_f32 v2, v2, 0, 1.0
2139; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2140; GFX6-NEXT:    s_endpgm
2141;
2142; GFX8-LABEL: v_clamp_f32_snan_no_dx10clamp:
2143; GFX8:       ; %bb.0:
2144; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2145; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2146; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2147; GFX8-NEXT:    v_mov_b32_e32 v1, s3
2148; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
2149; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2150; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2151; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2152; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
2153; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2154; GFX8-NEXT:    s_waitcnt vmcnt(0)
2155; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v3
2156; GFX8-NEXT:    v_med3_f32 v2, v2, 0, 1.0
2157; GFX8-NEXT:    flat_store_dword v[0:1], v2
2158; GFX8-NEXT:    s_endpgm
2159;
2160; GFX9-LABEL: v_clamp_f32_snan_no_dx10clamp:
2161; GFX9:       ; %bb.0:
2162; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2163; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2164; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2165; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
2166; GFX9-NEXT:    s_waitcnt vmcnt(0)
2167; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
2168; GFX9-NEXT:    v_med3_f32 v1, v1, 0, 1.0
2169; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
2170; GFX9-NEXT:    s_endpgm
2171;
2172; GFX11-LABEL: v_clamp_f32_snan_no_dx10clamp:
2173; GFX11:       ; %bb.0:
2174; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2175; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2176; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
2177; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2178; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2179; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
2180; GFX11-NEXT:    s_waitcnt vmcnt(0)
2181; GFX11-NEXT:    v_max_f32_e32 v1, v1, v1
2182; GFX11-NEXT:    v_med3_f32 v1, v1, 0, 1.0
2183; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
2184; GFX11-NEXT:    s_endpgm
2185;
2186; GFX12-LABEL: v_clamp_f32_snan_no_dx10clamp:
2187; GFX12:       ; %bb.0:
2188; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2189; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2190; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2191; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2192; GFX12-NEXT:    s_wait_kmcnt 0x0
2193; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
2194; GFX12-NEXT:    s_wait_loadcnt 0x0
2195; GFX12-NEXT:    v_max_num_f32_e64 v1, v1, v1 clamp
2196; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
2197; GFX12-NEXT:    s_endpgm
2198  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2199  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
2200  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
2201  %a = load float, ptr addrspace(1) %gep0
  ; The clamp source is the raw load, so nothing is known about NaN-ness.
2202  %max = call float @llvm.maxnum.f32(float %a, float 0.0)
2203  %med = call float @llvm.minnum.f32(float %max, float 1.0)
2204
2205  store float %med, ptr addrspace(1) %out.gep
2206  ret void
2207}
2208
; Clamp applied to the result of an nnan-flagged fadd: loads aptr[tid], adds
; 1.0, then applies maxnum(x, 0.0) followed by minnum(x, 1.0) and stores the
; result to out[tid].
; Expected code: the tahiti, fiji, gfx900 and gfx1100 runs keep v_add_f32 plus a
; separate v_med3_f32 against 0 and 1.0; the gfx1200 run emits a single
; v_add_f32_e64 carrying the clamp modifier.
; NOTE(review): attribute group #4 is defined outside this excerpt - confirm its
; contents before relying on what it enables or disables.
; The check lines are autogenerated (see the note on the first line of the
; file); regenerate them with that script rather than editing by hand.
2209define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #4 {
2210; GFX6-LABEL: v_clamp_f32_snan_no_dx10clamp_nnan_src:
2211; GFX6:       ; %bb.0:
2212; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2213; GFX6-NEXT:    s_mov_b32 s7, 0xf000
2214; GFX6-NEXT:    s_mov_b32 s6, 0
2215; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2216; GFX6-NEXT:    v_mov_b32_e32 v1, 0
2217; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2218; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
2219; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2220; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
2221; GFX6-NEXT:    s_waitcnt vmcnt(0)
2222; GFX6-NEXT:    v_add_f32_e32 v2, 1.0, v2
2223; GFX6-NEXT:    v_med3_f32 v2, v2, 0, 1.0
2224; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2225; GFX6-NEXT:    s_endpgm
2226;
2227; GFX8-LABEL: v_clamp_f32_snan_no_dx10clamp_nnan_src:
2228; GFX8:       ; %bb.0:
2229; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2230; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2231; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2232; GFX8-NEXT:    v_mov_b32_e32 v1, s3
2233; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
2234; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2235; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2236; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2237; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
2238; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2239; GFX8-NEXT:    s_waitcnt vmcnt(0)
2240; GFX8-NEXT:    v_add_f32_e32 v2, 1.0, v3
2241; GFX8-NEXT:    v_med3_f32 v2, v2, 0, 1.0
2242; GFX8-NEXT:    flat_store_dword v[0:1], v2
2243; GFX8-NEXT:    s_endpgm
2244;
2245; GFX9-LABEL: v_clamp_f32_snan_no_dx10clamp_nnan_src:
2246; GFX9:       ; %bb.0:
2247; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2248; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2249; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2250; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
2251; GFX9-NEXT:    s_waitcnt vmcnt(0)
2252; GFX9-NEXT:    v_add_f32_e32 v1, 1.0, v1
2253; GFX9-NEXT:    v_med3_f32 v1, v1, 0, 1.0
2254; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
2255; GFX9-NEXT:    s_endpgm
2256;
2257; GFX11-LABEL: v_clamp_f32_snan_no_dx10clamp_nnan_src:
2258; GFX11:       ; %bb.0:
2259; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2260; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2261; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
2262; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2263; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2264; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
2265; GFX11-NEXT:    s_waitcnt vmcnt(0)
2266; GFX11-NEXT:    v_add_f32_e32 v1, 1.0, v1
2267; GFX11-NEXT:    v_med3_f32 v1, v1, 0, 1.0
2268; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
2269; GFX11-NEXT:    s_endpgm
2270;
2271; GFX12-LABEL: v_clamp_f32_snan_no_dx10clamp_nnan_src:
2272; GFX12:       ; %bb.0:
2273; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2274; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2275; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2276; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2277; GFX12-NEXT:    s_wait_kmcnt 0x0
2278; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
2279; GFX12-NEXT:    s_wait_loadcnt 0x0
2280; GFX12-NEXT:    v_add_f32_e64 v1, v1, 1.0 clamp
2281; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
2282; GFX12-NEXT:    s_endpgm
2283  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2284  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
2285  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
2286  %a = load float, ptr addrspace(1) %gep0
2287  %add  = fadd nnan float %a, 1.0
2288  %max = call float @llvm.maxnum.f32(float %add, float 0.0)
2289  %med = call float @llvm.minnum.f32(float %max, float 1.0)
2290
2291  store float %med, ptr addrspace(1) %out.gep
2292  ret void
2293}
2294
; fmed3 clamp with the variable as the last operand: loads aptr[tid], computes
; llvm.amdgcn.fmed3(0.0, 1.0, %a) and stores the result to out[tid]. The "aby"
; suffix matches that operand order (a = 0.0, b = 1.0, y = the loaded value).
; Expected code: every run selects a max of the loaded value with itself
; carrying the clamp modifier (v_max_f32_e64 on tahiti, fiji, gfx900 and
; gfx1100; v_max_num_f32_e64 on gfx1200) instead of a v_med3_f32.
; NOTE(review): attribute group #2 is defined outside this excerpt - confirm its
; contents before relying on what it enables or disables.
; The check lines are autogenerated (see the note on the first line of the
; file); regenerate them with that script rather than editing by hand.
2295define amdgpu_kernel void @v_clamp_med3_aby_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #2 {
2296; GFX6-LABEL: v_clamp_med3_aby_f32_no_dx10_clamp:
2297; GFX6:       ; %bb.0:
2298; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2299; GFX6-NEXT:    s_mov_b32 s7, 0xf000
2300; GFX6-NEXT:    s_mov_b32 s6, 0
2301; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2302; GFX6-NEXT:    v_mov_b32_e32 v1, 0
2303; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2304; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
2305; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2306; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
2307; GFX6-NEXT:    s_waitcnt vmcnt(0)
2308; GFX6-NEXT:    v_max_f32_e64 v2, v2, v2 clamp
2309; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2310; GFX6-NEXT:    s_endpgm
2311;
2312; GFX8-LABEL: v_clamp_med3_aby_f32_no_dx10_clamp:
2313; GFX8:       ; %bb.0:
2314; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2315; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2316; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2317; GFX8-NEXT:    v_mov_b32_e32 v1, s3
2318; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
2319; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2320; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2321; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2322; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
2323; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2324; GFX8-NEXT:    s_waitcnt vmcnt(0)
2325; GFX8-NEXT:    v_max_f32_e64 v2, v3, v3 clamp
2326; GFX8-NEXT:    flat_store_dword v[0:1], v2
2327; GFX8-NEXT:    s_endpgm
2328;
2329; GFX9-LABEL: v_clamp_med3_aby_f32_no_dx10_clamp:
2330; GFX9:       ; %bb.0:
2331; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2332; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2333; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2334; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
2335; GFX9-NEXT:    s_waitcnt vmcnt(0)
2336; GFX9-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
2337; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
2338; GFX9-NEXT:    s_endpgm
2339;
2340; GFX11-LABEL: v_clamp_med3_aby_f32_no_dx10_clamp:
2341; GFX11:       ; %bb.0:
2342; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2343; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2344; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2345; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2346; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2347; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
2348; GFX11-NEXT:    s_waitcnt vmcnt(0)
2349; GFX11-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
2350; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
2351; GFX11-NEXT:    s_endpgm
2352;
2353; GFX12-LABEL: v_clamp_med3_aby_f32_no_dx10_clamp:
2354; GFX12:       ; %bb.0:
2355; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2356; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2357; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2358; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2359; GFX12-NEXT:    s_wait_kmcnt 0x0
2360; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
2361; GFX12-NEXT:    s_wait_loadcnt 0x0
2362; GFX12-NEXT:    v_max_num_f32_e64 v1, v1, v1 clamp
2363; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
2364; GFX12-NEXT:    s_endpgm
2365  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2366  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
2367  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
2368  %a = load float, ptr addrspace(1) %gep0
2369  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float %a)
2370  store float %med, ptr addrspace(1) %out.gep
2371  ret void
2372}
2373
; fmed3 clamp with swapped bounds and the variable last: loads aptr[tid],
; computes llvm.amdgcn.fmed3(1.0, 0.0, %a) and stores the result to out[tid].
; The "bay" suffix matches that operand order (b = 1.0, a = 0.0, y = the loaded
; value).
; Expected code: every run selects a max of the loaded value with itself
; carrying the clamp modifier (v_max_f32_e64 on tahiti, fiji, gfx900 and
; gfx1100; v_max_num_f32_e64 on gfx1200) instead of a v_med3_f32.
; NOTE(review): attribute group #2 is defined outside this excerpt - confirm its
; contents before relying on what it enables or disables.
; The check lines are autogenerated (see the note on the first line of the
; file); regenerate them with that script rather than editing by hand.
2374define amdgpu_kernel void @v_clamp_med3_bay_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #2 {
2375; GFX6-LABEL: v_clamp_med3_bay_f32_no_dx10_clamp:
2376; GFX6:       ; %bb.0:
2377; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2378; GFX6-NEXT:    s_mov_b32 s7, 0xf000
2379; GFX6-NEXT:    s_mov_b32 s6, 0
2380; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2381; GFX6-NEXT:    v_mov_b32_e32 v1, 0
2382; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2383; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
2384; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2385; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
2386; GFX6-NEXT:    s_waitcnt vmcnt(0)
2387; GFX6-NEXT:    v_max_f32_e64 v2, v2, v2 clamp
2388; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2389; GFX6-NEXT:    s_endpgm
2390;
2391; GFX8-LABEL: v_clamp_med3_bay_f32_no_dx10_clamp:
2392; GFX8:       ; %bb.0:
2393; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2394; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2395; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2396; GFX8-NEXT:    v_mov_b32_e32 v1, s3
2397; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
2398; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2399; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2400; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2401; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
2402; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2403; GFX8-NEXT:    s_waitcnt vmcnt(0)
2404; GFX8-NEXT:    v_max_f32_e64 v2, v3, v3 clamp
2405; GFX8-NEXT:    flat_store_dword v[0:1], v2
2406; GFX8-NEXT:    s_endpgm
2407;
2408; GFX9-LABEL: v_clamp_med3_bay_f32_no_dx10_clamp:
2409; GFX9:       ; %bb.0:
2410; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2411; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2412; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2413; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
2414; GFX9-NEXT:    s_waitcnt vmcnt(0)
2415; GFX9-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
2416; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
2417; GFX9-NEXT:    s_endpgm
2418;
2419; GFX11-LABEL: v_clamp_med3_bay_f32_no_dx10_clamp:
2420; GFX11:       ; %bb.0:
2421; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2422; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2423; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2424; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2425; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2426; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
2427; GFX11-NEXT:    s_waitcnt vmcnt(0)
2428; GFX11-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
2429; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
2430; GFX11-NEXT:    s_endpgm
2431;
2432; GFX12-LABEL: v_clamp_med3_bay_f32_no_dx10_clamp:
2433; GFX12:       ; %bb.0:
2434; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2435; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2436; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2437; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2438; GFX12-NEXT:    s_wait_kmcnt 0x0
2439; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
2440; GFX12-NEXT:    s_wait_loadcnt 0x0
2441; GFX12-NEXT:    v_max_num_f32_e64 v1, v1, v1 clamp
2442; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
2443; GFX12-NEXT:    s_endpgm
2444  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2445  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
2446  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
2447  %a = load float, ptr addrspace(1) %gep0
2448  %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float 0.0, float %a)
2449  store float %med, ptr addrspace(1) %out.gep
2450  ret void
2451}
2452
; fmed3 clamp with the variable as the first operand: loads aptr[tid], computes
; llvm.amdgcn.fmed3(%a, 0.0, 1.0) and stores the result to out[tid]. The "yab"
; suffix matches that operand order (y = the loaded value, a = 0.0, b = 1.0).
; Expected code: the tahiti, fiji, gfx900 and gfx1100 runs keep a v_med3_f32
; with operands (value, 0, 1.0); only the gfx1200 run replaces it with
; v_max_num_f32_e64 carrying the clamp modifier.
; NOTE(review): attribute group #2 is defined outside this excerpt - confirm its
; contents before relying on what it enables or disables.
; The check lines are autogenerated (see the note on the first line of the
; file); regenerate them with that script rather than editing by hand.
2453define amdgpu_kernel void @v_clamp_med3_yab_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #2 {
2454; GFX6-LABEL: v_clamp_med3_yab_f32_no_dx10_clamp:
2455; GFX6:       ; %bb.0:
2456; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2457; GFX6-NEXT:    s_mov_b32 s7, 0xf000
2458; GFX6-NEXT:    s_mov_b32 s6, 0
2459; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2460; GFX6-NEXT:    v_mov_b32_e32 v1, 0
2461; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2462; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
2463; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2464; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
2465; GFX6-NEXT:    s_waitcnt vmcnt(0)
2466; GFX6-NEXT:    v_med3_f32 v2, v2, 0, 1.0
2467; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2468; GFX6-NEXT:    s_endpgm
2469;
2470; GFX8-LABEL: v_clamp_med3_yab_f32_no_dx10_clamp:
2471; GFX8:       ; %bb.0:
2472; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2473; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2474; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2475; GFX8-NEXT:    v_mov_b32_e32 v1, s3
2476; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
2477; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2478; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2479; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2480; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
2481; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2482; GFX8-NEXT:    s_waitcnt vmcnt(0)
2483; GFX8-NEXT:    v_med3_f32 v2, v3, 0, 1.0
2484; GFX8-NEXT:    flat_store_dword v[0:1], v2
2485; GFX8-NEXT:    s_endpgm
2486;
2487; GFX9-LABEL: v_clamp_med3_yab_f32_no_dx10_clamp:
2488; GFX9:       ; %bb.0:
2489; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2490; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2491; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2492; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
2493; GFX9-NEXT:    s_waitcnt vmcnt(0)
2494; GFX9-NEXT:    v_med3_f32 v1, v1, 0, 1.0
2495; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
2496; GFX9-NEXT:    s_endpgm
2497;
2498; GFX11-LABEL: v_clamp_med3_yab_f32_no_dx10_clamp:
2499; GFX11:       ; %bb.0:
2500; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2501; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2502; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2503; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2504; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2505; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
2506; GFX11-NEXT:    s_waitcnt vmcnt(0)
2507; GFX11-NEXT:    v_med3_f32 v1, v1, 0, 1.0
2508; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
2509; GFX11-NEXT:    s_endpgm
2510;
2511; GFX12-LABEL: v_clamp_med3_yab_f32_no_dx10_clamp:
2512; GFX12:       ; %bb.0:
2513; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2514; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2515; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2516; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2517; GFX12-NEXT:    s_wait_kmcnt 0x0
2518; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
2519; GFX12-NEXT:    s_wait_loadcnt 0x0
2520; GFX12-NEXT:    v_max_num_f32_e64 v1, v1, v1 clamp
2521; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
2522; GFX12-NEXT:    s_endpgm
2523  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2524  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
2525  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
2526  %a = load float, ptr addrspace(1) %gep0
2527  %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 0.0, float 1.0)
2528  store float %med, ptr addrspace(1) %out.gep
2529  ret void
2530}
2531
; fmed3 clamp with the variable first and the bounds swapped: loads aptr[tid],
; computes llvm.amdgcn.fmed3(%a, 1.0, 0.0) and stores the result to out[tid].
; The "yba" suffix matches that operand order (y = the loaded value, b = 1.0,
; a = 0.0).
; Expected code: the tahiti, fiji, gfx900 and gfx1100 runs keep a v_med3_f32
; with operands (value, 1.0, 0); only the gfx1200 run replaces it with
; v_max_num_f32_e64 carrying the clamp modifier.
; NOTE(review): attribute group #2 is defined outside this excerpt - confirm its
; contents before relying on what it enables or disables.
; The check lines are autogenerated (see the note on the first line of the
; file); regenerate them with that script rather than editing by hand.
2532define amdgpu_kernel void @v_clamp_med3_yba_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #2 {
2533; GFX6-LABEL: v_clamp_med3_yba_f32_no_dx10_clamp:
2534; GFX6:       ; %bb.0:
2535; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2536; GFX6-NEXT:    s_mov_b32 s7, 0xf000
2537; GFX6-NEXT:    s_mov_b32 s6, 0
2538; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2539; GFX6-NEXT:    v_mov_b32_e32 v1, 0
2540; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2541; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
2542; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2543; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
2544; GFX6-NEXT:    s_waitcnt vmcnt(0)
2545; GFX6-NEXT:    v_med3_f32 v2, v2, 1.0, 0
2546; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2547; GFX6-NEXT:    s_endpgm
2548;
2549; GFX8-LABEL: v_clamp_med3_yba_f32_no_dx10_clamp:
2550; GFX8:       ; %bb.0:
2551; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2552; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2553; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2554; GFX8-NEXT:    v_mov_b32_e32 v1, s3
2555; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
2556; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2557; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2558; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2559; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
2560; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2561; GFX8-NEXT:    s_waitcnt vmcnt(0)
2562; GFX8-NEXT:    v_med3_f32 v2, v3, 1.0, 0
2563; GFX8-NEXT:    flat_store_dword v[0:1], v2
2564; GFX8-NEXT:    s_endpgm
2565;
2566; GFX9-LABEL: v_clamp_med3_yba_f32_no_dx10_clamp:
2567; GFX9:       ; %bb.0:
2568; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2569; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2570; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2571; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
2572; GFX9-NEXT:    s_waitcnt vmcnt(0)
2573; GFX9-NEXT:    v_med3_f32 v1, v1, 1.0, 0
2574; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
2575; GFX9-NEXT:    s_endpgm
2576;
2577; GFX11-LABEL: v_clamp_med3_yba_f32_no_dx10_clamp:
2578; GFX11:       ; %bb.0:
2579; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2580; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2581; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2582; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2583; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2584; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
2585; GFX11-NEXT:    s_waitcnt vmcnt(0)
2586; GFX11-NEXT:    v_med3_f32 v1, v1, 1.0, 0
2587; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
2588; GFX11-NEXT:    s_endpgm
2589;
2590; GFX12-LABEL: v_clamp_med3_yba_f32_no_dx10_clamp:
2591; GFX12:       ; %bb.0:
2592; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2593; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2594; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2595; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2596; GFX12-NEXT:    s_wait_kmcnt 0x0
2597; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
2598; GFX12-NEXT:    s_wait_loadcnt 0x0
2599; GFX12-NEXT:    v_max_num_f32_e64 v1, v1, v1 clamp
2600; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
2601; GFX12-NEXT:    s_endpgm
2602  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2603  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
2604  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
2605  %a = load float, ptr addrspace(1) %gep0
2606  %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 1.0, float 0.0)
2607  store float %med, ptr addrspace(1) %out.gep
2608  ret void
2609}
2610
; fmed3 clamp with the variable as the middle operand: loads aptr[tid], computes
; llvm.amdgcn.fmed3(0.0, %a, 1.0) and stores the result to out[tid]. The "ayb"
; suffix matches that operand order (a = 0.0, y = the loaded value, b = 1.0).
; Expected code: the tahiti, fiji, gfx900 and gfx1100 runs keep a v_med3_f32
; with operands (0, value, 1.0); only the gfx1200 run replaces it with
; v_max_num_f32_e64 carrying the clamp modifier.
; NOTE(review): attribute group #2 is defined outside this excerpt - confirm its
; contents before relying on what it enables or disables.
; The check lines are autogenerated (see the note on the first line of the
; file); regenerate them with that script rather than editing by hand.
2611define amdgpu_kernel void @v_clamp_med3_ayb_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #2 {
2612; GFX6-LABEL: v_clamp_med3_ayb_f32_no_dx10_clamp:
2613; GFX6:       ; %bb.0:
2614; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2615; GFX6-NEXT:    s_mov_b32 s7, 0xf000
2616; GFX6-NEXT:    s_mov_b32 s6, 0
2617; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2618; GFX6-NEXT:    v_mov_b32_e32 v1, 0
2619; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2620; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
2621; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2622; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
2623; GFX6-NEXT:    s_waitcnt vmcnt(0)
2624; GFX6-NEXT:    v_med3_f32 v2, 0, v2, 1.0
2625; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2626; GFX6-NEXT:    s_endpgm
2627;
2628; GFX8-LABEL: v_clamp_med3_ayb_f32_no_dx10_clamp:
2629; GFX8:       ; %bb.0:
2630; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2631; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2632; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2633; GFX8-NEXT:    v_mov_b32_e32 v1, s3
2634; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
2635; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2636; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2637; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2638; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
2639; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2640; GFX8-NEXT:    s_waitcnt vmcnt(0)
2641; GFX8-NEXT:    v_med3_f32 v2, 0, v3, 1.0
2642; GFX8-NEXT:    flat_store_dword v[0:1], v2
2643; GFX8-NEXT:    s_endpgm
2644;
2645; GFX9-LABEL: v_clamp_med3_ayb_f32_no_dx10_clamp:
2646; GFX9:       ; %bb.0:
2647; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2648; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2649; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2650; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
2651; GFX9-NEXT:    s_waitcnt vmcnt(0)
2652; GFX9-NEXT:    v_med3_f32 v1, 0, v1, 1.0
2653; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
2654; GFX9-NEXT:    s_endpgm
2655;
2656; GFX11-LABEL: v_clamp_med3_ayb_f32_no_dx10_clamp:
2657; GFX11:       ; %bb.0:
2658; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2659; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2660; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2661; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2662; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2663; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
2664; GFX11-NEXT:    s_waitcnt vmcnt(0)
2665; GFX11-NEXT:    v_med3_f32 v1, 0, v1, 1.0
2666; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
2667; GFX11-NEXT:    s_endpgm
2668;
2669; GFX12-LABEL: v_clamp_med3_ayb_f32_no_dx10_clamp:
2670; GFX12:       ; %bb.0:
2671; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2672; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2673; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2674; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2675; GFX12-NEXT:    s_wait_kmcnt 0x0
2676; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
2677; GFX12-NEXT:    s_wait_loadcnt 0x0
2678; GFX12-NEXT:    v_max_num_f32_e64 v1, v1, v1 clamp
2679; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
2680; GFX12-NEXT:    s_endpgm
2681  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2682  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
2683  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
2684  %a = load float, ptr addrspace(1) %gep0
2685  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float %a, float 1.0)
2686  store float %med, ptr addrspace(1) %out.gep
2687  ret void
2688}
2689
; fmed3 clamp with the variable in the middle and the bounds swapped: loads
; aptr[tid], computes llvm.amdgcn.fmed3(1.0, %a, 0.0) and stores the result to
; out[tid]. The "bya" suffix matches that operand order (b = 1.0, y = the loaded
; value, a = 0.0).
; Expected code: the tahiti, fiji, gfx900 and gfx1100 runs keep a v_med3_f32
; with operands (1.0, value, 0); only the gfx1200 run replaces it with
; v_max_num_f32_e64 carrying the clamp modifier.
; NOTE(review): attribute group #2 is defined outside this excerpt - confirm its
; contents before relying on what it enables or disables.
; The check lines are autogenerated (see the note on the first line of the
; file); regenerate them with that script rather than editing by hand.
2690define amdgpu_kernel void @v_clamp_med3_bya_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #2 {
2691; GFX6-LABEL: v_clamp_med3_bya_f32_no_dx10_clamp:
2692; GFX6:       ; %bb.0:
2693; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2694; GFX6-NEXT:    s_mov_b32 s7, 0xf000
2695; GFX6-NEXT:    s_mov_b32 s6, 0
2696; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2697; GFX6-NEXT:    v_mov_b32_e32 v1, 0
2698; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2699; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
2700; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2701; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
2702; GFX6-NEXT:    s_waitcnt vmcnt(0)
2703; GFX6-NEXT:    v_med3_f32 v2, 1.0, v2, 0
2704; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2705; GFX6-NEXT:    s_endpgm
2706;
2707; GFX8-LABEL: v_clamp_med3_bya_f32_no_dx10_clamp:
2708; GFX8:       ; %bb.0:
2709; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2710; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2711; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2712; GFX8-NEXT:    v_mov_b32_e32 v1, s3
2713; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
2714; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2715; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2716; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2717; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
2718; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2719; GFX8-NEXT:    s_waitcnt vmcnt(0)
2720; GFX8-NEXT:    v_med3_f32 v2, 1.0, v3, 0
2721; GFX8-NEXT:    flat_store_dword v[0:1], v2
2722; GFX8-NEXT:    s_endpgm
2723;
2724; GFX9-LABEL: v_clamp_med3_bya_f32_no_dx10_clamp:
2725; GFX9:       ; %bb.0:
2726; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2727; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2728; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2729; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
2730; GFX9-NEXT:    s_waitcnt vmcnt(0)
2731; GFX9-NEXT:    v_med3_f32 v1, 1.0, v1, 0
2732; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
2733; GFX9-NEXT:    s_endpgm
2734;
2735; GFX11-LABEL: v_clamp_med3_bya_f32_no_dx10_clamp:
2736; GFX11:       ; %bb.0:
2737; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2738; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2739; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2740; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2741; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2742; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
2743; GFX11-NEXT:    s_waitcnt vmcnt(0)
2744; GFX11-NEXT:    v_med3_f32 v1, 1.0, v1, 0
2745; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
2746; GFX11-NEXT:    s_endpgm
2747;
2748; GFX12-LABEL: v_clamp_med3_bya_f32_no_dx10_clamp:
2749; GFX12:       ; %bb.0:
2750; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2751; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2752; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2753; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2754; GFX12-NEXT:    s_wait_kmcnt 0x0
2755; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
2756; GFX12-NEXT:    s_wait_loadcnt 0x0
2757; GFX12-NEXT:    v_max_num_f32_e64 v1, v1, v1 clamp
2758; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
2759; GFX12-NEXT:    s_endpgm
2760  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2761  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
2762  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
2763  %a = load float, ptr addrspace(1) %gep0
2764  %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float %a, float 0.0)
2765  store float %med, ptr addrspace(1) %out.gep
2766  ret void
2767}
2768
; Constant-folded fmed3 clamp of a quiet NaN: computes
; llvm.amdgcn.fmed3(0.0, 1.0, 0x7FF8000000000000) and stores the result to
; out[tid]. There is no load; the kernel takes only the output pointer.
; Expected code: no clamp or med3 instruction is emitted on any run. The tahiti,
; fiji, gfx900 and gfx1100 runs store the constant 0x7fc00000; the gfx1200 run
; stores 0.
; NOTE(review): attribute group #2 is defined outside this excerpt - confirm its
; contents before relying on what it enables or disables.
; The check lines are autogenerated (see the note on the first line of the
; file); regenerate them with that script rather than editing by hand.
2769define amdgpu_kernel void @v_clamp_constant_qnan_f32_no_dx10_clamp(ptr addrspace(1) %out) #2 {
2770; GFX6-LABEL: v_clamp_constant_qnan_f32_no_dx10_clamp:
2771; GFX6:       ; %bb.0:
2772; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
2773; GFX6-NEXT:    s_mov_b32 s3, 0xf000
2774; GFX6-NEXT:    s_mov_b32 s2, 0
2775; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2776; GFX6-NEXT:    v_mov_b32_e32 v1, 0
2777; GFX6-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
2778; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2779; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2780; GFX6-NEXT:    s_endpgm
2781;
2782; GFX8-LABEL: v_clamp_constant_qnan_f32_no_dx10_clamp:
2783; GFX8:       ; %bb.0:
2784; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2785; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2786; GFX8-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
2787; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2788; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2789; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
2790; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2791; GFX8-NEXT:    flat_store_dword v[0:1], v2
2792; GFX8-NEXT:    s_endpgm
2793;
2794; GFX9-LABEL: v_clamp_constant_qnan_f32_no_dx10_clamp:
2795; GFX9:       ; %bb.0:
2796; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2797; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2798; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
2799; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2800; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
2801; GFX9-NEXT:    s_endpgm
2802;
2803; GFX11-LABEL: v_clamp_constant_qnan_f32_no_dx10_clamp:
2804; GFX11:       ; %bb.0:
2805; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
2806; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2807; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2808; GFX11-NEXT:    v_dual_mov_b32 v1, 0x7fc00000 :: v_dual_lshlrev_b32 v0, 2, v0
2809; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2810; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
2811; GFX11-NEXT:    s_endpgm
2812;
2813; GFX12-LABEL: v_clamp_constant_qnan_f32_no_dx10_clamp:
2814; GFX12:       ; %bb.0:
2815; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
2816; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
2817; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2818; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2819; GFX12-NEXT:    s_wait_kmcnt 0x0
2820; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
2821; GFX12-NEXT:    s_endpgm
2822  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2823  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
2824  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 0x7FF8000000000000)
2825  store float %med, ptr addrspace(1) %out.gep
2826  ret void
2827}
2828
; Constant-folded fmed3 clamp of a signaling NaN: computes
; llvm.amdgcn.fmed3(0.0, 1.0, bitcast(i32 2139095041)) and stores the result to
; out[tid]. There is no load; the kernel takes only the output pointer.
; Expected code: no clamp or med3 instruction is emitted on any run. The tahiti,
; fiji, gfx900 and gfx1100 runs store the constant 0x7f800001 (the NaN payload
; is passed through unchanged); the gfx1200 run stores 0.
; NOTE(review): attribute group #2 is defined outside this excerpt - confirm its
; contents before relying on what it enables or disables.
; The check lines are autogenerated (see the note on the first line of the
; file); regenerate them with that script rather than editing by hand.
2829define amdgpu_kernel void @v_clamp_constant_snan_f32_no_dx10_clamp(ptr addrspace(1) %out) #2 {
2830; GFX6-LABEL: v_clamp_constant_snan_f32_no_dx10_clamp:
2831; GFX6:       ; %bb.0:
2832; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
2833; GFX6-NEXT:    s_mov_b32 s3, 0xf000
2834; GFX6-NEXT:    s_mov_b32 s2, 0
2835; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2836; GFX6-NEXT:    v_mov_b32_e32 v1, 0
2837; GFX6-NEXT:    v_mov_b32_e32 v2, 0x7f800001
2838; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2839; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2840; GFX6-NEXT:    s_endpgm
2841;
2842; GFX8-LABEL: v_clamp_constant_snan_f32_no_dx10_clamp:
2843; GFX8:       ; %bb.0:
2844; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2845; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2846; GFX8-NEXT:    v_mov_b32_e32 v2, 0x7f800001
2847; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2848; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2849; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
2850; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2851; GFX8-NEXT:    flat_store_dword v[0:1], v2
2852; GFX8-NEXT:    s_endpgm
2853;
2854; GFX9-LABEL: v_clamp_constant_snan_f32_no_dx10_clamp:
2855; GFX9:       ; %bb.0:
2856; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2857; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2858; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7f800001
2859; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2860; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
2861; GFX9-NEXT:    s_endpgm
2862;
2863; GFX11-LABEL: v_clamp_constant_snan_f32_no_dx10_clamp:
2864; GFX11:       ; %bb.0:
2865; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
2866; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2867; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2868; GFX11-NEXT:    v_dual_mov_b32 v1, 0x7f800001 :: v_dual_lshlrev_b32 v0, 2, v0
2869; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2870; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
2871; GFX11-NEXT:    s_endpgm
2872;
2873; GFX12-LABEL: v_clamp_constant_snan_f32_no_dx10_clamp:
2874; GFX12:       ; %bb.0:
2875; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
2876; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
2877; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2878; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2879; GFX12-NEXT:    s_wait_kmcnt 0x0
2880; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
2881; GFX12-NEXT:    s_endpgm
2882  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2883  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
  ; i32 2139095041 is 0x7f800001: all-ones exponent with only the lowest
  ; mantissa bit set, i.e. a NaN whose quiet bit is clear.
2884  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float bitcast (i32 2139095041 to float))
2885  store float %med, ptr addrspace(1) %out.gep
2886  ret void
2887}
2888
; Packed half-precision clamp: loads a <2 x half> from aptr[tid], applies
; maxnum(x, zeroinitializer) followed by minnum(x, <1.0, 1.0>) and stores the
; result to out[tid].
; Expected code per run:
;   tahiti  - each half is converted with v_cvt_f32_f16_e64 carrying the clamp
;             modifier, converted back with v_cvt_f16_f32, and the two halves
;             are repacked with a shift and an or.
;   fiji    - the high half uses v_max_f16_sdwa with clamp, the low half uses
;             v_max_f16_e64 with clamp, and the results are or-ed together.
;   gfx900, gfx1100 - a single v_pk_max_f16 with clamp.
;   gfx1200 - a single v_pk_max_num_f16 with clamp.
; NOTE(review): attribute group #0 is defined outside this excerpt - confirm its
; contents before relying on what it enables or disables.
; The check lines are autogenerated (see the note on the first line of the
; file); regenerate them with that script rather than editing by hand.
2889define amdgpu_kernel void @v_clamp_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
2890; GFX6-LABEL: v_clamp_v2f16:
2891; GFX6:       ; %bb.0:
2892; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2893; GFX6-NEXT:    s_mov_b32 s7, 0xf000
2894; GFX6-NEXT:    s_mov_b32 s6, 0
2895; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2896; GFX6-NEXT:    v_mov_b32_e32 v1, 0
2897; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2898; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
2899; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2900; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
2901; GFX6-NEXT:    s_waitcnt vmcnt(0)
2902; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
2903; GFX6-NEXT:    v_cvt_f32_f16_e64 v3, v3 clamp
2904; GFX6-NEXT:    v_cvt_f32_f16_e64 v2, v2 clamp
2905; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
2906; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
2907; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
2908; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
2909; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2910; GFX6-NEXT:    s_endpgm
2911;
2912; GFX8-LABEL: v_clamp_v2f16:
2913; GFX8:       ; %bb.0:
2914; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2915; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2916; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2917; GFX8-NEXT:    v_mov_b32_e32 v1, s3
2918; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
2919; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2920; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2921; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2922; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
2923; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2924; GFX8-NEXT:    s_waitcnt vmcnt(0)
2925; GFX8-NEXT:    v_max_f16_sdwa v2, v3, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2926; GFX8-NEXT:    v_max_f16_e64 v3, v3, v3 clamp
2927; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
2928; GFX8-NEXT:    flat_store_dword v[0:1], v2
2929; GFX8-NEXT:    s_endpgm
2930;
2931; GFX9-LABEL: v_clamp_v2f16:
2932; GFX9:       ; %bb.0:
2933; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2934; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2935; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2936; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
2937; GFX9-NEXT:    s_waitcnt vmcnt(0)
2938; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1 clamp
2939; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
2940; GFX9-NEXT:    s_endpgm
2941;
2942; GFX11-LABEL: v_clamp_v2f16:
2943; GFX11:       ; %bb.0:
2944; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2945; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2946; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2947; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2948; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2949; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
2950; GFX11-NEXT:    s_waitcnt vmcnt(0)
2951; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1 clamp
2952; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
2953; GFX11-NEXT:    s_endpgm
2954;
2955; GFX12-LABEL: v_clamp_v2f16:
2956; GFX12:       ; %bb.0:
2957; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2958; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2959; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2960; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2961; GFX12-NEXT:    s_wait_kmcnt 0x0
2962; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
2963; GFX12-NEXT:    s_wait_loadcnt 0x0
2964; GFX12-NEXT:    v_pk_max_num_f16 v1, v1, v1 clamp
2965; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
2966; GFX12-NEXT:    s_endpgm
2967  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2968  %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid
2969  %out.gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid
2970  %a = load <2 x half>, ptr addrspace(1) %gep0
2971  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> zeroinitializer)
2972  %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
2973
2974  store <2 x half> %med, ptr addrspace(1) %out.gep
2975  ret void
2976}
2977
; Clamp of a <2 x half> in which each bound vector has one undef lane:
; maxnum against <undef, 0.0> followed by minnum against <1.0, undef>.
; Recorded expectations: gfx900, gfx1100 and gfx1200 emit a single clamped
; packed max (v_pk_max_f16, or v_pk_max_num_f16 on gfx1200); tahiti and fiji
; emit separate max/min/med3 instructions using the constants 0x7fc00000 and
; 0x7e00 (presumably the undef lanes materialized as quiet NaNs; verify
; against the lowering).
2978define amdgpu_kernel void @v_clamp_v2f16_undef_elt(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
2979; GFX6-LABEL: v_clamp_v2f16_undef_elt:
2980; GFX6:       ; %bb.0:
2981; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2982; GFX6-NEXT:    s_mov_b32 s7, 0xf000
2983; GFX6-NEXT:    s_mov_b32 s6, 0
2984; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2985; GFX6-NEXT:    v_mov_b32_e32 v1, 0
2986; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2987; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
2988; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2989; GFX6-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
2990; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
2991; GFX6-NEXT:    s_waitcnt vmcnt(0)
2992; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
2993; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
2994; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
2995; GFX6-NEXT:    v_max_f32_e32 v2, 0x7fc00000, v2
2996; GFX6-NEXT:    v_med3_f32 v3, v3, 0, v4
2997; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
2998; GFX6-NEXT:    v_min_f32_e32 v2, 1.0, v2
2999; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
3000; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
3001; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
3002; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
3003; GFX6-NEXT:    s_endpgm
3004;
3005; GFX8-LABEL: v_clamp_v2f16_undef_elt:
3006; GFX8:       ; %bb.0:
3007; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3008; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
3009; GFX8-NEXT:    v_mov_b32_e32 v4, 0x7e00
3010; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3011; GFX8-NEXT:    v_mov_b32_e32 v1, s3
3012; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
3013; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3014; GFX8-NEXT:    flat_load_dword v3, v[0:1]
3015; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
3016; GFX8-NEXT:    v_mov_b32_e32 v1, s1
3017; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3018; GFX8-NEXT:    s_waitcnt vmcnt(0)
3019; GFX8-NEXT:    v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3020; GFX8-NEXT:    v_max_f16_e32 v3, v3, v3
3021; GFX8-NEXT:    v_max_f16_e32 v2, 0, v2
3022; GFX8-NEXT:    v_max_f16_e32 v3, 0x7e00, v3
3023; GFX8-NEXT:    v_min_f16_e32 v3, 1.0, v3
3024; GFX8-NEXT:    v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3025; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
3026; GFX8-NEXT:    flat_store_dword v[0:1], v2
3027; GFX8-NEXT:    s_endpgm
3028;
3029; GFX9-LABEL: v_clamp_v2f16_undef_elt:
3030; GFX9:       ; %bb.0:
3031; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3032; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3033; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3034; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
3035; GFX9-NEXT:    s_waitcnt vmcnt(0)
3036; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1 clamp
3037; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
3038; GFX9-NEXT:    s_endpgm
3039;
3040; GFX11-LABEL: v_clamp_v2f16_undef_elt:
3041; GFX11:       ; %bb.0:
3042; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
3043; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
3044; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3045; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3046; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
3047; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
3048; GFX11-NEXT:    s_waitcnt vmcnt(0)
3049; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1 clamp
3050; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
3051; GFX11-NEXT:    s_endpgm
3052;
3053; GFX12-LABEL: v_clamp_v2f16_undef_elt:
3054; GFX12:       ; %bb.0:
3055; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
3056; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
3057; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3058; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3059; GFX12-NEXT:    s_wait_kmcnt 0x0
3060; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
3061; GFX12-NEXT:    s_wait_loadcnt 0x0
3062; GFX12-NEXT:    v_pk_max_num_f16 v1, v1, v1 clamp
3063; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
3064; GFX12-NEXT:    s_endpgm
3065  %tid = call i32 @llvm.amdgcn.workitem.id.x() ; index used for both the load and the store GEP
3066  %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid
3067  %out.gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid
3068  %a = load <2 x half>, ptr addrspace(1) %gep0
3069  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half undef, half 0.0>) ; lower bound: element 0 undef, element 1 is 0.0
3070  %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half undef>) ; upper bound: element 0 is 1.0, element 1 undef
3071
3072  store <2 x half> %med, ptr addrspace(1) %out.gep
3073  ret void
3074}
3075
; Negative test: the lower bound of element 0 is 2.0 instead of 0.0, so the
; maxnum/minnum pair is not a [0.0, 1.0] clamp for that element.
; Recorded expectations: gfx900, gfx1100 and gfx1200 emit separate packed
; max and min instructions with no clamp modifier; tahiti and fiji use the
; clamp modifier only on the instruction that handles the high half and emit
; an explicit max with 2.0 and min with 1.0 for the low half.
3076define amdgpu_kernel void @v_clamp_v2f16_not_zero(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
3077; GFX6-LABEL: v_clamp_v2f16_not_zero:
3078; GFX6:       ; %bb.0:
3079; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
3080; GFX6-NEXT:    s_mov_b32 s7, 0xf000
3081; GFX6-NEXT:    s_mov_b32 s6, 0
3082; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3083; GFX6-NEXT:    v_mov_b32_e32 v1, 0
3084; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3085; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
3086; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
3087; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
3088; GFX6-NEXT:    s_waitcnt vmcnt(0)
3089; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
3090; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
3091; GFX6-NEXT:    v_cvt_f32_f16_e64 v3, v3 clamp
3092; GFX6-NEXT:    v_max_f32_e32 v2, 2.0, v2
3093; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
3094; GFX6-NEXT:    v_min_f32_e32 v2, 1.0, v2
3095; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
3096; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
3097; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
3098; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
3099; GFX6-NEXT:    s_endpgm
3100;
3101; GFX8-LABEL: v_clamp_v2f16_not_zero:
3102; GFX8:       ; %bb.0:
3103; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3104; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
3105; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3106; GFX8-NEXT:    v_mov_b32_e32 v1, s3
3107; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
3108; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3109; GFX8-NEXT:    flat_load_dword v3, v[0:1]
3110; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
3111; GFX8-NEXT:    v_mov_b32_e32 v1, s1
3112; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3113; GFX8-NEXT:    s_waitcnt vmcnt(0)
3114; GFX8-NEXT:    v_max_f16_e32 v2, v3, v3
3115; GFX8-NEXT:    v_max_f16_e32 v2, 2.0, v2
3116; GFX8-NEXT:    v_max_f16_sdwa v3, v3, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3117; GFX8-NEXT:    v_min_f16_e32 v2, 1.0, v2
3118; GFX8-NEXT:    v_or_b32_e32 v2, v2, v3
3119; GFX8-NEXT:    flat_store_dword v[0:1], v2
3120; GFX8-NEXT:    s_endpgm
3121;
3122; GFX9-LABEL: v_clamp_v2f16_not_zero:
3123; GFX9:       ; %bb.0:
3124; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3125; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3126; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3127; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
3128; GFX9-NEXT:    s_waitcnt vmcnt(0)
3129; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1
3130; GFX9-NEXT:    v_pk_max_f16 v1, v1, 2.0
3131; GFX9-NEXT:    v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0]
3132; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
3133; GFX9-NEXT:    s_endpgm
3134;
3135; GFX11-LABEL: v_clamp_v2f16_not_zero:
3136; GFX11:       ; %bb.0:
3137; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
3138; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
3139; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
3140; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3141; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
3142; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
3143; GFX11-NEXT:    s_waitcnt vmcnt(0)
3144; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1
3145; GFX11-NEXT:    v_pk_max_f16 v1, v1, 2.0
3146; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3147; GFX11-NEXT:    v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0]
3148; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
3149; GFX11-NEXT:    s_endpgm
3150;
3151; GFX12-LABEL: v_clamp_v2f16_not_zero:
3152; GFX12:       ; %bb.0:
3153; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
3154; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
3155; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
3156; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3157; GFX12-NEXT:    s_wait_kmcnt 0x0
3158; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
3159; GFX12-NEXT:    s_wait_loadcnt 0x0
3160; GFX12-NEXT:    v_pk_max_num_f16 v1, v1, v1
3161; GFX12-NEXT:    v_pk_max_num_f16 v1, v1, 2.0
3162; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3163; GFX12-NEXT:    v_pk_min_num_f16 v1, v1, 1.0 op_sel_hi:[1,0]
3164; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
3165; GFX12-NEXT:    s_endpgm
3166  %tid = call i32 @llvm.amdgcn.workitem.id.x() ; index used for both the load and the store GEP
3167  %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid
3168  %out.gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid
3169  %a = load <2 x half>, ptr addrspace(1) %gep0
3170  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half 2.0, half 0.0>) ; element 0 lower bound is 2.0, not 0.0
3171  %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>) ; upper bound 1.0 in both elements
3172
3173  store <2 x half> %med, ptr addrspace(1) %out.gep
3174  ret void
3175}
3176
; Negative test: the upper bound of element 0 is 0.0 instead of 1.0, so the
; maxnum/minnum pair is not a [0.0, 1.0] clamp for that element.
; Recorded expectations: gfx900, gfx1100 and gfx1200 emit separate packed
; max and min instructions with no clamp modifier; tahiti uses
; v_med3_f32 v2, v2, 0, 0 for the low half and a clamped conversion for the
; high half; fiji uses explicit max/min with 0 for the low half and a clamped
; SDWA max for the high half.
3177define amdgpu_kernel void @v_clamp_v2f16_not_one(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
3178; GFX6-LABEL: v_clamp_v2f16_not_one:
3179; GFX6:       ; %bb.0:
3180; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
3181; GFX6-NEXT:    s_mov_b32 s7, 0xf000
3182; GFX6-NEXT:    s_mov_b32 s6, 0
3183; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3184; GFX6-NEXT:    v_mov_b32_e32 v1, 0
3185; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3186; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
3187; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
3188; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
3189; GFX6-NEXT:    s_waitcnt vmcnt(0)
3190; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
3191; GFX6-NEXT:    v_cvt_f32_f16_e64 v3, v3 clamp
3192; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
3193; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
3194; GFX6-NEXT:    v_med3_f32 v2, v2, 0, 0
3195; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
3196; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
3197; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
3198; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
3199; GFX6-NEXT:    s_endpgm
3200;
3201; GFX8-LABEL: v_clamp_v2f16_not_one:
3202; GFX8:       ; %bb.0:
3203; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3204; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
3205; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3206; GFX8-NEXT:    v_mov_b32_e32 v1, s3
3207; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
3208; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3209; GFX8-NEXT:    flat_load_dword v3, v[0:1]
3210; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
3211; GFX8-NEXT:    v_mov_b32_e32 v1, s1
3212; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3213; GFX8-NEXT:    s_waitcnt vmcnt(0)
3214; GFX8-NEXT:    v_max_f16_e32 v2, v3, v3
3215; GFX8-NEXT:    v_max_f16_e32 v2, 0, v2
3216; GFX8-NEXT:    v_max_f16_sdwa v3, v3, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3217; GFX8-NEXT:    v_min_f16_e32 v2, 0, v2
3218; GFX8-NEXT:    v_or_b32_e32 v2, v2, v3
3219; GFX8-NEXT:    flat_store_dword v[0:1], v2
3220; GFX8-NEXT:    s_endpgm
3221;
3222; GFX9-LABEL: v_clamp_v2f16_not_one:
3223; GFX9:       ; %bb.0:
3224; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3225; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3226; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3227; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
3228; GFX9-NEXT:    s_waitcnt vmcnt(0)
3229; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1
3230; GFX9-NEXT:    v_pk_max_f16 v1, v1, 0
3231; GFX9-NEXT:    v_pk_min_f16 v1, v1, 1.0 op_sel:[0,1] op_sel_hi:[1,0]
3232; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
3233; GFX9-NEXT:    s_endpgm
3234;
3235; GFX11-LABEL: v_clamp_v2f16_not_one:
3236; GFX11:       ; %bb.0:
3237; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
3238; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
3239; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
3240; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3241; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
3242; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
3243; GFX11-NEXT:    s_waitcnt vmcnt(0)
3244; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1
3245; GFX11-NEXT:    v_pk_max_f16 v1, v1, 0
3246; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3247; GFX11-NEXT:    v_pk_min_f16 v1, v1, 1.0 op_sel:[0,1] op_sel_hi:[1,0]
3248; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
3249; GFX11-NEXT:    s_endpgm
3250;
3251; GFX12-LABEL: v_clamp_v2f16_not_one:
3252; GFX12:       ; %bb.0:
3253; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
3254; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
3255; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
3256; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3257; GFX12-NEXT:    s_wait_kmcnt 0x0
3258; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
3259; GFX12-NEXT:    s_wait_loadcnt 0x0
3260; GFX12-NEXT:    v_pk_max_num_f16 v1, v1, v1
3261; GFX12-NEXT:    v_pk_max_num_f16 v1, v1, 0
3262; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3263; GFX12-NEXT:    v_pk_min_num_f16 v1, v1, 1.0 op_sel:[0,1] op_sel_hi:[1,0]
3264; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
3265; GFX12-NEXT:    s_endpgm
3266  %tid = call i32 @llvm.amdgcn.workitem.id.x() ; index used for both the load and the store GEP
3267  %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid
3268  %out.gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid
3269  %a = load <2 x half>, ptr addrspace(1) %gep0
3270  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half 0.0, half 0.0>) ; lower bound 0.0 in both elements
3271  %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 0.0, half 1.0>) ; element 0 upper bound is 0.0, not 1.0
3272
3273  store <2 x half> %med, ptr addrspace(1) %out.gep
3274  ret void
3275}
3276
; [0.0, 1.0] clamp applied to the negation of a whole <2 x half> vector.
; Recorded expectations: gfx900, gfx1100 and gfx1200 fold the negation into
; the clamped packed max as neg_lo:[1,1] neg_hi:[1,1]; fiji uses -v3 source
; modifiers on both clamped max instructions; tahiti first XORs the loaded
; dword with 0x80008000 (bits 15 and 31, the sign bits of the two packed
; halves) and then performs clamped conversions.
3277define amdgpu_kernel void @v_clamp_neg_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
3278; GFX6-LABEL: v_clamp_neg_v2f16:
3279; GFX6:       ; %bb.0:
3280; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
3281; GFX6-NEXT:    s_mov_b32 s7, 0xf000
3282; GFX6-NEXT:    s_mov_b32 s6, 0
3283; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3284; GFX6-NEXT:    v_mov_b32_e32 v1, 0
3285; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3286; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
3287; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
3288; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
3289; GFX6-NEXT:    s_waitcnt vmcnt(0)
3290; GFX6-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
3291; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
3292; GFX6-NEXT:    v_cvt_f32_f16_e64 v3, v3 clamp
3293; GFX6-NEXT:    v_cvt_f32_f16_e64 v2, v2 clamp
3294; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
3295; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
3296; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
3297; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
3298; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
3299; GFX6-NEXT:    s_endpgm
3300;
3301; GFX8-LABEL: v_clamp_neg_v2f16:
3302; GFX8:       ; %bb.0:
3303; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3304; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
3305; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3306; GFX8-NEXT:    v_mov_b32_e32 v1, s3
3307; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
3308; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3309; GFX8-NEXT:    flat_load_dword v3, v[0:1]
3310; GFX8-NEXT:    v_mov_b32_e32 v1, s1
3311; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
3312; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3313; GFX8-NEXT:    s_waitcnt vmcnt(0)
3314; GFX8-NEXT:    v_max_f16_sdwa v2, -v3, -v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3315; GFX8-NEXT:    v_max_f16_e64 v3, -v3, -v3 clamp
3316; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
3317; GFX8-NEXT:    flat_store_dword v[0:1], v2
3318; GFX8-NEXT:    s_endpgm
3319;
3320; GFX9-LABEL: v_clamp_neg_v2f16:
3321; GFX9:       ; %bb.0:
3322; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3323; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3324; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3325; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
3326; GFX9-NEXT:    s_waitcnt vmcnt(0)
3327; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
3328; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
3329; GFX9-NEXT:    s_endpgm
3330;
3331; GFX11-LABEL: v_clamp_neg_v2f16:
3332; GFX11:       ; %bb.0:
3333; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
3334; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
3335; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3336; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3337; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
3338; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
3339; GFX11-NEXT:    s_waitcnt vmcnt(0)
3340; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
3341; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
3342; GFX11-NEXT:    s_endpgm
3343;
3344; GFX12-LABEL: v_clamp_neg_v2f16:
3345; GFX12:       ; %bb.0:
3346; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
3347; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
3348; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3349; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3350; GFX12-NEXT:    s_wait_kmcnt 0x0
3351; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
3352; GFX12-NEXT:    s_wait_loadcnt 0x0
3353; GFX12-NEXT:    v_pk_max_num_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
3354; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
3355; GFX12-NEXT:    s_endpgm
3356  %tid = call i32 @llvm.amdgcn.workitem.id.x() ; index used for both the load and the store GEP
3357  %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid
3358  %out.gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid
3359  %a = load <2 x half>, ptr addrspace(1) %gep0
3360  %fneg.a = fneg <2 x half> %a ; negate both elements before clamping
3361  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %fneg.a, <2 x half> zeroinitializer) ; lower bound 0.0 in both elements
3362  %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>) ; upper bound 1.0 in both elements
3363
3364  store <2 x half> %med, ptr addrspace(1) %out.gep
3365  ret void
3366}
3367
; [0.0, 1.0] clamp applied to fneg(fabs(x)) of a <2 x half> vector.
; Recorded expectations: gfx900, gfx1100 and gfx1200 emit a v_and_b32 with
; 0x7fff7fff followed by a clamped packed max carrying neg_lo:[1,1]
; neg_hi:[1,1]; fiji uses -|v3| source modifiers on both clamped max
; instructions; tahiti ORs the loaded dword with 0x80008000 (bits 15 and 31,
; the sign bits of the two packed halves) before the clamped conversions.
3368define amdgpu_kernel void @v_clamp_negabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
3369; GFX6-LABEL: v_clamp_negabs_v2f16:
3370; GFX6:       ; %bb.0:
3371; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
3372; GFX6-NEXT:    s_mov_b32 s7, 0xf000
3373; GFX6-NEXT:    s_mov_b32 s6, 0
3374; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3375; GFX6-NEXT:    v_mov_b32_e32 v1, 0
3376; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3377; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
3378; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
3379; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
3380; GFX6-NEXT:    s_waitcnt vmcnt(0)
3381; GFX6-NEXT:    v_or_b32_e32 v2, 0x80008000, v2
3382; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
3383; GFX6-NEXT:    v_cvt_f32_f16_e64 v3, v3 clamp
3384; GFX6-NEXT:    v_cvt_f32_f16_e64 v2, v2 clamp
3385; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
3386; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
3387; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
3388; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
3389; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
3390; GFX6-NEXT:    s_endpgm
3391;
3392; GFX8-LABEL: v_clamp_negabs_v2f16:
3393; GFX8:       ; %bb.0:
3394; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3395; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
3396; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3397; GFX8-NEXT:    v_mov_b32_e32 v1, s3
3398; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
3399; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3400; GFX8-NEXT:    flat_load_dword v3, v[0:1]
3401; GFX8-NEXT:    v_mov_b32_e32 v1, s1
3402; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
3403; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3404; GFX8-NEXT:    s_waitcnt vmcnt(0)
3405; GFX8-NEXT:    v_max_f16_sdwa v2, -|v3|, -|v3| clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3406; GFX8-NEXT:    v_max_f16_e64 v3, -|v3|, -|v3| clamp
3407; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
3408; GFX8-NEXT:    flat_store_dword v[0:1], v2
3409; GFX8-NEXT:    s_endpgm
3410;
3411; GFX9-LABEL: v_clamp_negabs_v2f16:
3412; GFX9:       ; %bb.0:
3413; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3414; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3415; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3416; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
3417; GFX9-NEXT:    s_waitcnt vmcnt(0)
3418; GFX9-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v1
3419; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
3420; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
3421; GFX9-NEXT:    s_endpgm
3422;
3423; GFX11-LABEL: v_clamp_negabs_v2f16:
3424; GFX11:       ; %bb.0:
3425; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
3426; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
3427; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
3428; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3429; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
3430; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
3431; GFX11-NEXT:    s_waitcnt vmcnt(0)
3432; GFX11-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v1
3433; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
3434; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
3435; GFX11-NEXT:    s_endpgm
3436;
3437; GFX12-LABEL: v_clamp_negabs_v2f16:
3438; GFX12:       ; %bb.0:
3439; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
3440; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
3441; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
3442; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3443; GFX12-NEXT:    s_wait_kmcnt 0x0
3444; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
3445; GFX12-NEXT:    s_wait_loadcnt 0x0
3446; GFX12-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v1
3447; GFX12-NEXT:    v_pk_max_num_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
3448; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
3449; GFX12-NEXT:    s_endpgm
3450  %tid = call i32 @llvm.amdgcn.workitem.id.x() ; index used for both the load and the store GEP
3451  %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid
3452  %out.gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid
3453  %a = load <2 x half>, ptr addrspace(1) %gep0
3454  %fabs.a = call <2 x half> @llvm.fabs.v2f16(<2 x half> %a) ; absolute value of both elements
3455  %fneg.fabs.a = fneg <2 x half> %fabs.a ; then negate, giving -|a| per element
3456
3457  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %fneg.fabs.a, <2 x half> zeroinitializer) ; lower bound 0.0 in both elements
3458  %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>) ; upper bound 1.0 in both elements
3459
3460  store <2 x half> %med, ptr addrspace(1) %out.gep
3461  ret void
3462}
3463
; [0.0, 1.0] clamp of a <2 x half> vector in which only element 0 has been
; negated (extractelement / fneg / insertelement at index 0).
; Recorded expectations: gfx900, gfx1100 and gfx1200 emit a clamped packed
; max with neg_lo:[1,1] and no neg_hi; fiji negates only the non-SDWA (low
; half) max via -v3; tahiti applies the -v2 source modifier only to the
; clamped conversion of the low half.
3464define amdgpu_kernel void @v_clamp_neglo_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
3465; GFX6-LABEL: v_clamp_neglo_v2f16:
3466; GFX6:       ; %bb.0:
3467; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
3468; GFX6-NEXT:    s_mov_b32 s7, 0xf000
3469; GFX6-NEXT:    s_mov_b32 s6, 0
3470; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3471; GFX6-NEXT:    v_mov_b32_e32 v1, 0
3472; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3473; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
3474; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
3475; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
3476; GFX6-NEXT:    s_waitcnt vmcnt(0)
3477; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
3478; GFX6-NEXT:    v_cvt_f32_f16_e64 v3, v3 clamp
3479; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
3480; GFX6-NEXT:    v_cvt_f32_f16_e64 v2, -v2 clamp
3481; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
3482; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
3483; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
3484; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
3485; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
3486; GFX6-NEXT:    s_endpgm
3487;
3488; GFX8-LABEL: v_clamp_neglo_v2f16:
3489; GFX8:       ; %bb.0:
3490; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3491; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
3492; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3493; GFX8-NEXT:    v_mov_b32_e32 v1, s3
3494; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
3495; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3496; GFX8-NEXT:    flat_load_dword v3, v[0:1]
3497; GFX8-NEXT:    v_mov_b32_e32 v1, s1
3498; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
3499; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3500; GFX8-NEXT:    s_waitcnt vmcnt(0)
3501; GFX8-NEXT:    v_max_f16_sdwa v2, v3, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3502; GFX8-NEXT:    v_max_f16_e64 v3, -v3, -v3 clamp
3503; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
3504; GFX8-NEXT:    flat_store_dword v[0:1], v2
3505; GFX8-NEXT:    s_endpgm
3506;
3507; GFX9-LABEL: v_clamp_neglo_v2f16:
3508; GFX9:       ; %bb.0:
3509; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3510; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3511; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3512; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
3513; GFX9-NEXT:    s_waitcnt vmcnt(0)
3514; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] clamp
3515; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
3516; GFX9-NEXT:    s_endpgm
3517;
3518; GFX11-LABEL: v_clamp_neglo_v2f16:
3519; GFX11:       ; %bb.0:
3520; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
3521; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
3522; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3523; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3524; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
3525; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
3526; GFX11-NEXT:    s_waitcnt vmcnt(0)
3527; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] clamp
3528; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
3529; GFX11-NEXT:    s_endpgm
3530;
3531; GFX12-LABEL: v_clamp_neglo_v2f16:
3532; GFX12:       ; %bb.0:
3533; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
3534; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
3535; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3536; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3537; GFX12-NEXT:    s_wait_kmcnt 0x0
3538; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
3539; GFX12-NEXT:    s_wait_loadcnt 0x0
3540; GFX12-NEXT:    v_pk_max_num_f16 v1, v1, v1 neg_lo:[1,1] clamp
3541; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
3542; GFX12-NEXT:    s_endpgm
3543  %tid = call i32 @llvm.amdgcn.workitem.id.x() ; index used for both the load and the store GEP
3544  %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid
3545  %out.gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid
3546  %a = load <2 x half>, ptr addrspace(1) %gep0
3547  %lo = extractelement <2 x half> %a, i32 0 ; pull out element 0
3548  %neg.lo = fneg half %lo
3549  %neg.lo.vec = insertelement <2 x half> %a, half %neg.lo, i32 0 ; element 0 negated, element 1 unchanged
3550  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %neg.lo.vec, <2 x half> zeroinitializer) ; lower bound 0.0 in both elements
3551  %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>) ; upper bound 1.0 in both elements
3552
3553  store <2 x half> %med, ptr addrspace(1) %out.gep
3554  ret void
3555}
3556
; [0.0, 1.0] clamp of a <2 x half> vector in which only element 1 has been
; negated (extractelement / fneg / insertelement at index 1).
; Recorded expectations: gfx900, gfx1100 and gfx1200 emit a clamped packed
; max with neg_hi:[1,1] and no neg_lo; fiji negates only the SDWA (WORD_1)
; max via -v3; tahiti applies the -v3 source modifier only to the clamped
; conversion of the high half.
3557define amdgpu_kernel void @v_clamp_neghi_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
3558; GFX6-LABEL: v_clamp_neghi_v2f16:
3559; GFX6:       ; %bb.0:
3560; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
3561; GFX6-NEXT:    s_mov_b32 s7, 0xf000
3562; GFX6-NEXT:    s_mov_b32 s6, 0
3563; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3564; GFX6-NEXT:    v_mov_b32_e32 v1, 0
3565; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3566; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
3567; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
3568; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
3569; GFX6-NEXT:    s_waitcnt vmcnt(0)
3570; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
3571; GFX6-NEXT:    v_cvt_f32_f16_e64 v3, -v3 clamp
3572; GFX6-NEXT:    v_cvt_f32_f16_e64 v2, v2 clamp
3573; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
3574; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
3575; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
3576; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
3577; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
3578; GFX6-NEXT:    s_endpgm
3579;
3580; GFX8-LABEL: v_clamp_neghi_v2f16:
3581; GFX8:       ; %bb.0:
3582; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3583; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
3584; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3585; GFX8-NEXT:    v_mov_b32_e32 v1, s3
3586; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
3587; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3588; GFX8-NEXT:    flat_load_dword v3, v[0:1]
3589; GFX8-NEXT:    v_mov_b32_e32 v1, s1
3590; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
3591; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3592; GFX8-NEXT:    s_waitcnt vmcnt(0)
3593; GFX8-NEXT:    v_max_f16_sdwa v2, -v3, -v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3594; GFX8-NEXT:    v_max_f16_e64 v3, v3, v3 clamp
3595; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
3596; GFX8-NEXT:    flat_store_dword v[0:1], v2
3597; GFX8-NEXT:    s_endpgm
3598;
3599; GFX9-LABEL: v_clamp_neghi_v2f16:
3600; GFX9:       ; %bb.0:
3601; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3602; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3603; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3604; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
3605; GFX9-NEXT:    s_waitcnt vmcnt(0)
3606; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1 neg_hi:[1,1] clamp
3607; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
3608; GFX9-NEXT:    s_endpgm
3609;
3610; GFX11-LABEL: v_clamp_neghi_v2f16:
3611; GFX11:       ; %bb.0:
3612; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
3613; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
3614; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3615; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3616; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
3617; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
3618; GFX11-NEXT:    s_waitcnt vmcnt(0)
3619; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1 neg_hi:[1,1] clamp
3620; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
3621; GFX11-NEXT:    s_endpgm
3622;
3623; GFX12-LABEL: v_clamp_neghi_v2f16:
3624; GFX12:       ; %bb.0:
3625; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
3626; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
3627; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3628; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3629; GFX12-NEXT:    s_wait_kmcnt 0x0
3630; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
3631; GFX12-NEXT:    s_wait_loadcnt 0x0
3632; GFX12-NEXT:    v_pk_max_num_f16 v1, v1, v1 neg_hi:[1,1] clamp
3633; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
3634; GFX12-NEXT:    s_endpgm
3635  %tid = call i32 @llvm.amdgcn.workitem.id.x() ; index used for both the load and the store GEP
3636  %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid
3637  %out.gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid
3638  %a = load <2 x half>, ptr addrspace(1) %gep0
3639  %hi = extractelement <2 x half> %a, i32 1 ; pull out element 1
3640  %neg.hi = fneg half %hi
3641  %neg.hi.vec = insertelement <2 x half> %a, half %neg.hi, i32 1 ; element 1 negated, element 0 unchanged
3642  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %neg.hi.vec, <2 x half> zeroinitializer) ; lower bound 0.0 in both elements
3643  %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>) ; upper bound 1.0 in both elements
3644
3645  store <2 x half> %med, ptr addrspace(1) %out.gep
3646  ret void
3647}
3648
; Clamp of a lane-swapped <2 x half>: the two halves of the loaded vector are
; exchanged with a shufflevector and the result is clamped to [0.0, 1.0] through
; maxnum(x, 0.0) followed by minnum(x, 1.0).
; The expected output shows the swap absorbed into the clamping instruction:
; op_sel:[1,1] op_sel_hi:[0,0] on the packed max for GFX9/GFX11/GFX12, crossed
; SDWA word selects on GFX8, and per-element f16->f32 converts carrying the
; clamp bit on GFX6.
3649define amdgpu_kernel void @v_clamp_v2f16_shuffle(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
3650; GFX6-LABEL: v_clamp_v2f16_shuffle:
3651; GFX6:       ; %bb.0:
3652; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
3653; GFX6-NEXT:    s_mov_b32 s7, 0xf000
3654; GFX6-NEXT:    s_mov_b32 s6, 0
3655; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3656; GFX6-NEXT:    v_mov_b32_e32 v1, 0
3657; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3658; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
3659; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
3660; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
3661; GFX6-NEXT:    s_waitcnt vmcnt(0)
3662; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
3663; GFX6-NEXT:    v_cvt_f32_f16_e64 v2, v2 clamp
3664; GFX6-NEXT:    v_cvt_f32_f16_e64 v3, v3 clamp
3665; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
3666; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
3667; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
3668; GFX6-NEXT:    v_or_b32_e32 v2, v3, v2
3669; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
3670; GFX6-NEXT:    s_endpgm
3671;
3672; GFX8-LABEL: v_clamp_v2f16_shuffle:
3673; GFX8:       ; %bb.0:
3674; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3675; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
3676; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3677; GFX8-NEXT:    v_mov_b32_e32 v1, s3
3678; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
3679; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3680; GFX8-NEXT:    flat_load_dword v3, v[0:1]
3681; GFX8-NEXT:    v_mov_b32_e32 v1, s1
3682; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
3683; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3684; GFX8-NEXT:    s_waitcnt vmcnt(0)
3685; GFX8-NEXT:    v_max_f16_sdwa v2, v3, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3686; GFX8-NEXT:    v_max_f16_sdwa v3, v3, v3 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3687; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
3688; GFX8-NEXT:    flat_store_dword v[0:1], v2
3689; GFX8-NEXT:    s_endpgm
3690;
3691; GFX9-LABEL: v_clamp_v2f16_shuffle:
3692; GFX9:       ; %bb.0:
3693; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3694; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3695; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3696; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
3697; GFX9-NEXT:    s_waitcnt vmcnt(0)
3698; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1 op_sel:[1,1] op_sel_hi:[0,0] clamp
3699; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
3700; GFX9-NEXT:    s_endpgm
3701;
3702; GFX11-LABEL: v_clamp_v2f16_shuffle:
3703; GFX11:       ; %bb.0:
3704; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
3705; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
3706; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3707; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3708; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
3709; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
3710; GFX11-NEXT:    s_waitcnt vmcnt(0)
3711; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1 op_sel:[1,1] op_sel_hi:[0,0] clamp
3712; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
3713; GFX11-NEXT:    s_endpgm
3714;
3715; GFX12-LABEL: v_clamp_v2f16_shuffle:
3716; GFX12:       ; %bb.0:
3717; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
3718; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
3719; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3720; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3721; GFX12-NEXT:    s_wait_kmcnt 0x0
3722; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
3723; GFX12-NEXT:    s_wait_loadcnt 0x0
3724; GFX12-NEXT:    v_pk_max_num_f16 v1, v1, v1 op_sel:[1,1] op_sel_hi:[0,0] clamp
3725; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
3726; GFX12-NEXT:    s_endpgm
  ; Each work-item reads and writes the <2 x half> slot indexed by its x id.
3727  %tid = call i32 @llvm.amdgcn.workitem.id.x()
3728  %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid
3729  %out.gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid
3730  %a = load <2 x half>, ptr addrspace(1) %gep0
  ; Mask <1, 0> swaps the two lanes of %a and never reads the second operand,
  ; so that operand is only a placeholder; poison is used for it.
3731  %shuf = shufflevector <2 x half> %a, <2 x half> poison, <2 x i32> <i32 1, i32 0>
  ; maxnum against 0.0 then minnum against 1.0 is the clamp pattern under test.
3732  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %shuf, <2 x half> zeroinitializer)
3733  %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
3734
3735  store <2 x half> %med, ptr addrspace(1) %out.gep
3736  ret void
3737}
3738
; Clamp pattern on <2 x half> where each limit vector has one undef lane:
; maxnum against <0.0, undef> followed by minnum against <undef, 1.0>.
; The GFX9/GFX11/GFX12 expectations still contain a single packed max with the
; clamp modifier. The GFX6 and GFX8 expectations keep separate max/min
; instructions, and the undef lanes show up there as the constants 0x7fc00000
; and 0x7e00 (the f32 and f16 quiet-NaN encodings).
3739define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
3740; GFX6-LABEL: v_clamp_v2f16_undef_limit_elts0:
3741; GFX6:       ; %bb.0:
3742; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
3743; GFX6-NEXT:    s_mov_b32 s7, 0xf000
3744; GFX6-NEXT:    s_mov_b32 s6, 0
3745; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3746; GFX6-NEXT:    v_mov_b32_e32 v1, 0
3747; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3748; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
3749; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
3750; GFX6-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
3751; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
3752; GFX6-NEXT:    s_waitcnt vmcnt(0)
3753; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
3754; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
3755; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
3756; GFX6-NEXT:    v_max_f32_e32 v3, 0x7fc00000, v3
3757; GFX6-NEXT:    v_min_f32_e32 v3, 1.0, v3
3758; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
3759; GFX6-NEXT:    v_med3_f32 v2, v2, 0, v4
3760; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
3761; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
3762; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
3763; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
3764; GFX6-NEXT:    s_endpgm
3765;
3766; GFX8-LABEL: v_clamp_v2f16_undef_limit_elts0:
3767; GFX8:       ; %bb.0:
3768; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3769; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
3770; GFX8-NEXT:    v_mov_b32_e32 v4, 0x3c00
3771; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3772; GFX8-NEXT:    v_mov_b32_e32 v1, s3
3773; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
3774; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3775; GFX8-NEXT:    flat_load_dword v3, v[0:1]
3776; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
3777; GFX8-NEXT:    v_mov_b32_e32 v1, s1
3778; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3779; GFX8-NEXT:    s_waitcnt vmcnt(0)
3780; GFX8-NEXT:    v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3781; GFX8-NEXT:    v_max_f16_e32 v3, v3, v3
3782; GFX8-NEXT:    v_max_f16_e32 v2, 0x7e00, v2
3783; GFX8-NEXT:    v_max_f16_e32 v3, 0, v3
3784; GFX8-NEXT:    v_min_f16_e32 v3, 0x7e00, v3
3785; GFX8-NEXT:    v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3786; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
3787; GFX8-NEXT:    flat_store_dword v[0:1], v2
3788; GFX8-NEXT:    s_endpgm
3789;
3790; GFX9-LABEL: v_clamp_v2f16_undef_limit_elts0:
3791; GFX9:       ; %bb.0:
3792; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3793; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3794; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3795; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
3796; GFX9-NEXT:    s_waitcnt vmcnt(0)
3797; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1 clamp
3798; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
3799; GFX9-NEXT:    s_endpgm
3800;
3801; GFX11-LABEL: v_clamp_v2f16_undef_limit_elts0:
3802; GFX11:       ; %bb.0:
3803; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
3804; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
3805; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3806; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3807; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
3808; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
3809; GFX11-NEXT:    s_waitcnt vmcnt(0)
3810; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1 clamp
3811; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
3812; GFX11-NEXT:    s_endpgm
3813;
3814; GFX12-LABEL: v_clamp_v2f16_undef_limit_elts0:
3815; GFX12:       ; %bb.0:
3816; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
3817; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
3818; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3819; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3820; GFX12-NEXT:    s_wait_kmcnt 0x0
3821; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
3822; GFX12-NEXT:    s_wait_loadcnt 0x0
3823; GFX12-NEXT:    v_pk_max_num_f16 v1, v1, v1 clamp
3824; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
3825; GFX12-NEXT:    s_endpgm
  ; Each work-item reads and writes the <2 x half> slot indexed by its x id.
3826  %tid = call i32 @llvm.amdgcn.workitem.id.x()
3827  %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid
3828  %out.gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid
3829  %a = load <2 x half>, ptr addrspace(1) %gep0
  ; Lower limit is defined only in lane 0; upper limit only in lane 1.
3830  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half 0.0, half undef>)
3831  %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half undef, half 1.0>)
3832
3833  store <2 x half> %med, ptr addrspace(1) %out.gep
3834  ret void
3835}
3836
; Mirror image of the elts0 variant: maxnum against <undef, 0.0> followed by
; minnum against <1.0, undef>, so the undef lanes of the two limit vectors are
; swapped. The GFX9/GFX11/GFX12 expectations again contain a single packed max
; with the clamp modifier, while the GFX6 and GFX8 expectations keep separate
; max/min instructions that use 0x7fc00000 and 0x7e00 (the f32 and f16
; quiet-NaN encodings) for the undef lanes.
3837define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
3838; GFX6-LABEL: v_clamp_v2f16_undef_limit_elts1:
3839; GFX6:       ; %bb.0:
3840; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
3841; GFX6-NEXT:    s_mov_b32 s7, 0xf000
3842; GFX6-NEXT:    s_mov_b32 s6, 0
3843; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3844; GFX6-NEXT:    v_mov_b32_e32 v1, 0
3845; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3846; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
3847; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
3848; GFX6-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
3849; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
3850; GFX6-NEXT:    s_waitcnt vmcnt(0)
3851; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
3852; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
3853; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
3854; GFX6-NEXT:    v_max_f32_e32 v2, 0x7fc00000, v2
3855; GFX6-NEXT:    v_med3_f32 v3, v3, 0, v4
3856; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
3857; GFX6-NEXT:    v_min_f32_e32 v2, 1.0, v2
3858; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
3859; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
3860; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
3861; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
3862; GFX6-NEXT:    s_endpgm
3863;
3864; GFX8-LABEL: v_clamp_v2f16_undef_limit_elts1:
3865; GFX8:       ; %bb.0:
3866; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3867; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
3868; GFX8-NEXT:    v_mov_b32_e32 v4, 0x7e00
3869; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3870; GFX8-NEXT:    v_mov_b32_e32 v1, s3
3871; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
3872; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3873; GFX8-NEXT:    flat_load_dword v3, v[0:1]
3874; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
3875; GFX8-NEXT:    v_mov_b32_e32 v1, s1
3876; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3877; GFX8-NEXT:    s_waitcnt vmcnt(0)
3878; GFX8-NEXT:    v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3879; GFX8-NEXT:    v_max_f16_e32 v3, v3, v3
3880; GFX8-NEXT:    v_max_f16_e32 v2, 0, v2
3881; GFX8-NEXT:    v_max_f16_e32 v3, 0x7e00, v3
3882; GFX8-NEXT:    v_min_f16_e32 v3, 1.0, v3
3883; GFX8-NEXT:    v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3884; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
3885; GFX8-NEXT:    flat_store_dword v[0:1], v2
3886; GFX8-NEXT:    s_endpgm
3887;
3888; GFX9-LABEL: v_clamp_v2f16_undef_limit_elts1:
3889; GFX9:       ; %bb.0:
3890; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3891; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3892; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3893; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
3894; GFX9-NEXT:    s_waitcnt vmcnt(0)
3895; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1 clamp
3896; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
3897; GFX9-NEXT:    s_endpgm
3898;
3899; GFX11-LABEL: v_clamp_v2f16_undef_limit_elts1:
3900; GFX11:       ; %bb.0:
3901; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
3902; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
3903; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3904; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3905; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
3906; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
3907; GFX11-NEXT:    s_waitcnt vmcnt(0)
3908; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1 clamp
3909; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
3910; GFX11-NEXT:    s_endpgm
3911;
3912; GFX12-LABEL: v_clamp_v2f16_undef_limit_elts1:
3913; GFX12:       ; %bb.0:
3914; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
3915; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
3916; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3917; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3918; GFX12-NEXT:    s_wait_kmcnt 0x0
3919; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
3920; GFX12-NEXT:    s_wait_loadcnt 0x0
3921; GFX12-NEXT:    v_pk_max_num_f16 v1, v1, v1 clamp
3922; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
3923; GFX12-NEXT:    s_endpgm
  ; Each work-item reads and writes the <2 x half> slot indexed by its x id.
3924  %tid = call i32 @llvm.amdgcn.workitem.id.x()
3925  %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid
3926  %out.gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid
3927  %a = load <2 x half>, ptr addrspace(1) %gep0
  ; Lower limit is defined only in lane 1; upper limit only in lane 0.
3928  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half undef, half 0.0>)
3929  %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half undef>)
3930
3931  store <2 x half> %med, ptr addrspace(1) %out.gep
3932  ret void
3933}
3934
; Clamp whose input is itself a maxnum of two different values. Three
; consecutive floats are loaded from %aptr, the sums l0+l1 and l0+l2 are formed,
; their maxnum is clamped to [0.0, 1.0] with maxnum/minnum (all operations carry
; nsz), and the result is stored to element 3 of %out (byte offset 12).
; For GFX6/GFX8/GFX9/GFX11 the expected output puts the clamp modifier directly
; on the v_max_f32 of the two sums. For GFX12 the expected output computes the
; max with s_max_num_f32 and applies the clamp in a separate v_max_num_f32 that
; reads the same scalar register for both sources.
3935define amdgpu_kernel void @v_clamp_diff_source_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
3936; GFX6-LABEL: v_clamp_diff_source_f32:
3937; GFX6:       ; %bb.0:
3938; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
3939; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3940; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
3941; GFX6-NEXT:    s_load_dword s2, s[2:3], 0x2
3942; GFX6-NEXT:    s_mov_b32 s3, 0xf000
3943; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3944; GFX6-NEXT:    v_mov_b32_e32 v0, s5
3945; GFX6-NEXT:    v_mov_b32_e32 v1, s2
3946; GFX6-NEXT:    v_add_f32_e32 v0, s4, v0
3947; GFX6-NEXT:    v_add_f32_e32 v1, s4, v1
3948; GFX6-NEXT:    v_max_f32_e64 v0, v0, v1 clamp
3949; GFX6-NEXT:    s_mov_b32 s2, -1
3950; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:12
3951; GFX6-NEXT:    s_endpgm
3952;
3953; GFX8-LABEL: v_clamp_diff_source_f32:
3954; GFX8:       ; %bb.0:
3955; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3956; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3957; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
3958; GFX8-NEXT:    s_load_dword s2, s[2:3], 0x8
3959; GFX8-NEXT:    s_add_u32 s0, s0, 12
3960; GFX8-NEXT:    s_addc_u32 s1, s1, 0
3961; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3962; GFX8-NEXT:    v_mov_b32_e32 v0, s5
3963; GFX8-NEXT:    v_mov_b32_e32 v1, s2
3964; GFX8-NEXT:    v_add_f32_e32 v0, s4, v0
3965; GFX8-NEXT:    v_add_f32_e32 v1, s4, v1
3966; GFX8-NEXT:    v_max_f32_e64 v2, v0, v1 clamp
3967; GFX8-NEXT:    v_mov_b32_e32 v0, s0
3968; GFX8-NEXT:    v_mov_b32_e32 v1, s1
3969; GFX8-NEXT:    flat_store_dword v[0:1], v2
3970; GFX8-NEXT:    s_endpgm
3971;
3972; GFX9-LABEL: v_clamp_diff_source_f32:
3973; GFX9:       ; %bb.0:
3974; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3975; GFX9-NEXT:    v_mov_b32_e32 v0, 0
3976; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3977; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
3978; GFX9-NEXT:    s_load_dword s6, s[2:3], 0x8
3979; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3980; GFX9-NEXT:    v_mov_b32_e32 v1, s5
3981; GFX9-NEXT:    v_mov_b32_e32 v2, s6
3982; GFX9-NEXT:    v_add_f32_e32 v1, s4, v1
3983; GFX9-NEXT:    v_add_f32_e32 v2, s4, v2
3984; GFX9-NEXT:    v_max_f32_e64 v1, v1, v2 clamp
3985; GFX9-NEXT:    global_store_dword v0, v1, s[0:1] offset:12
3986; GFX9-NEXT:    s_endpgm
3987;
3988; GFX11-LABEL: v_clamp_diff_source_f32:
3989; GFX11:       ; %bb.0:
3990; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
3991; GFX11-NEXT:    v_mov_b32_e32 v2, 0
3992; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
3993; GFX11-NEXT:    s_clause 0x1
3994; GFX11-NEXT:    s_load_b64 s[4:5], s[2:3], 0x0
3995; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x8
3996; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
3997; GFX11-NEXT:    v_add_f32_e64 v0, s4, s5
3998; GFX11-NEXT:    v_add_f32_e64 v1, s4, s2
3999; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4000; GFX11-NEXT:    v_max_f32_e64 v0, v0, v1 clamp
4001; GFX11-NEXT:    global_store_b32 v2, v0, s[0:1] offset:12
4002; GFX11-NEXT:    s_endpgm
4003;
4004; GFX12-LABEL: v_clamp_diff_source_f32:
4005; GFX12:       ; %bb.0:
4006; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
4007; GFX12-NEXT:    v_mov_b32_e32 v0, 0
4008; GFX12-NEXT:    s_wait_kmcnt 0x0
4009; GFX12-NEXT:    s_load_b96 s[4:6], s[2:3], 0x0
4010; GFX12-NEXT:    s_wait_kmcnt 0x0
4011; GFX12-NEXT:    s_add_f32 s2, s4, s5
4012; GFX12-NEXT:    s_add_f32 s3, s4, s6
4013; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_3)
4014; GFX12-NEXT:    s_max_num_f32 s2, s2, s3
4015; GFX12-NEXT:    v_max_num_f32_e64 v1, s2, s2 clamp
4016; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1] offset:12
4017; GFX12-NEXT:    s_endpgm
  ; Addresses are not indexed by the work-item id in this test; they are fixed
  ; offsets from the kernel arguments.
4019  %gep1 = getelementptr float, ptr addrspace(1) %aptr, i32 1
4020  %gep2 = getelementptr float, ptr addrspace(1) %aptr, i32 2
4021  %l0 = load float, ptr addrspace(1) %aptr
4022  %l1 = load float, ptr addrspace(1) %gep1
4023  %l2 = load float, ptr addrspace(1) %gep2
4024  %a = fadd nsz float %l0, %l1
4025  %b = fadd nsz float %l0, %l2
  ; %res is a max of two distinct sources; the following maxnum/minnum pair is
  ; the clamp pattern applied on top of it.
4026  %res = call nsz float @llvm.maxnum.f32(float %a, float %b)
4027  %max = call nsz float @llvm.maxnum.f32(float %res, float 0.0)
4028  %min = call nsz float @llvm.minnum.f32(float %max, float 1.0)
4029  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 3
4030  store float %min, ptr addrspace(1) %out.gep
4031  ret void
4032}
4033
; Declarations of the intrinsics called by the test functions: the work-item id
; query, fabs/minnum/maxnum for f32, f64, f16 and <2 x half>, and the AMDGPU
; fmed3.f32 intrinsic. Every declaration is tagged with attribute group #1.
4034declare i32 @llvm.amdgcn.workitem.id.x() #1
4035declare float @llvm.fabs.f32(float) #1
4036declare float @llvm.minnum.f32(float, float) #1
4037declare float @llvm.maxnum.f32(float, float) #1
4038declare float @llvm.amdgcn.fmed3.f32(float, float, float) #1
4039declare double @llvm.fabs.f64(double) #1
4040declare double @llvm.minnum.f64(double, double) #1
4041declare double @llvm.maxnum.f64(double, double) #1
4042declare half @llvm.fabs.f16(half) #1
4043declare half @llvm.minnum.f16(half, half) #1
4044declare half @llvm.maxnum.f16(half, half) #1
4045declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #1
4046declare <2 x half> @llvm.minnum.v2f16(<2 x half>, <2 x half>) #1
4047declare <2 x half> @llvm.maxnum.v2f16(<2 x half>, <2 x half>) #1
4048
; Attribute groups referenced by the definitions and declarations in this file.
; Group #0 is nounwind with the f32 denormal mode set to preserve-sign for both
; outputs and inputs. Group #1 is the one attached to the intrinsic declarations.
; Groups #2, #3 and #4 extend #0 with an explicit "amdgpu-dx10-clamp" setting and
; "no-nans-fp-math"="false"; #3 enables dx10 clamp, #2 and #4 disable it.
; NOTE(review): #2 and #4 are textually identical - confirm against the functions
; that use them whether both are still needed.
4049attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
4050attributes #1 = { nounwind readnone }
4051attributes #2 = { nounwind "amdgpu-dx10-clamp"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-nans-fp-math"="false" }
4052attributes #3 = { nounwind "amdgpu-dx10-clamp"="true" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-nans-fp-math"="false" }
4053attributes #4 = { nounwind "amdgpu-dx10-clamp"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-nans-fp-math"="false" }
4054