1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6 %s 3; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s 4; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s 5; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s 6; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s 7 8define amdgpu_kernel void @v_clamp_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { 9; GFX6-LABEL: v_clamp_f32: 10; GFX6: ; %bb.0: 11; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 12; GFX6-NEXT: s_mov_b32 s7, 0xf000 13; GFX6-NEXT: s_mov_b32 s6, 0 14; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 15; GFX6-NEXT: v_mov_b32_e32 v1, 0 16; GFX6-NEXT: s_waitcnt lgkmcnt(0) 17; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] 18; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 19; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] 20; GFX6-NEXT: s_waitcnt vmcnt(0) 21; GFX6-NEXT: v_max_f32_e64 v2, v2, v2 clamp 22; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 23; GFX6-NEXT: s_endpgm 24; 25; GFX8-LABEL: v_clamp_f32: 26; GFX8: ; %bb.0: 27; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 28; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 29; GFX8-NEXT: s_waitcnt lgkmcnt(0) 30; GFX8-NEXT: v_mov_b32_e32 v1, s3 31; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 32; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 33; GFX8-NEXT: flat_load_dword v3, v[0:1] 34; GFX8-NEXT: v_mov_b32_e32 v1, s1 35; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 36; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 37; GFX8-NEXT: s_waitcnt vmcnt(0) 38; GFX8-NEXT: v_max_f32_e64 v2, v3, v3 clamp 39; GFX8-NEXT: flat_store_dword v[0:1], v2 40; GFX8-NEXT: s_endpgm 41; 42; GFX9-LABEL: v_clamp_f32: 43; GFX9: ; %bb.0: 44; GFX9-NEXT: s_load_dwordx4 
s[0:3], s[4:5], 0x24 45; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 46; GFX9-NEXT: s_waitcnt lgkmcnt(0) 47; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 48; GFX9-NEXT: s_waitcnt vmcnt(0) 49; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp 50; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 51; GFX9-NEXT: s_endpgm 52; 53; GFX11-LABEL: v_clamp_f32: 54; GFX11: ; %bb.0: 55; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 56; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 57; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 58; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 59; GFX11-NEXT: s_waitcnt lgkmcnt(0) 60; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 61; GFX11-NEXT: s_waitcnt vmcnt(0) 62; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp 63; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 64; GFX11-NEXT: s_endpgm 65; 66; GFX12-LABEL: v_clamp_f32: 67; GFX12: ; %bb.0: 68; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 69; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 70; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 71; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 72; GFX12-NEXT: s_wait_kmcnt 0x0 73; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] 74; GFX12-NEXT: s_wait_loadcnt 0x0 75; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp 76; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] 77; GFX12-NEXT: s_endpgm 78 %tid = call i32 @llvm.amdgcn.workitem.id.x() 79 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid 80 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid 81 %a = load float, ptr addrspace(1) %gep0 82 %max = call float @llvm.maxnum.f32(float %a, float 0.0) 83 %med = call float @llvm.minnum.f32(float %max, float 1.0) 84 85 store float %med, ptr addrspace(1) %out.gep 86 ret void 87} 88 89define amdgpu_kernel void @v_clamp_neg_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { 90; GFX6-LABEL: v_clamp_neg_f32: 91; GFX6: ; %bb.0: 92; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 93; GFX6-NEXT: s_mov_b32 s7, 0xf000 94; GFX6-NEXT: s_mov_b32 s6, 0 95; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 96; GFX6-NEXT: 
v_mov_b32_e32 v1, 0 97; GFX6-NEXT: s_waitcnt lgkmcnt(0) 98; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] 99; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 100; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] 101; GFX6-NEXT: s_waitcnt vmcnt(0) 102; GFX6-NEXT: v_max_f32_e64 v2, -v2, -v2 clamp 103; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 104; GFX6-NEXT: s_endpgm 105; 106; GFX8-LABEL: v_clamp_neg_f32: 107; GFX8: ; %bb.0: 108; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 109; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 110; GFX8-NEXT: s_waitcnt lgkmcnt(0) 111; GFX8-NEXT: v_mov_b32_e32 v1, s3 112; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 113; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 114; GFX8-NEXT: flat_load_dword v3, v[0:1] 115; GFX8-NEXT: v_mov_b32_e32 v1, s1 116; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 117; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 118; GFX8-NEXT: s_waitcnt vmcnt(0) 119; GFX8-NEXT: v_max_f32_e64 v2, -v3, -v3 clamp 120; GFX8-NEXT: flat_store_dword v[0:1], v2 121; GFX8-NEXT: s_endpgm 122; 123; GFX9-LABEL: v_clamp_neg_f32: 124; GFX9: ; %bb.0: 125; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 126; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 127; GFX9-NEXT: s_waitcnt lgkmcnt(0) 128; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 129; GFX9-NEXT: s_waitcnt vmcnt(0) 130; GFX9-NEXT: v_max_f32_e64 v1, -v1, -v1 clamp 131; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 132; GFX9-NEXT: s_endpgm 133; 134; GFX11-LABEL: v_clamp_neg_f32: 135; GFX11: ; %bb.0: 136; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 137; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 138; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 139; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 140; GFX11-NEXT: s_waitcnt lgkmcnt(0) 141; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 142; GFX11-NEXT: s_waitcnt vmcnt(0) 143; GFX11-NEXT: v_max_f32_e64 v1, -v1, -v1 clamp 144; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 145; GFX11-NEXT: s_endpgm 146; 147; GFX12-LABEL: v_clamp_neg_f32: 148; GFX12: ; %bb.0: 149; 
GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 150; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 151; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 152; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 153; GFX12-NEXT: s_wait_kmcnt 0x0 154; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] 155; GFX12-NEXT: s_wait_loadcnt 0x0 156; GFX12-NEXT: v_max_num_f32_e64 v1, -v1, -v1 clamp 157; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] 158; GFX12-NEXT: s_endpgm 159 %tid = call i32 @llvm.amdgcn.workitem.id.x() 160 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid 161 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid 162 %a = load float, ptr addrspace(1) %gep0 163 %fneg.a = fneg float %a 164 %max = call float @llvm.maxnum.f32(float %fneg.a, float 0.0) 165 %med = call float @llvm.minnum.f32(float %max, float 1.0) 166 167 store float %med, ptr addrspace(1) %out.gep 168 ret void 169} 170 171define amdgpu_kernel void @v_clamp_negabs_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { 172; GFX6-LABEL: v_clamp_negabs_f32: 173; GFX6: ; %bb.0: 174; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 175; GFX6-NEXT: s_mov_b32 s7, 0xf000 176; GFX6-NEXT: s_mov_b32 s6, 0 177; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 178; GFX6-NEXT: v_mov_b32_e32 v1, 0 179; GFX6-NEXT: s_waitcnt lgkmcnt(0) 180; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] 181; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 182; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] 183; GFX6-NEXT: s_waitcnt vmcnt(0) 184; GFX6-NEXT: v_max_f32_e64 v2, -|v2|, -|v2| clamp 185; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 186; GFX6-NEXT: s_endpgm 187; 188; GFX8-LABEL: v_clamp_negabs_f32: 189; GFX8: ; %bb.0: 190; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 191; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 192; GFX8-NEXT: s_waitcnt lgkmcnt(0) 193; GFX8-NEXT: v_mov_b32_e32 v1, s3 194; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 195; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 196; GFX8-NEXT: flat_load_dword v3, v[0:1] 197; GFX8-NEXT: 
v_mov_b32_e32 v1, s1 198; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 199; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 200; GFX8-NEXT: s_waitcnt vmcnt(0) 201; GFX8-NEXT: v_max_f32_e64 v2, -|v3|, -|v3| clamp 202; GFX8-NEXT: flat_store_dword v[0:1], v2 203; GFX8-NEXT: s_endpgm 204; 205; GFX9-LABEL: v_clamp_negabs_f32: 206; GFX9: ; %bb.0: 207; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 208; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 209; GFX9-NEXT: s_waitcnt lgkmcnt(0) 210; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 211; GFX9-NEXT: s_waitcnt vmcnt(0) 212; GFX9-NEXT: v_max_f32_e64 v1, -|v1|, -|v1| clamp 213; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 214; GFX9-NEXT: s_endpgm 215; 216; GFX11-LABEL: v_clamp_negabs_f32: 217; GFX11: ; %bb.0: 218; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 219; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 220; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 221; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 222; GFX11-NEXT: s_waitcnt lgkmcnt(0) 223; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 224; GFX11-NEXT: s_waitcnt vmcnt(0) 225; GFX11-NEXT: v_max_f32_e64 v1, -|v1|, -|v1| clamp 226; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 227; GFX11-NEXT: s_endpgm 228; 229; GFX12-LABEL: v_clamp_negabs_f32: 230; GFX12: ; %bb.0: 231; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 232; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 233; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 234; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 235; GFX12-NEXT: s_wait_kmcnt 0x0 236; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] 237; GFX12-NEXT: s_wait_loadcnt 0x0 238; GFX12-NEXT: v_max_num_f32_e64 v1, -|v1|, -|v1| clamp 239; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] 240; GFX12-NEXT: s_endpgm 241 %tid = call i32 @llvm.amdgcn.workitem.id.x() 242 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid 243 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid 244 %a = load float, ptr addrspace(1) %gep0 245 %fabs.a = call float @llvm.fabs.f32(float %a) 246 %fneg.fabs.a = fneg float 
%fabs.a 247 248 %max = call float @llvm.maxnum.f32(float %fneg.fabs.a, float 0.0) 249 %med = call float @llvm.minnum.f32(float %max, float 1.0) 250 251 store float %med, ptr addrspace(1) %out.gep 252 ret void 253} 254 255define amdgpu_kernel void @v_clamp_negzero_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { 256; GFX6-LABEL: v_clamp_negzero_f32: 257; GFX6: ; %bb.0: 258; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 259; GFX6-NEXT: s_mov_b32 s7, 0xf000 260; GFX6-NEXT: s_mov_b32 s6, 0 261; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 262; GFX6-NEXT: v_mov_b32_e32 v1, 0 263; GFX6-NEXT: s_waitcnt lgkmcnt(0) 264; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] 265; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 266; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] 267; GFX6-NEXT: s_waitcnt vmcnt(0) 268; GFX6-NEXT: v_add_f32_e32 v2, 0.5, v2 269; GFX6-NEXT: v_max_f32_e32 v2, 0x80000000, v2 270; GFX6-NEXT: v_min_f32_e32 v2, 1.0, v2 271; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 272; GFX6-NEXT: s_endpgm 273; 274; GFX8-LABEL: v_clamp_negzero_f32: 275; GFX8: ; %bb.0: 276; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 277; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 278; GFX8-NEXT: s_waitcnt lgkmcnt(0) 279; GFX8-NEXT: v_mov_b32_e32 v1, s3 280; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 281; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 282; GFX8-NEXT: flat_load_dword v3, v[0:1] 283; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 284; GFX8-NEXT: v_mov_b32_e32 v1, s1 285; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 286; GFX8-NEXT: s_waitcnt vmcnt(0) 287; GFX8-NEXT: v_add_f32_e32 v2, 0.5, v3 288; GFX8-NEXT: v_max_f32_e32 v2, 0x80000000, v2 289; GFX8-NEXT: v_min_f32_e32 v2, 1.0, v2 290; GFX8-NEXT: flat_store_dword v[0:1], v2 291; GFX8-NEXT: s_endpgm 292; 293; GFX9-LABEL: v_clamp_negzero_f32: 294; GFX9: ; %bb.0: 295; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 296; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 297; GFX9-NEXT: s_waitcnt lgkmcnt(0) 298; GFX9-NEXT: global_load_dword v1, v0, 
s[2:3] 299; GFX9-NEXT: s_waitcnt vmcnt(0) 300; GFX9-NEXT: v_add_f32_e32 v1, 0.5, v1 301; GFX9-NEXT: v_max_f32_e32 v1, 0x80000000, v1 302; GFX9-NEXT: v_min_f32_e32 v1, 1.0, v1 303; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 304; GFX9-NEXT: s_endpgm 305; 306; GFX11-LABEL: v_clamp_negzero_f32: 307; GFX11: ; %bb.0: 308; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 309; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 310; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) 311; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 312; GFX11-NEXT: s_waitcnt lgkmcnt(0) 313; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 314; GFX11-NEXT: s_waitcnt vmcnt(0) 315; GFX11-NEXT: v_add_f32_e32 v1, 0.5, v1 316; GFX11-NEXT: v_maxmin_f32 v1, v1, 0x80000000, 1.0 317; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 318; GFX11-NEXT: s_endpgm 319; 320; GFX12-LABEL: v_clamp_negzero_f32: 321; GFX12: ; %bb.0: 322; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 323; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 324; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) 325; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 326; GFX12-NEXT: s_wait_kmcnt 0x0 327; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] 328; GFX12-NEXT: s_wait_loadcnt 0x0 329; GFX12-NEXT: v_add_f32_e32 v1, 0.5, v1 330; GFX12-NEXT: v_maxmin_num_f32 v1, v1, 0x80000000, 1.0 331; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] 332; GFX12-NEXT: s_endpgm 333 %tid = call i32 @llvm.amdgcn.workitem.id.x() 334 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid 335 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid 336 %a = load float, ptr addrspace(1) %gep0 337 %add = fadd nnan float %a, 0.5 338 %max = call float @llvm.maxnum.f32(float %add, float -0.0) 339 %med = call float @llvm.minnum.f32(float %max, float 1.0) 340 341 store float %med, ptr addrspace(1) %out.gep 342 ret void 343} 344 345; FIXME: -0.0 is treated inconsistently: 
it is accepted when clamp is 346; matched through med3, but not when clamp is matched directly. Is this correct? 347define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { 348; GFX6-LABEL: v_clamp_negzero_maybe_snan_f32: 349; GFX6: ; %bb.0: 350; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 351; GFX6-NEXT: s_mov_b32 s7, 0xf000 352; GFX6-NEXT: s_mov_b32 s6, 0 353; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 354; GFX6-NEXT: v_mov_b32_e32 v1, 0 355; GFX6-NEXT: s_waitcnt lgkmcnt(0) 356; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] 357; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 358; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] 359; GFX6-NEXT: s_waitcnt vmcnt(0) 360; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 361; GFX6-NEXT: v_max_f32_e32 v2, 0x80000000, v2 362; GFX6-NEXT: v_min_f32_e32 v2, 1.0, v2 363; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 364; GFX6-NEXT: s_endpgm 365; 366; GFX8-LABEL: v_clamp_negzero_maybe_snan_f32: 367; GFX8: ; %bb.0: 368; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 369; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 370; GFX8-NEXT: s_waitcnt lgkmcnt(0) 371; GFX8-NEXT: v_mov_b32_e32 v1, s3 372; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 373; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 374; GFX8-NEXT: flat_load_dword v3, v[0:1] 375; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 376; GFX8-NEXT: v_mov_b32_e32 v1, s1 377; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 378; GFX8-NEXT: s_waitcnt vmcnt(0) 379; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 380; GFX8-NEXT: v_max_f32_e32 v2, 0x80000000, v2 381; GFX8-NEXT: v_min_f32_e32 v2, 1.0, v2 382; GFX8-NEXT: flat_store_dword v[0:1], v2 383; GFX8-NEXT: s_endpgm 384; 385; GFX9-LABEL: v_clamp_negzero_maybe_snan_f32: 386; GFX9: ; %bb.0: 387; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 388; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 389; GFX9-NEXT: s_waitcnt lgkmcnt(0) 390; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 391; GFX9-NEXT: s_waitcnt vmcnt(0) 392; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 393; 
GFX9-NEXT: v_max_f32_e32 v1, 0x80000000, v1 394; GFX9-NEXT: v_min_f32_e32 v1, 1.0, v1 395; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 396; GFX9-NEXT: s_endpgm 397; 398; GFX11-LABEL: v_clamp_negzero_maybe_snan_f32: 399; GFX11: ; %bb.0: 400; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 401; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 402; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) 403; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 404; GFX11-NEXT: s_waitcnt lgkmcnt(0) 405; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 406; GFX11-NEXT: s_waitcnt vmcnt(0) 407; GFX11-NEXT: v_max_f32_e32 v1, v1, v1 408; GFX11-NEXT: v_maxmin_f32 v1, v1, 0x80000000, 1.0 409; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 410; GFX11-NEXT: s_endpgm 411; 412; GFX12-LABEL: v_clamp_negzero_maybe_snan_f32: 413; GFX12: ; %bb.0: 414; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 415; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 416; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) 417; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 418; GFX12-NEXT: s_wait_kmcnt 0x0 419; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] 420; GFX12-NEXT: s_wait_loadcnt 0x0 421; GFX12-NEXT: v_max_num_f32_e32 v1, v1, v1 422; GFX12-NEXT: v_maxmin_num_f32 v1, v1, 0x80000000, 1.0 423; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] 424; GFX12-NEXT: s_endpgm 425 %tid = call i32 @llvm.amdgcn.workitem.id.x() 426 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid 427 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid 428 %a = load float, ptr addrspace(1) %gep0 429 %max = call float @llvm.maxnum.f32(float %a, float -0.0) 430 %med = call float @llvm.minnum.f32(float %max, float 1.0) 431 432 store float %med, ptr addrspace(1) %out.gep 433 ret void 434} 435 436define amdgpu_kernel void @v_clamp_multi_use_max_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { 437; GFX6-LABEL: v_clamp_multi_use_max_f32: 438; GFX6: ; %bb.0: 439; GFX6-NEXT: s_load_dwordx4 s[0:3], 
s[4:5], 0x9 440; GFX6-NEXT: s_mov_b32 s6, 0 441; GFX6-NEXT: s_mov_b32 s7, 0xf000 442; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 443; GFX6-NEXT: v_mov_b32_e32 v1, 0 444; GFX6-NEXT: s_waitcnt lgkmcnt(0) 445; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] 446; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 447; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] 448; GFX6-NEXT: s_mov_b32 s6, -1 449; GFX6-NEXT: s_waitcnt vmcnt(0) 450; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 451; GFX6-NEXT: v_max_f32_e32 v2, 0, v2 452; GFX6-NEXT: v_min_f32_e32 v3, 1.0, v2 453; GFX6-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64 454; GFX6-NEXT: buffer_store_dword v2, off, s[4:7], 0 455; GFX6-NEXT: s_waitcnt vmcnt(0) 456; GFX6-NEXT: s_endpgm 457; 458; GFX8-LABEL: v_clamp_multi_use_max_f32: 459; GFX8: ; %bb.0: 460; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 461; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 462; GFX8-NEXT: s_waitcnt lgkmcnt(0) 463; GFX8-NEXT: v_mov_b32_e32 v1, s3 464; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 465; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 466; GFX8-NEXT: flat_load_dword v3, v[0:1] 467; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 468; GFX8-NEXT: v_mov_b32_e32 v1, s1 469; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 470; GFX8-NEXT: s_waitcnt vmcnt(0) 471; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 472; GFX8-NEXT: v_max_f32_e32 v2, 0, v2 473; GFX8-NEXT: v_min_f32_e32 v3, 1.0, v2 474; GFX8-NEXT: flat_store_dword v[0:1], v3 475; GFX8-NEXT: flat_store_dword v[0:1], v2 476; GFX8-NEXT: s_waitcnt vmcnt(0) 477; GFX8-NEXT: s_endpgm 478; 479; GFX9-LABEL: v_clamp_multi_use_max_f32: 480; GFX9: ; %bb.0: 481; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 482; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 483; GFX9-NEXT: s_waitcnt lgkmcnt(0) 484; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 485; GFX9-NEXT: s_waitcnt vmcnt(0) 486; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 487; GFX9-NEXT: v_max_f32_e32 v1, 0, v1 488; GFX9-NEXT: v_min_f32_e32 v2, 1.0, v1 489; GFX9-NEXT: global_store_dword v0, v2, s[0:1] 490; 
GFX9-NEXT: global_store_dword v[0:1], v1, off 491; GFX9-NEXT: s_waitcnt vmcnt(0) 492; GFX9-NEXT: s_endpgm 493; 494; GFX11-LABEL: v_clamp_multi_use_max_f32: 495; GFX11: ; %bb.0: 496; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 497; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 498; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) 499; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 500; GFX11-NEXT: s_waitcnt lgkmcnt(0) 501; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 502; GFX11-NEXT: s_waitcnt vmcnt(0) 503; GFX11-NEXT: v_max_f32_e32 v1, v1, v1 504; GFX11-NEXT: v_max_f32_e32 v1, 0, v1 505; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 506; GFX11-NEXT: v_min_f32_e32 v2, 1.0, v1 507; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] 508; GFX11-NEXT: global_store_b32 v[0:1], v1, off dlc 509; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 510; GFX11-NEXT: s_endpgm 511; 512; GFX12-LABEL: v_clamp_multi_use_max_f32: 513; GFX12: ; %bb.0: 514; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 515; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 516; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) 517; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 518; GFX12-NEXT: s_wait_kmcnt 0x0 519; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] 520; GFX12-NEXT: s_wait_loadcnt 0x0 521; GFX12-NEXT: v_max_num_f32_e32 v1, v1, v1 522; GFX12-NEXT: v_max_num_f32_e32 v1, 0, v1 523; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 524; GFX12-NEXT: v_min_num_f32_e32 v2, 1.0, v1 525; GFX12-NEXT: global_store_b32 v0, v2, s[0:1] 526; GFX12-NEXT: s_wait_storecnt 0x0 527; GFX12-NEXT: global_store_b32 v[0:1], v1, off scope:SCOPE_SYS 528; GFX12-NEXT: s_wait_storecnt 0x0 529; GFX12-NEXT: s_endpgm 530 %tid = call i32 @llvm.amdgcn.workitem.id.x() 531 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid 532 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid 533 %a = load float, ptr addrspace(1) %gep0 534 %max = call float @llvm.maxnum.f32(float %a, float 0.0) 535 %med = call 
float @llvm.minnum.f32(float %max, float 1.0) 536 537 store float %med, ptr addrspace(1) %out.gep 538 store volatile float %max, ptr addrspace(1) undef 539 ret void 540} 541 542define amdgpu_kernel void @v_clamp_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { 543; GFX6-LABEL: v_clamp_f16: 544; GFX6: ; %bb.0: 545; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 546; GFX6-NEXT: s_mov_b32 s7, 0xf000 547; GFX6-NEXT: s_mov_b32 s6, 0 548; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 549; GFX6-NEXT: v_mov_b32_e32 v1, 0 550; GFX6-NEXT: s_waitcnt lgkmcnt(0) 551; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] 552; GFX6-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 553; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] 554; GFX6-NEXT: s_waitcnt vmcnt(0) 555; GFX6-NEXT: v_cvt_f32_f16_e64 v2, v2 clamp 556; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 557; GFX6-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 558; GFX6-NEXT: s_endpgm 559; 560; GFX8-LABEL: v_clamp_f16: 561; GFX8: ; %bb.0: 562; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 563; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 564; GFX8-NEXT: s_waitcnt lgkmcnt(0) 565; GFX8-NEXT: v_mov_b32_e32 v1, s3 566; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 567; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 568; GFX8-NEXT: flat_load_ushort v3, v[0:1] 569; GFX8-NEXT: v_mov_b32_e32 v1, s1 570; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 571; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 572; GFX8-NEXT: s_waitcnt vmcnt(0) 573; GFX8-NEXT: v_max_f16_e64 v2, v3, v3 clamp 574; GFX8-NEXT: flat_store_short v[0:1], v2 575; GFX8-NEXT: s_endpgm 576; 577; GFX9-LABEL: v_clamp_f16: 578; GFX9: ; %bb.0: 579; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 580; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 581; GFX9-NEXT: s_waitcnt lgkmcnt(0) 582; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] 583; GFX9-NEXT: s_waitcnt vmcnt(0) 584; GFX9-NEXT: v_max_f16_e64 v1, v1, v1 clamp 585; GFX9-NEXT: global_store_short v0, v1, s[0:1] 586; GFX9-NEXT: s_endpgm 587; 588; GFX11-LABEL: v_clamp_f16: 589; 
GFX11: ; %bb.0: 590; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 591; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 592; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 593; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 594; GFX11-NEXT: s_waitcnt lgkmcnt(0) 595; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] 596; GFX11-NEXT: s_waitcnt vmcnt(0) 597; GFX11-NEXT: v_max_f16_e64 v1, v1, v1 clamp 598; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] 599; GFX11-NEXT: s_endpgm 600; 601; GFX12-LABEL: v_clamp_f16: 602; GFX12: ; %bb.0: 603; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 604; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 605; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 606; GFX12-NEXT: v_lshlrev_b32_e32 v0, 1, v0 607; GFX12-NEXT: s_wait_kmcnt 0x0 608; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] 609; GFX12-NEXT: s_wait_loadcnt 0x0 610; GFX12-NEXT: v_max_num_f16_e64 v1, v1, v1 clamp 611; GFX12-NEXT: global_store_b16 v0, v1, s[0:1] 612; GFX12-NEXT: s_endpgm 613 %tid = call i32 @llvm.amdgcn.workitem.id.x() 614 %gep0 = getelementptr half, ptr addrspace(1) %aptr, i32 %tid 615 %out.gep = getelementptr half, ptr addrspace(1) %out, i32 %tid 616 %a = load half, ptr addrspace(1) %gep0 617 %max = call half @llvm.maxnum.f16(half %a, half 0.0) 618 %med = call half @llvm.minnum.f16(half %max, half 1.0) 619 620 store half %med, ptr addrspace(1) %out.gep 621 ret void 622} 623 624define amdgpu_kernel void @v_clamp_neg_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { 625; GFX6-LABEL: v_clamp_neg_f16: 626; GFX6: ; %bb.0: 627; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 628; GFX6-NEXT: s_mov_b32 s7, 0xf000 629; GFX6-NEXT: s_mov_b32 s6, 0 630; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 631; GFX6-NEXT: v_mov_b32_e32 v1, 0 632; GFX6-NEXT: s_waitcnt lgkmcnt(0) 633; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] 634; GFX6-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 635; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] 636; GFX6-NEXT: s_waitcnt vmcnt(0) 637; GFX6-NEXT: v_cvt_f32_f16_e64 v2, -v2 clamp 638; GFX6-NEXT: 
v_cvt_f16_f32_e32 v2, v2 639; GFX6-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 640; GFX6-NEXT: s_endpgm 641; 642; GFX8-LABEL: v_clamp_neg_f16: 643; GFX8: ; %bb.0: 644; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 645; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 646; GFX8-NEXT: s_waitcnt lgkmcnt(0) 647; GFX8-NEXT: v_mov_b32_e32 v1, s3 648; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 649; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 650; GFX8-NEXT: flat_load_ushort v3, v[0:1] 651; GFX8-NEXT: v_mov_b32_e32 v1, s1 652; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 653; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 654; GFX8-NEXT: s_waitcnt vmcnt(0) 655; GFX8-NEXT: v_max_f16_e64 v2, -v3, -v3 clamp 656; GFX8-NEXT: flat_store_short v[0:1], v2 657; GFX8-NEXT: s_endpgm 658; 659; GFX9-LABEL: v_clamp_neg_f16: 660; GFX9: ; %bb.0: 661; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 662; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 663; GFX9-NEXT: s_waitcnt lgkmcnt(0) 664; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] 665; GFX9-NEXT: s_waitcnt vmcnt(0) 666; GFX9-NEXT: v_max_f16_e64 v1, -v1, -v1 clamp 667; GFX9-NEXT: global_store_short v0, v1, s[0:1] 668; GFX9-NEXT: s_endpgm 669; 670; GFX11-LABEL: v_clamp_neg_f16: 671; GFX11: ; %bb.0: 672; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 673; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 674; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 675; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 676; GFX11-NEXT: s_waitcnt lgkmcnt(0) 677; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] 678; GFX11-NEXT: s_waitcnt vmcnt(0) 679; GFX11-NEXT: v_max_f16_e64 v1, -v1, -v1 clamp 680; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] 681; GFX11-NEXT: s_endpgm 682; 683; GFX12-LABEL: v_clamp_neg_f16: 684; GFX12: ; %bb.0: 685; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 686; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 687; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 688; GFX12-NEXT: v_lshlrev_b32_e32 v0, 1, v0 689; GFX12-NEXT: s_wait_kmcnt 0x0 690; GFX12-NEXT: global_load_u16 v1, v0, 
s[2:3] 691; GFX12-NEXT: s_wait_loadcnt 0x0 692; GFX12-NEXT: v_max_num_f16_e64 v1, -v1, -v1 clamp 693; GFX12-NEXT: global_store_b16 v0, v1, s[0:1] 694; GFX12-NEXT: s_endpgm 695 %tid = call i32 @llvm.amdgcn.workitem.id.x() 696 %gep0 = getelementptr half, ptr addrspace(1) %aptr, i32 %tid 697 %out.gep = getelementptr half, ptr addrspace(1) %out, i32 %tid 698 %a = load half, ptr addrspace(1) %gep0 699 %fneg.a = fneg half %a 700 %max = call half @llvm.maxnum.f16(half %fneg.a, half 0.0) 701 %med = call half @llvm.minnum.f16(half %max, half 1.0) 702 703 store half %med, ptr addrspace(1) %out.gep 704 ret void 705} 706 707define amdgpu_kernel void @v_clamp_negabs_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { 708; GFX6-LABEL: v_clamp_negabs_f16: 709; GFX6: ; %bb.0: 710; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 711; GFX6-NEXT: s_mov_b32 s7, 0xf000 712; GFX6-NEXT: s_mov_b32 s6, 0 713; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 714; GFX6-NEXT: v_mov_b32_e32 v1, 0 715; GFX6-NEXT: s_waitcnt lgkmcnt(0) 716; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] 717; GFX6-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 718; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] 719; GFX6-NEXT: s_waitcnt vmcnt(0) 720; GFX6-NEXT: v_cvt_f32_f16_e64 v2, -|v2| clamp 721; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 722; GFX6-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 723; GFX6-NEXT: s_endpgm 724; 725; GFX8-LABEL: v_clamp_negabs_f16: 726; GFX8: ; %bb.0: 727; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 728; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 729; GFX8-NEXT: s_waitcnt lgkmcnt(0) 730; GFX8-NEXT: v_mov_b32_e32 v1, s3 731; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 732; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 733; GFX8-NEXT: flat_load_ushort v3, v[0:1] 734; GFX8-NEXT: v_mov_b32_e32 v1, s1 735; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 736; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 737; GFX8-NEXT: s_waitcnt vmcnt(0) 738; GFX8-NEXT: v_max_f16_e64 v2, -|v3|, -|v3| clamp 739; GFX8-NEXT: 
flat_store_short v[0:1], v2 740; GFX8-NEXT: s_endpgm 741; 742; GFX9-LABEL: v_clamp_negabs_f16: 743; GFX9: ; %bb.0: 744; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 745; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 746; GFX9-NEXT: s_waitcnt lgkmcnt(0) 747; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] 748; GFX9-NEXT: s_waitcnt vmcnt(0) 749; GFX9-NEXT: v_max_f16_e64 v1, -|v1|, -|v1| clamp 750; GFX9-NEXT: global_store_short v0, v1, s[0:1] 751; GFX9-NEXT: s_endpgm 752; 753; GFX11-LABEL: v_clamp_negabs_f16: 754; GFX11: ; %bb.0: 755; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 756; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 757; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 758; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 759; GFX11-NEXT: s_waitcnt lgkmcnt(0) 760; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] 761; GFX11-NEXT: s_waitcnt vmcnt(0) 762; GFX11-NEXT: v_max_f16_e64 v1, -|v1|, -|v1| clamp 763; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] 764; GFX11-NEXT: s_endpgm 765; 766; GFX12-LABEL: v_clamp_negabs_f16: 767; GFX12: ; %bb.0: 768; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 769; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 770; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 771; GFX12-NEXT: v_lshlrev_b32_e32 v0, 1, v0 772; GFX12-NEXT: s_wait_kmcnt 0x0 773; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] 774; GFX12-NEXT: s_wait_loadcnt 0x0 775; GFX12-NEXT: v_max_num_f16_e64 v1, -|v1|, -|v1| clamp 776; GFX12-NEXT: global_store_b16 v0, v1, s[0:1] 777; GFX12-NEXT: s_endpgm 778 %tid = call i32 @llvm.amdgcn.workitem.id.x() 779 %gep0 = getelementptr half, ptr addrspace(1) %aptr, i32 %tid 780 %out.gep = getelementptr half, ptr addrspace(1) %out, i32 %tid 781 %a = load half, ptr addrspace(1) %gep0 782 %fabs.a = call half @llvm.fabs.f16(half %a) 783 %fneg.fabs.a = fneg half %fabs.a 784 785 %max = call half @llvm.maxnum.f16(half %fneg.fabs.a, half 0.0) 786 %med = call half @llvm.minnum.f16(half %max, half 1.0) 787 788 store half %med, ptr addrspace(1) %out.gep 789 ret void 790} 791 792define 
amdgpu_kernel void @v_clamp_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { 793; GFX6-LABEL: v_clamp_f64: 794; GFX6: ; %bb.0: 795; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 796; GFX6-NEXT: s_mov_b32 s7, 0xf000 797; GFX6-NEXT: s_mov_b32 s6, 0 798; GFX6-NEXT: v_lshlrev_b32_e32 v0, 3, v0 799; GFX6-NEXT: v_mov_b32_e32 v1, 0 800; GFX6-NEXT: s_waitcnt lgkmcnt(0) 801; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] 802; GFX6-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 803; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] 804; GFX6-NEXT: s_waitcnt vmcnt(0) 805; GFX6-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] clamp 806; GFX6-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 807; GFX6-NEXT: s_endpgm 808; 809; GFX8-LABEL: v_clamp_f64: 810; GFX8: ; %bb.0: 811; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 812; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 813; GFX8-NEXT: s_waitcnt lgkmcnt(0) 814; GFX8-NEXT: v_mov_b32_e32 v1, s3 815; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 816; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 817; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 818; GFX8-NEXT: v_mov_b32_e32 v3, s1 819; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2 820; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 821; GFX8-NEXT: s_waitcnt vmcnt(0) 822; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] clamp 823; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 824; GFX8-NEXT: s_endpgm 825; 826; GFX9-LABEL: v_clamp_f64: 827; GFX9: ; %bb.0: 828; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 829; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 830; GFX9-NEXT: s_waitcnt lgkmcnt(0) 831; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] 832; GFX9-NEXT: s_waitcnt vmcnt(0) 833; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] clamp 834; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 835; GFX9-NEXT: s_endpgm 836; 837; GFX11-LABEL: v_clamp_f64: 838; GFX11: ; %bb.0: 839; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 840; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 841; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 842; 
GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 843; GFX11-NEXT: s_waitcnt lgkmcnt(0) 844; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] 845; GFX11-NEXT: s_waitcnt vmcnt(0) 846; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] clamp 847; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] 848; GFX11-NEXT: s_endpgm 849; 850; GFX12-LABEL: v_clamp_f64: 851; GFX12: ; %bb.0: 852; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 853; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 854; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 855; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0 856; GFX12-NEXT: s_wait_kmcnt 0x0 857; GFX12-NEXT: global_load_b64 v[0:1], v2, s[2:3] 858; GFX12-NEXT: s_wait_loadcnt 0x0 859; GFX12-NEXT: v_max_num_f64_e64 v[0:1], v[0:1], v[0:1] clamp 860; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] 861; GFX12-NEXT: s_endpgm 862 %tid = call i32 @llvm.amdgcn.workitem.id.x() 863 %gep0 = getelementptr double, ptr addrspace(1) %aptr, i32 %tid 864 %out.gep = getelementptr double, ptr addrspace(1) %out, i32 %tid 865 %a = load double, ptr addrspace(1) %gep0 866 %max = call double @llvm.maxnum.f64(double %a, double 0.0) 867 %med = call double @llvm.minnum.f64(double %max, double 1.0) 868 869 store double %med, ptr addrspace(1) %out.gep 870 ret void 871} 872 873define amdgpu_kernel void @v_clamp_neg_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { 874; GFX6-LABEL: v_clamp_neg_f64: 875; GFX6: ; %bb.0: 876; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 877; GFX6-NEXT: s_mov_b32 s7, 0xf000 878; GFX6-NEXT: s_mov_b32 s6, 0 879; GFX6-NEXT: v_lshlrev_b32_e32 v0, 3, v0 880; GFX6-NEXT: v_mov_b32_e32 v1, 0 881; GFX6-NEXT: s_waitcnt lgkmcnt(0) 882; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] 883; GFX6-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 884; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] 885; GFX6-NEXT: s_waitcnt vmcnt(0) 886; GFX6-NEXT: v_max_f64 v[2:3], -v[2:3], -v[2:3] clamp 887; GFX6-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 888; GFX6-NEXT: s_endpgm 889; 890; GFX8-LABEL: 
v_clamp_neg_f64: 891; GFX8: ; %bb.0: 892; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 893; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 894; GFX8-NEXT: s_waitcnt lgkmcnt(0) 895; GFX8-NEXT: v_mov_b32_e32 v1, s3 896; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 897; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 898; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 899; GFX8-NEXT: v_mov_b32_e32 v3, s1 900; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2 901; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 902; GFX8-NEXT: s_waitcnt vmcnt(0) 903; GFX8-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1] clamp 904; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 905; GFX8-NEXT: s_endpgm 906; 907; GFX9-LABEL: v_clamp_neg_f64: 908; GFX9: ; %bb.0: 909; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 910; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 911; GFX9-NEXT: s_waitcnt lgkmcnt(0) 912; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] 913; GFX9-NEXT: s_waitcnt vmcnt(0) 914; GFX9-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1] clamp 915; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 916; GFX9-NEXT: s_endpgm 917; 918; GFX11-LABEL: v_clamp_neg_f64: 919; GFX11: ; %bb.0: 920; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 921; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 922; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 923; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 924; GFX11-NEXT: s_waitcnt lgkmcnt(0) 925; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] 926; GFX11-NEXT: s_waitcnt vmcnt(0) 927; GFX11-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1] clamp 928; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] 929; GFX11-NEXT: s_endpgm 930; 931; GFX12-LABEL: v_clamp_neg_f64: 932; GFX12: ; %bb.0: 933; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 934; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 935; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 936; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0 937; GFX12-NEXT: s_wait_kmcnt 0x0 938; GFX12-NEXT: global_load_b64 v[0:1], v2, s[2:3] 939; GFX12-NEXT: s_wait_loadcnt 0x0 940; GFX12-NEXT: v_max_num_f64_e64 
v[0:1], -v[0:1], -v[0:1] clamp 941; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] 942; GFX12-NEXT: s_endpgm 943 %tid = call i32 @llvm.amdgcn.workitem.id.x() 944 %gep0 = getelementptr double, ptr addrspace(1) %aptr, i32 %tid 945 %out.gep = getelementptr double, ptr addrspace(1) %out, i32 %tid 946 %a = load double, ptr addrspace(1) %gep0 947 %fneg.a = fneg double %a 948 %max = call double @llvm.maxnum.f64(double %fneg.a, double 0.0) 949 %med = call double @llvm.minnum.f64(double %max, double 1.0) 950 951 store double %med, ptr addrspace(1) %out.gep 952 ret void 953} 954 955define amdgpu_kernel void @v_clamp_negabs_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { 956; GFX6-LABEL: v_clamp_negabs_f64: 957; GFX6: ; %bb.0: 958; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 959; GFX6-NEXT: s_mov_b32 s7, 0xf000 960; GFX6-NEXT: s_mov_b32 s6, 0 961; GFX6-NEXT: v_lshlrev_b32_e32 v0, 3, v0 962; GFX6-NEXT: v_mov_b32_e32 v1, 0 963; GFX6-NEXT: s_waitcnt lgkmcnt(0) 964; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] 965; GFX6-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 966; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] 967; GFX6-NEXT: s_waitcnt vmcnt(0) 968; GFX6-NEXT: v_max_f64 v[2:3], -|v[2:3]|, -|v[2:3]| clamp 969; GFX6-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 970; GFX6-NEXT: s_endpgm 971; 972; GFX8-LABEL: v_clamp_negabs_f64: 973; GFX8: ; %bb.0: 974; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 975; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 976; GFX8-NEXT: s_waitcnt lgkmcnt(0) 977; GFX8-NEXT: v_mov_b32_e32 v1, s3 978; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 979; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 980; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 981; GFX8-NEXT: v_mov_b32_e32 v3, s1 982; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2 983; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 984; GFX8-NEXT: s_waitcnt vmcnt(0) 985; GFX8-NEXT: v_max_f64 v[0:1], -|v[0:1]|, -|v[0:1]| clamp 986; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 987; GFX8-NEXT: s_endpgm 
988; 989; GFX9-LABEL: v_clamp_negabs_f64: 990; GFX9: ; %bb.0: 991; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 992; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 993; GFX9-NEXT: s_waitcnt lgkmcnt(0) 994; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] 995; GFX9-NEXT: s_waitcnt vmcnt(0) 996; GFX9-NEXT: v_max_f64 v[0:1], -|v[0:1]|, -|v[0:1]| clamp 997; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 998; GFX9-NEXT: s_endpgm 999; 1000; GFX11-LABEL: v_clamp_negabs_f64: 1001; GFX11: ; %bb.0: 1002; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1003; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1004; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1005; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1006; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1007; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] 1008; GFX11-NEXT: s_waitcnt vmcnt(0) 1009; GFX11-NEXT: v_max_f64 v[0:1], -|v[0:1]|, -|v[0:1]| clamp 1010; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] 1011; GFX11-NEXT: s_endpgm 1012; 1013; GFX12-LABEL: v_clamp_negabs_f64: 1014; GFX12: ; %bb.0: 1015; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1016; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1017; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 1018; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1019; GFX12-NEXT: s_wait_kmcnt 0x0 1020; GFX12-NEXT: global_load_b64 v[0:1], v2, s[2:3] 1021; GFX12-NEXT: s_wait_loadcnt 0x0 1022; GFX12-NEXT: v_max_num_f64_e64 v[0:1], -|v[0:1]|, -|v[0:1]| clamp 1023; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] 1024; GFX12-NEXT: s_endpgm 1025 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1026 %gep0 = getelementptr double, ptr addrspace(1) %aptr, i32 %tid 1027 %out.gep = getelementptr double, ptr addrspace(1) %out, i32 %tid 1028 %a = load double, ptr addrspace(1) %gep0 1029 %fabs.a = call double @llvm.fabs.f64(double %a) 1030 %fneg.fabs.a = fneg double %fabs.a 1031 1032 %max = call double @llvm.maxnum.f64(double %fneg.fabs.a, double 0.0) 1033 %med = call double @llvm.minnum.f64(double %max, double 1.0) 1034 1035 store double %med, 
ptr addrspace(1) %out.gep 1036 ret void 1037} 1038 1039define amdgpu_kernel void @v_clamp_med3_aby_negzero_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { 1040; GFX6-LABEL: v_clamp_med3_aby_negzero_f32: 1041; GFX6: ; %bb.0: 1042; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1043; GFX6-NEXT: s_mov_b32 s7, 0xf000 1044; GFX6-NEXT: s_mov_b32 s6, 0 1045; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1046; GFX6-NEXT: v_mov_b32_e32 v1, 0 1047; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1048; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] 1049; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 1050; GFX6-NEXT: s_brev_b32 s4, 1 1051; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] 1052; GFX6-NEXT: s_waitcnt vmcnt(0) 1053; GFX6-NEXT: v_med3_f32 v2, s4, 1.0, v2 1054; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 1055; GFX6-NEXT: s_endpgm 1056; 1057; GFX8-LABEL: v_clamp_med3_aby_negzero_f32: 1058; GFX8: ; %bb.0: 1059; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1060; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1061; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1062; GFX8-NEXT: v_mov_b32_e32 v1, s3 1063; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1064; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1065; GFX8-NEXT: flat_load_dword v3, v[0:1] 1066; GFX8-NEXT: v_mov_b32_e32 v1, s1 1067; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 1068; GFX8-NEXT: s_brev_b32 s0, 1 1069; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1070; GFX8-NEXT: s_waitcnt vmcnt(0) 1071; GFX8-NEXT: v_med3_f32 v2, s0, 1.0, v3 1072; GFX8-NEXT: flat_store_dword v[0:1], v2 1073; GFX8-NEXT: s_endpgm 1074; 1075; GFX9-LABEL: v_clamp_med3_aby_negzero_f32: 1076; GFX9: ; %bb.0: 1077; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1078; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1079; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1080; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 1081; GFX9-NEXT: s_brev_b32 s2, 1 1082; GFX9-NEXT: s_waitcnt vmcnt(0) 1083; GFX9-NEXT: v_med3_f32 v1, s2, 1.0, v1 1084; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1085; GFX9-NEXT: s_endpgm 1086; 
1087; GFX11-LABEL: v_clamp_med3_aby_negzero_f32: 1088; GFX11: ; %bb.0: 1089; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1090; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1091; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1092; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1093; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1094; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 1095; GFX11-NEXT: s_waitcnt vmcnt(0) 1096; GFX11-NEXT: v_med3_f32 v1, 0x80000000, 1.0, v1 1097; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1098; GFX11-NEXT: s_endpgm 1099; 1100; GFX12-LABEL: v_clamp_med3_aby_negzero_f32: 1101; GFX12: ; %bb.0: 1102; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1103; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1104; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 1105; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1106; GFX12-NEXT: s_wait_kmcnt 0x0 1107; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] 1108; GFX12-NEXT: s_wait_loadcnt 0x0 1109; GFX12-NEXT: v_med3_num_f32 v1, 0x80000000, 1.0, v1 1110; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] 1111; GFX12-NEXT: s_endpgm 1112 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1113 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid 1114 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid 1115 %a = load float, ptr addrspace(1) %gep0 1116 %med = call float @llvm.amdgcn.fmed3.f32(float -0.0, float 1.0, float %a) 1117 store float %med, ptr addrspace(1) %out.gep 1118 ret void 1119} 1120 1121define amdgpu_kernel void @v_clamp_med3_aby_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { 1122; GFX6-LABEL: v_clamp_med3_aby_f32: 1123; GFX6: ; %bb.0: 1124; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1125; GFX6-NEXT: s_mov_b32 s7, 0xf000 1126; GFX6-NEXT: s_mov_b32 s6, 0 1127; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1128; GFX6-NEXT: v_mov_b32_e32 v1, 0 1129; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1130; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] 1131; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 1132; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] 1133; 
GFX6-NEXT: s_waitcnt vmcnt(0) 1134; GFX6-NEXT: v_max_f32_e64 v2, v2, v2 clamp 1135; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 1136; GFX6-NEXT: s_endpgm 1137; 1138; GFX8-LABEL: v_clamp_med3_aby_f32: 1139; GFX8: ; %bb.0: 1140; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1141; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1142; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1143; GFX8-NEXT: v_mov_b32_e32 v1, s3 1144; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1145; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1146; GFX8-NEXT: flat_load_dword v3, v[0:1] 1147; GFX8-NEXT: v_mov_b32_e32 v1, s1 1148; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 1149; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1150; GFX8-NEXT: s_waitcnt vmcnt(0) 1151; GFX8-NEXT: v_max_f32_e64 v2, v3, v3 clamp 1152; GFX8-NEXT: flat_store_dword v[0:1], v2 1153; GFX8-NEXT: s_endpgm 1154; 1155; GFX9-LABEL: v_clamp_med3_aby_f32: 1156; GFX9: ; %bb.0: 1157; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1158; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1159; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1160; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 1161; GFX9-NEXT: s_waitcnt vmcnt(0) 1162; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp 1163; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1164; GFX9-NEXT: s_endpgm 1165; 1166; GFX11-LABEL: v_clamp_med3_aby_f32: 1167; GFX11: ; %bb.0: 1168; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1169; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1170; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1171; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1172; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1173; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 1174; GFX11-NEXT: s_waitcnt vmcnt(0) 1175; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp 1176; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1177; GFX11-NEXT: s_endpgm 1178; 1179; GFX12-LABEL: v_clamp_med3_aby_f32: 1180; GFX12: ; %bb.0: 1181; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1182; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1183; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 1184; 
GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1185; GFX12-NEXT: s_wait_kmcnt 0x0 1186; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] 1187; GFX12-NEXT: s_wait_loadcnt 0x0 1188; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp 1189; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] 1190; GFX12-NEXT: s_endpgm 1191 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1192 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid 1193 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid 1194 %a = load float, ptr addrspace(1) %gep0 1195 %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float %a) 1196 store float %med, ptr addrspace(1) %out.gep 1197 ret void 1198} 1199 1200define amdgpu_kernel void @v_clamp_med3_bay_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { 1201; GFX6-LABEL: v_clamp_med3_bay_f32: 1202; GFX6: ; %bb.0: 1203; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1204; GFX6-NEXT: s_mov_b32 s7, 0xf000 1205; GFX6-NEXT: s_mov_b32 s6, 0 1206; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1207; GFX6-NEXT: v_mov_b32_e32 v1, 0 1208; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1209; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] 1210; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 1211; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] 1212; GFX6-NEXT: s_waitcnt vmcnt(0) 1213; GFX6-NEXT: v_max_f32_e64 v2, v2, v2 clamp 1214; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 1215; GFX6-NEXT: s_endpgm 1216; 1217; GFX8-LABEL: v_clamp_med3_bay_f32: 1218; GFX8: ; %bb.0: 1219; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1220; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1221; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1222; GFX8-NEXT: v_mov_b32_e32 v1, s3 1223; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1224; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1225; GFX8-NEXT: flat_load_dword v3, v[0:1] 1226; GFX8-NEXT: v_mov_b32_e32 v1, s1 1227; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 1228; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1229; GFX8-NEXT: s_waitcnt vmcnt(0) 1230; GFX8-NEXT: v_max_f32_e64 v2, 
v3, v3 clamp 1231; GFX8-NEXT: flat_store_dword v[0:1], v2 1232; GFX8-NEXT: s_endpgm 1233; 1234; GFX9-LABEL: v_clamp_med3_bay_f32: 1235; GFX9: ; %bb.0: 1236; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1237; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1238; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1239; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 1240; GFX9-NEXT: s_waitcnt vmcnt(0) 1241; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp 1242; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1243; GFX9-NEXT: s_endpgm 1244; 1245; GFX11-LABEL: v_clamp_med3_bay_f32: 1246; GFX11: ; %bb.0: 1247; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1248; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1249; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1250; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1251; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1252; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 1253; GFX11-NEXT: s_waitcnt vmcnt(0) 1254; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp 1255; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1256; GFX11-NEXT: s_endpgm 1257; 1258; GFX12-LABEL: v_clamp_med3_bay_f32: 1259; GFX12: ; %bb.0: 1260; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1261; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1262; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 1263; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1264; GFX12-NEXT: s_wait_kmcnt 0x0 1265; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] 1266; GFX12-NEXT: s_wait_loadcnt 0x0 1267; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp 1268; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] 1269; GFX12-NEXT: s_endpgm 1270 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1271 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid 1272 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid 1273 %a = load float, ptr addrspace(1) %gep0 1274 %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float 0.0, float %a) 1275 store float %med, ptr addrspace(1) %out.gep 1276 ret void 1277} 1278 1279define amdgpu_kernel void @v_clamp_med3_yab_f32(ptr addrspace(1) %out, ptr addrspace(1) 
%aptr) #0 { 1280; GFX6-LABEL: v_clamp_med3_yab_f32: 1281; GFX6: ; %bb.0: 1282; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1283; GFX6-NEXT: s_mov_b32 s7, 0xf000 1284; GFX6-NEXT: s_mov_b32 s6, 0 1285; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1286; GFX6-NEXT: v_mov_b32_e32 v1, 0 1287; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1288; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] 1289; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 1290; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] 1291; GFX6-NEXT: s_waitcnt vmcnt(0) 1292; GFX6-NEXT: v_max_f32_e64 v2, v2, v2 clamp 1293; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 1294; GFX6-NEXT: s_endpgm 1295; 1296; GFX8-LABEL: v_clamp_med3_yab_f32: 1297; GFX8: ; %bb.0: 1298; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1299; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1300; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1301; GFX8-NEXT: v_mov_b32_e32 v1, s3 1302; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1303; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1304; GFX8-NEXT: flat_load_dword v3, v[0:1] 1305; GFX8-NEXT: v_mov_b32_e32 v1, s1 1306; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 1307; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1308; GFX8-NEXT: s_waitcnt vmcnt(0) 1309; GFX8-NEXT: v_max_f32_e64 v2, v3, v3 clamp 1310; GFX8-NEXT: flat_store_dword v[0:1], v2 1311; GFX8-NEXT: s_endpgm 1312; 1313; GFX9-LABEL: v_clamp_med3_yab_f32: 1314; GFX9: ; %bb.0: 1315; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1316; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1317; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1318; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 1319; GFX9-NEXT: s_waitcnt vmcnt(0) 1320; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp 1321; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1322; GFX9-NEXT: s_endpgm 1323; 1324; GFX11-LABEL: v_clamp_med3_yab_f32: 1325; GFX11: ; %bb.0: 1326; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1327; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1328; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1329; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1330; 
GFX11-NEXT: s_waitcnt lgkmcnt(0) 1331; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 1332; GFX11-NEXT: s_waitcnt vmcnt(0) 1333; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp 1334; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1335; GFX11-NEXT: s_endpgm 1336; 1337; GFX12-LABEL: v_clamp_med3_yab_f32: 1338; GFX12: ; %bb.0: 1339; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1340; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1341; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 1342; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1343; GFX12-NEXT: s_wait_kmcnt 0x0 1344; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] 1345; GFX12-NEXT: s_wait_loadcnt 0x0 1346; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp 1347; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] 1348; GFX12-NEXT: s_endpgm 1349 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1350 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid 1351 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid 1352 %a = load float, ptr addrspace(1) %gep0 1353 %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 0.0, float 1.0) 1354 store float %med, ptr addrspace(1) %out.gep 1355 ret void 1356} 1357 1358define amdgpu_kernel void @v_clamp_med3_yba_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { 1359; GFX6-LABEL: v_clamp_med3_yba_f32: 1360; GFX6: ; %bb.0: 1361; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1362; GFX6-NEXT: s_mov_b32 s7, 0xf000 1363; GFX6-NEXT: s_mov_b32 s6, 0 1364; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1365; GFX6-NEXT: v_mov_b32_e32 v1, 0 1366; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1367; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] 1368; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 1369; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] 1370; GFX6-NEXT: s_waitcnt vmcnt(0) 1371; GFX6-NEXT: v_max_f32_e64 v2, v2, v2 clamp 1372; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 1373; GFX6-NEXT: s_endpgm 1374; 1375; GFX8-LABEL: v_clamp_med3_yba_f32: 1376; GFX8: ; %bb.0: 1377; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1378; 
GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1379; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1380; GFX8-NEXT: v_mov_b32_e32 v1, s3 1381; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1382; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1383; GFX8-NEXT: flat_load_dword v3, v[0:1] 1384; GFX8-NEXT: v_mov_b32_e32 v1, s1 1385; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 1386; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1387; GFX8-NEXT: s_waitcnt vmcnt(0) 1388; GFX8-NEXT: v_max_f32_e64 v2, v3, v3 clamp 1389; GFX8-NEXT: flat_store_dword v[0:1], v2 1390; GFX8-NEXT: s_endpgm 1391; 1392; GFX9-LABEL: v_clamp_med3_yba_f32: 1393; GFX9: ; %bb.0: 1394; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1395; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1396; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1397; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 1398; GFX9-NEXT: s_waitcnt vmcnt(0) 1399; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp 1400; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1401; GFX9-NEXT: s_endpgm 1402; 1403; GFX11-LABEL: v_clamp_med3_yba_f32: 1404; GFX11: ; %bb.0: 1405; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1406; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1407; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1408; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1409; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1410; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 1411; GFX11-NEXT: s_waitcnt vmcnt(0) 1412; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp 1413; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1414; GFX11-NEXT: s_endpgm 1415; 1416; GFX12-LABEL: v_clamp_med3_yba_f32: 1417; GFX12: ; %bb.0: 1418; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1419; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1420; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 1421; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1422; GFX12-NEXT: s_wait_kmcnt 0x0 1423; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] 1424; GFX12-NEXT: s_wait_loadcnt 0x0 1425; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp 1426; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] 1427; GFX12-NEXT: s_endpgm 1428 %tid 
= call i32 @llvm.amdgcn.workitem.id.x() 1429 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid 1430 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid 1431 %a = load float, ptr addrspace(1) %gep0 1432 %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 1.0, float 0.0) 1433 store float %med, ptr addrspace(1) %out.gep 1434 ret void 1435} 1436 1437define amdgpu_kernel void @v_clamp_med3_ayb_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { 1438; GFX6-LABEL: v_clamp_med3_ayb_f32: 1439; GFX6: ; %bb.0: 1440; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1441; GFX6-NEXT: s_mov_b32 s7, 0xf000 1442; GFX6-NEXT: s_mov_b32 s6, 0 1443; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1444; GFX6-NEXT: v_mov_b32_e32 v1, 0 1445; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1446; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] 1447; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 1448; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] 1449; GFX6-NEXT: s_waitcnt vmcnt(0) 1450; GFX6-NEXT: v_max_f32_e64 v2, v2, v2 clamp 1451; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 1452; GFX6-NEXT: s_endpgm 1453; 1454; GFX8-LABEL: v_clamp_med3_ayb_f32: 1455; GFX8: ; %bb.0: 1456; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1457; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1458; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1459; GFX8-NEXT: v_mov_b32_e32 v1, s3 1460; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1461; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1462; GFX8-NEXT: flat_load_dword v3, v[0:1] 1463; GFX8-NEXT: v_mov_b32_e32 v1, s1 1464; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 1465; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1466; GFX8-NEXT: s_waitcnt vmcnt(0) 1467; GFX8-NEXT: v_max_f32_e64 v2, v3, v3 clamp 1468; GFX8-NEXT: flat_store_dword v[0:1], v2 1469; GFX8-NEXT: s_endpgm 1470; 1471; GFX9-LABEL: v_clamp_med3_ayb_f32: 1472; GFX9: ; %bb.0: 1473; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1474; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1475; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1476; GFX9-NEXT: 
global_load_dword v1, v0, s[2:3] 1477; GFX9-NEXT: s_waitcnt vmcnt(0) 1478; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp 1479; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1480; GFX9-NEXT: s_endpgm 1481; 1482; GFX11-LABEL: v_clamp_med3_ayb_f32: 1483; GFX11: ; %bb.0: 1484; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1485; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1486; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1487; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1488; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1489; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 1490; GFX11-NEXT: s_waitcnt vmcnt(0) 1491; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp 1492; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1493; GFX11-NEXT: s_endpgm 1494; 1495; GFX12-LABEL: v_clamp_med3_ayb_f32: 1496; GFX12: ; %bb.0: 1497; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1498; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1499; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 1500; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1501; GFX12-NEXT: s_wait_kmcnt 0x0 1502; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] 1503; GFX12-NEXT: s_wait_loadcnt 0x0 1504; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp 1505; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] 1506; GFX12-NEXT: s_endpgm 1507 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1508 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid 1509 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid 1510 %a = load float, ptr addrspace(1) %gep0 1511 %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float %a, float 1.0) 1512 store float %med, ptr addrspace(1) %out.gep 1513 ret void 1514} 1515 1516define amdgpu_kernel void @v_clamp_med3_bya_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { 1517; GFX6-LABEL: v_clamp_med3_bya_f32: 1518; GFX6: ; %bb.0: 1519; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1520; GFX6-NEXT: s_mov_b32 s7, 0xf000 1521; GFX6-NEXT: s_mov_b32 s6, 0 1522; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1523; GFX6-NEXT: v_mov_b32_e32 v1, 0 1524; GFX6-NEXT: s_waitcnt 
lgkmcnt(0) 1525; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] 1526; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 1527; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] 1528; GFX6-NEXT: s_waitcnt vmcnt(0) 1529; GFX6-NEXT: v_max_f32_e64 v2, v2, v2 clamp 1530; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 1531; GFX6-NEXT: s_endpgm 1532; 1533; GFX8-LABEL: v_clamp_med3_bya_f32: 1534; GFX8: ; %bb.0: 1535; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1536; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1537; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1538; GFX8-NEXT: v_mov_b32_e32 v1, s3 1539; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1540; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1541; GFX8-NEXT: flat_load_dword v3, v[0:1] 1542; GFX8-NEXT: v_mov_b32_e32 v1, s1 1543; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 1544; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1545; GFX8-NEXT: s_waitcnt vmcnt(0) 1546; GFX8-NEXT: v_max_f32_e64 v2, v3, v3 clamp 1547; GFX8-NEXT: flat_store_dword v[0:1], v2 1548; GFX8-NEXT: s_endpgm 1549; 1550; GFX9-LABEL: v_clamp_med3_bya_f32: 1551; GFX9: ; %bb.0: 1552; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1553; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1554; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1555; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 1556; GFX9-NEXT: s_waitcnt vmcnt(0) 1557; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp 1558; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1559; GFX9-NEXT: s_endpgm 1560; 1561; GFX11-LABEL: v_clamp_med3_bya_f32: 1562; GFX11: ; %bb.0: 1563; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1564; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1565; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1566; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1567; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1568; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 1569; GFX11-NEXT: s_waitcnt vmcnt(0) 1570; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp 1571; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1572; GFX11-NEXT: s_endpgm 1573; 1574; GFX12-LABEL: v_clamp_med3_bya_f32: 1575; GFX12: ; 
%bb.0: 1576; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1577; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1578; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 1579; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1580; GFX12-NEXT: s_wait_kmcnt 0x0 1581; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] 1582; GFX12-NEXT: s_wait_loadcnt 0x0 1583; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp 1584; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] 1585; GFX12-NEXT: s_endpgm 1586 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1587 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid 1588 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid 1589 %a = load float, ptr addrspace(1) %gep0 1590 %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float %a, float 0.0) 1591 store float %med, ptr addrspace(1) %out.gep 1592 ret void 1593} 1594 1595define amdgpu_kernel void @v_clamp_constants_to_one_f32(ptr addrspace(1) %out) #0 { 1596; GFX6-LABEL: v_clamp_constants_to_one_f32: 1597; GFX6: ; %bb.0: 1598; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1599; GFX6-NEXT: s_mov_b32 s3, 0xf000 1600; GFX6-NEXT: s_mov_b32 s2, 0 1601; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1602; GFX6-NEXT: v_mov_b32_e32 v1, 0 1603; GFX6-NEXT: v_mov_b32_e32 v2, 1.0 1604; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1605; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 1606; GFX6-NEXT: s_endpgm 1607; 1608; GFX8-LABEL: v_clamp_constants_to_one_f32: 1609; GFX8: ; %bb.0: 1610; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1611; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1612; GFX8-NEXT: v_mov_b32_e32 v2, 1.0 1613; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1614; GFX8-NEXT: v_mov_b32_e32 v1, s1 1615; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 1616; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1617; GFX8-NEXT: flat_store_dword v[0:1], v2 1618; GFX8-NEXT: s_endpgm 1619; 1620; GFX9-LABEL: v_clamp_constants_to_one_f32: 1621; GFX9: ; %bb.0: 1622; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1623; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 
1624; GFX9-NEXT: v_mov_b32_e32 v1, 1.0 1625; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1626; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1627; GFX9-NEXT: s_endpgm 1628; 1629; GFX11-LABEL: v_clamp_constants_to_one_f32: 1630; GFX11: ; %bb.0: 1631; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1632; GFX11-NEXT: v_dual_mov_b32 v1, 1.0 :: v_dual_and_b32 v0, 0x3ff, v0 1633; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1634; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1635; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1636; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1637; GFX11-NEXT: s_endpgm 1638; 1639; GFX12-LABEL: v_clamp_constants_to_one_f32: 1640; GFX12: ; %bb.0: 1641; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1642; GFX12-NEXT: v_dual_mov_b32 v1, 1.0 :: v_dual_and_b32 v0, 0x3ff, v0 1643; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 1644; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1645; GFX12-NEXT: s_wait_kmcnt 0x0 1646; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] 1647; GFX12-NEXT: s_endpgm 1648 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1649 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid 1650 %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 4.0) 1651 store float %med, ptr addrspace(1) %out.gep 1652 ret void 1653} 1654 1655define amdgpu_kernel void @v_clamp_constants_to_zero_f32(ptr addrspace(1) %out) #0 { 1656; GFX6-LABEL: v_clamp_constants_to_zero_f32: 1657; GFX6: ; %bb.0: 1658; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1659; GFX6-NEXT: s_mov_b32 s3, 0xf000 1660; GFX6-NEXT: s_mov_b32 s2, 0 1661; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1662; GFX6-NEXT: v_mov_b32_e32 v1, 0 1663; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1664; GFX6-NEXT: buffer_store_dword v1, v[0:1], s[0:3], 0 addr64 1665; GFX6-NEXT: s_endpgm 1666; 1667; GFX8-LABEL: v_clamp_constants_to_zero_f32: 1668; GFX8: ; %bb.0: 1669; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1670; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1671; GFX8-NEXT: v_mov_b32_e32 v2, 0 1672; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1673; 
GFX8-NEXT: v_mov_b32_e32 v1, s1 1674; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 1675; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1676; GFX8-NEXT: flat_store_dword v[0:1], v2 1677; GFX8-NEXT: s_endpgm 1678; 1679; GFX9-LABEL: v_clamp_constants_to_zero_f32: 1680; GFX9: ; %bb.0: 1681; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1682; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1683; GFX9-NEXT: v_mov_b32_e32 v1, 0 1684; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1685; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1686; GFX9-NEXT: s_endpgm 1687; 1688; GFX11-LABEL: v_clamp_constants_to_zero_f32: 1689; GFX11: ; %bb.0: 1690; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1691; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 1692; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1693; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1694; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1695; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1696; GFX11-NEXT: s_endpgm 1697; 1698; GFX12-LABEL: v_clamp_constants_to_zero_f32: 1699; GFX12: ; %bb.0: 1700; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1701; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 1702; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 1703; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1704; GFX12-NEXT: s_wait_kmcnt 0x0 1705; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] 1706; GFX12-NEXT: s_endpgm 1707 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1708 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid 1709 %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float -4.0) 1710 store float %med, ptr addrspace(1) %out.gep 1711 ret void 1712} 1713 1714define amdgpu_kernel void @v_clamp_constant_preserve_f32(ptr addrspace(1) %out) #0 { 1715; GFX6-LABEL: v_clamp_constant_preserve_f32: 1716; GFX6: ; %bb.0: 1717; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1718; GFX6-NEXT: s_mov_b32 s3, 0xf000 1719; GFX6-NEXT: s_mov_b32 s2, 0 1720; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1721; GFX6-NEXT: v_mov_b32_e32 v1, 0 1722; GFX6-NEXT: 
v_mov_b32_e32 v2, 0.5 1723; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1724; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 1725; GFX6-NEXT: s_endpgm 1726; 1727; GFX8-LABEL: v_clamp_constant_preserve_f32: 1728; GFX8: ; %bb.0: 1729; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1730; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1731; GFX8-NEXT: v_mov_b32_e32 v2, 0.5 1732; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1733; GFX8-NEXT: v_mov_b32_e32 v1, s1 1734; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 1735; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1736; GFX8-NEXT: flat_store_dword v[0:1], v2 1737; GFX8-NEXT: s_endpgm 1738; 1739; GFX9-LABEL: v_clamp_constant_preserve_f32: 1740; GFX9: ; %bb.0: 1741; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1742; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1743; GFX9-NEXT: v_mov_b32_e32 v1, 0.5 1744; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1745; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1746; GFX9-NEXT: s_endpgm 1747; 1748; GFX11-LABEL: v_clamp_constant_preserve_f32: 1749; GFX11: ; %bb.0: 1750; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1751; GFX11-NEXT: v_dual_mov_b32 v1, 0.5 :: v_dual_and_b32 v0, 0x3ff, v0 1752; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1753; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1754; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1755; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1756; GFX11-NEXT: s_endpgm 1757; 1758; GFX12-LABEL: v_clamp_constant_preserve_f32: 1759; GFX12: ; %bb.0: 1760; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1761; GFX12-NEXT: v_dual_mov_b32 v1, 0.5 :: v_dual_and_b32 v0, 0x3ff, v0 1762; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 1763; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1764; GFX12-NEXT: s_wait_kmcnt 0x0 1765; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] 1766; GFX12-NEXT: s_endpgm 1767 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1768 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid 1769 %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 0.5) 1770 store float %med, ptr addrspace(1) 
%out.gep 1771 ret void 1772} 1773 1774define amdgpu_kernel void @v_clamp_constant_preserve_denorm_f32(ptr addrspace(1) %out) #0 { 1775; GFX6-LABEL: v_clamp_constant_preserve_denorm_f32: 1776; GFX6: ; %bb.0: 1777; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1778; GFX6-NEXT: s_mov_b32 s3, 0xf000 1779; GFX6-NEXT: s_mov_b32 s2, 0 1780; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1781; GFX6-NEXT: v_mov_b32_e32 v1, 0 1782; GFX6-NEXT: v_mov_b32_e32 v2, 0x7fffff 1783; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1784; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 1785; GFX6-NEXT: s_endpgm 1786; 1787; GFX8-LABEL: v_clamp_constant_preserve_denorm_f32: 1788; GFX8: ; %bb.0: 1789; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1790; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1791; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fffff 1792; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1793; GFX8-NEXT: v_mov_b32_e32 v1, s1 1794; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 1795; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1796; GFX8-NEXT: flat_store_dword v[0:1], v2 1797; GFX8-NEXT: s_endpgm 1798; 1799; GFX9-LABEL: v_clamp_constant_preserve_denorm_f32: 1800; GFX9: ; %bb.0: 1801; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1802; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1803; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fffff 1804; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1805; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1806; GFX9-NEXT: s_endpgm 1807; 1808; GFX11-LABEL: v_clamp_constant_preserve_denorm_f32: 1809; GFX11: ; %bb.0: 1810; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1811; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1812; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1813; GFX11-NEXT: v_dual_mov_b32 v1, 0x7fffff :: v_dual_lshlrev_b32 v0, 2, v0 1814; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1815; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1816; GFX11-NEXT: s_endpgm 1817; 1818; GFX12-LABEL: v_clamp_constant_preserve_denorm_f32: 1819; GFX12: ; %bb.0: 1820; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1821; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, 
v0 1822; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 1823; GFX12-NEXT: v_dual_mov_b32 v1, 0x7fffff :: v_dual_lshlrev_b32 v0, 2, v0 1824; GFX12-NEXT: s_wait_kmcnt 0x0 1825; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] 1826; GFX12-NEXT: s_endpgm 1827 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1828 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid 1829 %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float bitcast (i32 8388607 to float)) 1830 store float %med, ptr addrspace(1) %out.gep 1831 ret void 1832} 1833 1834define amdgpu_kernel void @v_clamp_constant_qnan_f32(ptr addrspace(1) %out) #0 { 1835; GFX6-LABEL: v_clamp_constant_qnan_f32: 1836; GFX6: ; %bb.0: 1837; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1838; GFX6-NEXT: s_mov_b32 s3, 0xf000 1839; GFX6-NEXT: s_mov_b32 s2, 0 1840; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1841; GFX6-NEXT: v_mov_b32_e32 v1, 0 1842; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1843; GFX6-NEXT: buffer_store_dword v1, v[0:1], s[0:3], 0 addr64 1844; GFX6-NEXT: s_endpgm 1845; 1846; GFX8-LABEL: v_clamp_constant_qnan_f32: 1847; GFX8: ; %bb.0: 1848; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1849; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1850; GFX8-NEXT: v_mov_b32_e32 v2, 0 1851; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1852; GFX8-NEXT: v_mov_b32_e32 v1, s1 1853; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 1854; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1855; GFX8-NEXT: flat_store_dword v[0:1], v2 1856; GFX8-NEXT: s_endpgm 1857; 1858; GFX9-LABEL: v_clamp_constant_qnan_f32: 1859; GFX9: ; %bb.0: 1860; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1861; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1862; GFX9-NEXT: v_mov_b32_e32 v1, 0 1863; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1864; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1865; GFX9-NEXT: s_endpgm 1866; 1867; GFX11-LABEL: v_clamp_constant_qnan_f32: 1868; GFX11: ; %bb.0: 1869; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1870; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 
1871; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1872; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1873; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1874; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1875; GFX11-NEXT: s_endpgm 1876; 1877; GFX12-LABEL: v_clamp_constant_qnan_f32: 1878; GFX12: ; %bb.0: 1879; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1880; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 1881; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 1882; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1883; GFX12-NEXT: s_wait_kmcnt 0x0 1884; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] 1885; GFX12-NEXT: s_endpgm 1886 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1887 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid 1888 %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 0x7FF8000000000000) 1889 store float %med, ptr addrspace(1) %out.gep 1890 ret void 1891} 1892 1893define amdgpu_kernel void @v_clamp_constant_snan_f32(ptr addrspace(1) %out) #0 { 1894; GFX6-LABEL: v_clamp_constant_snan_f32: 1895; GFX6: ; %bb.0: 1896; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1897; GFX6-NEXT: s_mov_b32 s3, 0xf000 1898; GFX6-NEXT: s_mov_b32 s2, 0 1899; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1900; GFX6-NEXT: v_mov_b32_e32 v1, 0 1901; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1902; GFX6-NEXT: buffer_store_dword v1, v[0:1], s[0:3], 0 addr64 1903; GFX6-NEXT: s_endpgm 1904; 1905; GFX8-LABEL: v_clamp_constant_snan_f32: 1906; GFX8: ; %bb.0: 1907; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1908; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1909; GFX8-NEXT: v_mov_b32_e32 v2, 0 1910; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1911; GFX8-NEXT: v_mov_b32_e32 v1, s1 1912; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 1913; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1914; GFX8-NEXT: flat_store_dword v[0:1], v2 1915; GFX8-NEXT: s_endpgm 1916; 1917; GFX9-LABEL: v_clamp_constant_snan_f32: 1918; GFX9: ; %bb.0: 1919; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1920; GFX9-NEXT: v_lshlrev_b32_e32 
v0, 2, v0 1921; GFX9-NEXT: v_mov_b32_e32 v1, 0 1922; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1923; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1924; GFX9-NEXT: s_endpgm 1925; 1926; GFX11-LABEL: v_clamp_constant_snan_f32: 1927; GFX11: ; %bb.0: 1928; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1929; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 1930; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1931; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1932; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1933; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1934; GFX11-NEXT: s_endpgm 1935; 1936; GFX12-LABEL: v_clamp_constant_snan_f32: 1937; GFX12: ; %bb.0: 1938; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1939; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 1940; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 1941; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1942; GFX12-NEXT: s_wait_kmcnt 0x0 1943; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] 1944; GFX12-NEXT: s_endpgm 1945 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1946 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid 1947 %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float bitcast (i32 2139095041 to float)) 1948 store float %med, ptr addrspace(1) %out.gep 1949 ret void 1950} 1951 1952; --------------------------------------------------------------------- 1953; Test non-default behaviors enabling snans and disabling dx10_clamp 1954; --------------------------------------------------------------------- 1955 1956define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #2 { 1957; GFX6-LABEL: v_clamp_f32_no_dx10_clamp: 1958; GFX6: ; %bb.0: 1959; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1960; GFX6-NEXT: s_mov_b32 s7, 0xf000 1961; GFX6-NEXT: s_mov_b32 s6, 0 1962; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1963; GFX6-NEXT: v_mov_b32_e32 v1, 0 1964; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1965; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] 1966; GFX6-NEXT: buffer_load_dword 
v2, v[0:1], s[4:7], 0 addr64 1967; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] 1968; GFX6-NEXT: s_waitcnt vmcnt(0) 1969; GFX6-NEXT: v_add_f32_e32 v2, 0.5, v2 1970; GFX6-NEXT: v_med3_f32 v2, v2, 0, 1.0 1971; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 1972; GFX6-NEXT: s_endpgm 1973; 1974; GFX8-LABEL: v_clamp_f32_no_dx10_clamp: 1975; GFX8: ; %bb.0: 1976; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1977; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1978; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1979; GFX8-NEXT: v_mov_b32_e32 v1, s3 1980; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1981; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1982; GFX8-NEXT: flat_load_dword v3, v[0:1] 1983; GFX8-NEXT: v_mov_b32_e32 v1, s1 1984; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 1985; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1986; GFX8-NEXT: s_waitcnt vmcnt(0) 1987; GFX8-NEXT: v_add_f32_e32 v2, 0.5, v3 1988; GFX8-NEXT: v_med3_f32 v2, v2, 0, 1.0 1989; GFX8-NEXT: flat_store_dword v[0:1], v2 1990; GFX8-NEXT: s_endpgm 1991; 1992; GFX9-LABEL: v_clamp_f32_no_dx10_clamp: 1993; GFX9: ; %bb.0: 1994; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1995; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1996; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1997; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 1998; GFX9-NEXT: s_waitcnt vmcnt(0) 1999; GFX9-NEXT: v_add_f32_e32 v1, 0.5, v1 2000; GFX9-NEXT: v_med3_f32 v1, v1, 0, 1.0 2001; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 2002; GFX9-NEXT: s_endpgm 2003; 2004; GFX11-LABEL: v_clamp_f32_no_dx10_clamp: 2005; GFX11: ; %bb.0: 2006; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 2007; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2008; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) 2009; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2010; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2011; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 2012; GFX11-NEXT: s_waitcnt vmcnt(0) 2013; GFX11-NEXT: v_add_f32_e32 v1, 0.5, v1 2014; GFX11-NEXT: v_med3_f32 v1, v1, 0, 1.0 2015; GFX11-NEXT: 
global_store_b32 v0, v1, s[0:1] 2016; GFX11-NEXT: s_endpgm 2017; 2018; GFX12-LABEL: v_clamp_f32_no_dx10_clamp: 2019; GFX12: ; %bb.0: 2020; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 2021; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2022; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 2023; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2024; GFX12-NEXT: s_wait_kmcnt 0x0 2025; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] 2026; GFX12-NEXT: s_wait_loadcnt 0x0 2027; GFX12-NEXT: v_add_f32_e64 v1, v1, 0.5 clamp 2028; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] 2029; GFX12-NEXT: s_endpgm 2030 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2031 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid 2032 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid 2033 %a = load float, ptr addrspace(1) %gep0 2034 %a.nnan = fadd nnan float %a, 0.5 2035 %max = call float @llvm.maxnum.f32(float %a.nnan, float 0.0) 2036 %med = call float @llvm.minnum.f32(float %max, float 1.0) 2037 2038 store float %med, ptr addrspace(1) %out.gep 2039 ret void 2040} 2041 2042define amdgpu_kernel void @v_clamp_f32_snan_dx10clamp(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #3 { 2043; GFX6-LABEL: v_clamp_f32_snan_dx10clamp: 2044; GFX6: ; %bb.0: 2045; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 2046; GFX6-NEXT: s_mov_b32 s7, 0xf000 2047; GFX6-NEXT: s_mov_b32 s6, 0 2048; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2049; GFX6-NEXT: v_mov_b32_e32 v1, 0 2050; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2051; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] 2052; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 2053; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] 2054; GFX6-NEXT: s_waitcnt vmcnt(0) 2055; GFX6-NEXT: v_add_f32_e64 v2, v2, 0.5 clamp 2056; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 2057; GFX6-NEXT: s_endpgm 2058; 2059; GFX8-LABEL: v_clamp_f32_snan_dx10clamp: 2060; GFX8: ; %bb.0: 2061; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2062; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 2063; GFX8-NEXT: s_waitcnt 
lgkmcnt(0) 2064; GFX8-NEXT: v_mov_b32_e32 v1, s3 2065; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 2066; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2067; GFX8-NEXT: flat_load_dword v3, v[0:1] 2068; GFX8-NEXT: v_mov_b32_e32 v1, s1 2069; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 2070; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2071; GFX8-NEXT: s_waitcnt vmcnt(0) 2072; GFX8-NEXT: v_add_f32_e64 v2, v3, 0.5 clamp 2073; GFX8-NEXT: flat_store_dword v[0:1], v2 2074; GFX8-NEXT: s_endpgm 2075; 2076; GFX9-LABEL: v_clamp_f32_snan_dx10clamp: 2077; GFX9: ; %bb.0: 2078; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2079; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2080; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2081; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 2082; GFX9-NEXT: s_waitcnt vmcnt(0) 2083; GFX9-NEXT: v_add_f32_e64 v1, v1, 0.5 clamp 2084; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 2085; GFX9-NEXT: s_endpgm 2086; 2087; GFX11-LABEL: v_clamp_f32_snan_dx10clamp: 2088; GFX11: ; %bb.0: 2089; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 2090; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2091; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 2092; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2093; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2094; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 2095; GFX11-NEXT: s_waitcnt vmcnt(0) 2096; GFX11-NEXT: v_add_f32_e64 v1, v1, 0.5 clamp 2097; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 2098; GFX11-NEXT: s_endpgm 2099; 2100; GFX12-LABEL: v_clamp_f32_snan_dx10clamp: 2101; GFX12: ; %bb.0: 2102; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 2103; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2104; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 2105; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2106; GFX12-NEXT: s_wait_kmcnt 0x0 2107; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] 2108; GFX12-NEXT: s_wait_loadcnt 0x0 2109; GFX12-NEXT: v_add_f32_e64 v1, v1, 0.5 clamp 2110; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] 2111; GFX12-NEXT: s_endpgm 2112 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2113 
%gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid 2114 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid 2115 %a = load float, ptr addrspace(1) %gep0 2116 %add = fadd float %a, 0.5 2117 %max = call float @llvm.maxnum.f32(float %add, float 0.0) 2118 %med = call float @llvm.minnum.f32(float %max, float 1.0) 2119 2120 store float %med, ptr addrspace(1) %out.gep 2121 ret void 2122} 2123 2124define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #4 { 2125; GFX6-LABEL: v_clamp_f32_snan_no_dx10clamp: 2126; GFX6: ; %bb.0: 2127; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 2128; GFX6-NEXT: s_mov_b32 s7, 0xf000 2129; GFX6-NEXT: s_mov_b32 s6, 0 2130; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2131; GFX6-NEXT: v_mov_b32_e32 v1, 0 2132; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2133; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] 2134; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 2135; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] 2136; GFX6-NEXT: s_waitcnt vmcnt(0) 2137; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 2138; GFX6-NEXT: v_med3_f32 v2, v2, 0, 1.0 2139; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 2140; GFX6-NEXT: s_endpgm 2141; 2142; GFX8-LABEL: v_clamp_f32_snan_no_dx10clamp: 2143; GFX8: ; %bb.0: 2144; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2145; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 2146; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2147; GFX8-NEXT: v_mov_b32_e32 v1, s3 2148; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 2149; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2150; GFX8-NEXT: flat_load_dword v3, v[0:1] 2151; GFX8-NEXT: v_mov_b32_e32 v1, s1 2152; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 2153; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2154; GFX8-NEXT: s_waitcnt vmcnt(0) 2155; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 2156; GFX8-NEXT: v_med3_f32 v2, v2, 0, 1.0 2157; GFX8-NEXT: flat_store_dword v[0:1], v2 2158; GFX8-NEXT: s_endpgm 2159; 2160; GFX9-LABEL: v_clamp_f32_snan_no_dx10clamp: 2161; GFX9: ; %bb.0: 
2162; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2163; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2164; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2165; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 2166; GFX9-NEXT: s_waitcnt vmcnt(0) 2167; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 2168; GFX9-NEXT: v_med3_f32 v1, v1, 0, 1.0 2169; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 2170; GFX9-NEXT: s_endpgm 2171; 2172; GFX11-LABEL: v_clamp_f32_snan_no_dx10clamp: 2173; GFX11: ; %bb.0: 2174; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 2175; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2176; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) 2177; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2178; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2179; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 2180; GFX11-NEXT: s_waitcnt vmcnt(0) 2181; GFX11-NEXT: v_max_f32_e32 v1, v1, v1 2182; GFX11-NEXT: v_med3_f32 v1, v1, 0, 1.0 2183; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 2184; GFX11-NEXT: s_endpgm 2185; 2186; GFX12-LABEL: v_clamp_f32_snan_no_dx10clamp: 2187; GFX12: ; %bb.0: 2188; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 2189; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2190; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 2191; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2192; GFX12-NEXT: s_wait_kmcnt 0x0 2193; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] 2194; GFX12-NEXT: s_wait_loadcnt 0x0 2195; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp 2196; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] 2197; GFX12-NEXT: s_endpgm 2198 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2199 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid 2200 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid 2201 %a = load float, ptr addrspace(1) %gep0 2202 %max = call float @llvm.maxnum.f32(float %a, float 0.0) 2203 %med = call float @llvm.minnum.f32(float %max, float 1.0) 2204 2205 store float %med, ptr addrspace(1) %out.gep 2206 ret void 2207} 2208 2209define amdgpu_kernel void 
@v_clamp_f32_snan_no_dx10clamp_nnan_src(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #4 { 2210; GFX6-LABEL: v_clamp_f32_snan_no_dx10clamp_nnan_src: 2211; GFX6: ; %bb.0: 2212; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 2213; GFX6-NEXT: s_mov_b32 s7, 0xf000 2214; GFX6-NEXT: s_mov_b32 s6, 0 2215; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2216; GFX6-NEXT: v_mov_b32_e32 v1, 0 2217; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2218; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] 2219; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 2220; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] 2221; GFX6-NEXT: s_waitcnt vmcnt(0) 2222; GFX6-NEXT: v_add_f32_e32 v2, 1.0, v2 2223; GFX6-NEXT: v_med3_f32 v2, v2, 0, 1.0 2224; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 2225; GFX6-NEXT: s_endpgm 2226; 2227; GFX8-LABEL: v_clamp_f32_snan_no_dx10clamp_nnan_src: 2228; GFX8: ; %bb.0: 2229; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2230; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 2231; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2232; GFX8-NEXT: v_mov_b32_e32 v1, s3 2233; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 2234; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2235; GFX8-NEXT: flat_load_dword v3, v[0:1] 2236; GFX8-NEXT: v_mov_b32_e32 v1, s1 2237; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 2238; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2239; GFX8-NEXT: s_waitcnt vmcnt(0) 2240; GFX8-NEXT: v_add_f32_e32 v2, 1.0, v3 2241; GFX8-NEXT: v_med3_f32 v2, v2, 0, 1.0 2242; GFX8-NEXT: flat_store_dword v[0:1], v2 2243; GFX8-NEXT: s_endpgm 2244; 2245; GFX9-LABEL: v_clamp_f32_snan_no_dx10clamp_nnan_src: 2246; GFX9: ; %bb.0: 2247; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2248; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2249; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2250; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 2251; GFX9-NEXT: s_waitcnt vmcnt(0) 2252; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 2253; GFX9-NEXT: v_med3_f32 v1, v1, 0, 1.0 2254; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 2255; GFX9-NEXT: s_endpgm 2256; 2257; 
GFX11-LABEL: v_clamp_f32_snan_no_dx10clamp_nnan_src: 2258; GFX11: ; %bb.0: 2259; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 2260; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2261; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) 2262; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2263; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2264; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 2265; GFX11-NEXT: s_waitcnt vmcnt(0) 2266; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1 2267; GFX11-NEXT: v_med3_f32 v1, v1, 0, 1.0 2268; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 2269; GFX11-NEXT: s_endpgm 2270; 2271; GFX12-LABEL: v_clamp_f32_snan_no_dx10clamp_nnan_src: 2272; GFX12: ; %bb.0: 2273; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 2274; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2275; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 2276; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2277; GFX12-NEXT: s_wait_kmcnt 0x0 2278; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] 2279; GFX12-NEXT: s_wait_loadcnt 0x0 2280; GFX12-NEXT: v_add_f32_e64 v1, v1, 1.0 clamp 2281; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] 2282; GFX12-NEXT: s_endpgm 2283 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2284 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid 2285 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid 2286 %a = load float, ptr addrspace(1) %gep0 2287 %add = fadd nnan float %a, 1.0 2288 %max = call float @llvm.maxnum.f32(float %add, float 0.0) 2289 %med = call float @llvm.minnum.f32(float %max, float 1.0) 2290 2291 store float %med, ptr addrspace(1) %out.gep 2292 ret void 2293} 2294 2295define amdgpu_kernel void @v_clamp_med3_aby_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #2 { 2296; GFX6-LABEL: v_clamp_med3_aby_f32_no_dx10_clamp: 2297; GFX6: ; %bb.0: 2298; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 2299; GFX6-NEXT: s_mov_b32 s7, 0xf000 2300; GFX6-NEXT: s_mov_b32 s6, 0 2301; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2302; GFX6-NEXT: v_mov_b32_e32 v1, 0 
2303; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2304; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] 2305; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 2306; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] 2307; GFX6-NEXT: s_waitcnt vmcnt(0) 2308; GFX6-NEXT: v_max_f32_e64 v2, v2, v2 clamp 2309; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 2310; GFX6-NEXT: s_endpgm 2311; 2312; GFX8-LABEL: v_clamp_med3_aby_f32_no_dx10_clamp: 2313; GFX8: ; %bb.0: 2314; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2315; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 2316; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2317; GFX8-NEXT: v_mov_b32_e32 v1, s3 2318; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 2319; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2320; GFX8-NEXT: flat_load_dword v3, v[0:1] 2321; GFX8-NEXT: v_mov_b32_e32 v1, s1 2322; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 2323; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2324; GFX8-NEXT: s_waitcnt vmcnt(0) 2325; GFX8-NEXT: v_max_f32_e64 v2, v3, v3 clamp 2326; GFX8-NEXT: flat_store_dword v[0:1], v2 2327; GFX8-NEXT: s_endpgm 2328; 2329; GFX9-LABEL: v_clamp_med3_aby_f32_no_dx10_clamp: 2330; GFX9: ; %bb.0: 2331; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2332; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2333; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2334; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 2335; GFX9-NEXT: s_waitcnt vmcnt(0) 2336; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp 2337; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 2338; GFX9-NEXT: s_endpgm 2339; 2340; GFX11-LABEL: v_clamp_med3_aby_f32_no_dx10_clamp: 2341; GFX11: ; %bb.0: 2342; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 2343; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2344; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 2345; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2346; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2347; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 2348; GFX11-NEXT: s_waitcnt vmcnt(0) 2349; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp 2350; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 2351; GFX11-NEXT: 
s_endpgm 2352; 2353; GFX12-LABEL: v_clamp_med3_aby_f32_no_dx10_clamp: 2354; GFX12: ; %bb.0: 2355; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 2356; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2357; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 2358; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2359; GFX12-NEXT: s_wait_kmcnt 0x0 2360; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] 2361; GFX12-NEXT: s_wait_loadcnt 0x0 2362; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp 2363; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] 2364; GFX12-NEXT: s_endpgm 2365 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2366 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid 2367 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid 2368 %a = load float, ptr addrspace(1) %gep0 2369 %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float %a) 2370 store float %med, ptr addrspace(1) %out.gep 2371 ret void 2372} 2373 2374define amdgpu_kernel void @v_clamp_med3_bay_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #2 { 2375; GFX6-LABEL: v_clamp_med3_bay_f32_no_dx10_clamp: 2376; GFX6: ; %bb.0: 2377; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 2378; GFX6-NEXT: s_mov_b32 s7, 0xf000 2379; GFX6-NEXT: s_mov_b32 s6, 0 2380; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2381; GFX6-NEXT: v_mov_b32_e32 v1, 0 2382; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2383; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] 2384; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 2385; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] 2386; GFX6-NEXT: s_waitcnt vmcnt(0) 2387; GFX6-NEXT: v_max_f32_e64 v2, v2, v2 clamp 2388; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 2389; GFX6-NEXT: s_endpgm 2390; 2391; GFX8-LABEL: v_clamp_med3_bay_f32_no_dx10_clamp: 2392; GFX8: ; %bb.0: 2393; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2394; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 2395; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2396; GFX8-NEXT: v_mov_b32_e32 v1, s3 2397; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 2398; GFX8-NEXT: 
v_addc_u32_e32 v1, vcc, 0, v1, vcc 2399; GFX8-NEXT: flat_load_dword v3, v[0:1] 2400; GFX8-NEXT: v_mov_b32_e32 v1, s1 2401; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 2402; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2403; GFX8-NEXT: s_waitcnt vmcnt(0) 2404; GFX8-NEXT: v_max_f32_e64 v2, v3, v3 clamp 2405; GFX8-NEXT: flat_store_dword v[0:1], v2 2406; GFX8-NEXT: s_endpgm 2407; 2408; GFX9-LABEL: v_clamp_med3_bay_f32_no_dx10_clamp: 2409; GFX9: ; %bb.0: 2410; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2411; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2412; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2413; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 2414; GFX9-NEXT: s_waitcnt vmcnt(0) 2415; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp 2416; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 2417; GFX9-NEXT: s_endpgm 2418; 2419; GFX11-LABEL: v_clamp_med3_bay_f32_no_dx10_clamp: 2420; GFX11: ; %bb.0: 2421; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 2422; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2423; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 2424; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2425; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2426; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 2427; GFX11-NEXT: s_waitcnt vmcnt(0) 2428; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp 2429; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 2430; GFX11-NEXT: s_endpgm 2431; 2432; GFX12-LABEL: v_clamp_med3_bay_f32_no_dx10_clamp: 2433; GFX12: ; %bb.0: 2434; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 2435; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2436; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 2437; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2438; GFX12-NEXT: s_wait_kmcnt 0x0 2439; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] 2440; GFX12-NEXT: s_wait_loadcnt 0x0 2441; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp 2442; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] 2443; GFX12-NEXT: s_endpgm 2444 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2445 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid 2446 %out.gep = getelementptr 
float, ptr addrspace(1) %out, i32 %tid 2447 %a = load float, ptr addrspace(1) %gep0 2448 %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float 0.0, float %a) 2449 store float %med, ptr addrspace(1) %out.gep 2450 ret void 2451} 2452 2453define amdgpu_kernel void @v_clamp_med3_yab_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #2 { 2454; GFX6-LABEL: v_clamp_med3_yab_f32_no_dx10_clamp: 2455; GFX6: ; %bb.0: 2456; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 2457; GFX6-NEXT: s_mov_b32 s7, 0xf000 2458; GFX6-NEXT: s_mov_b32 s6, 0 2459; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2460; GFX6-NEXT: v_mov_b32_e32 v1, 0 2461; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2462; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] 2463; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 2464; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] 2465; GFX6-NEXT: s_waitcnt vmcnt(0) 2466; GFX6-NEXT: v_med3_f32 v2, v2, 0, 1.0 2467; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 2468; GFX6-NEXT: s_endpgm 2469; 2470; GFX8-LABEL: v_clamp_med3_yab_f32_no_dx10_clamp: 2471; GFX8: ; %bb.0: 2472; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2473; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 2474; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2475; GFX8-NEXT: v_mov_b32_e32 v1, s3 2476; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 2477; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2478; GFX8-NEXT: flat_load_dword v3, v[0:1] 2479; GFX8-NEXT: v_mov_b32_e32 v1, s1 2480; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 2481; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2482; GFX8-NEXT: s_waitcnt vmcnt(0) 2483; GFX8-NEXT: v_med3_f32 v2, v3, 0, 1.0 2484; GFX8-NEXT: flat_store_dword v[0:1], v2 2485; GFX8-NEXT: s_endpgm 2486; 2487; GFX9-LABEL: v_clamp_med3_yab_f32_no_dx10_clamp: 2488; GFX9: ; %bb.0: 2489; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2490; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2491; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2492; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 2493; GFX9-NEXT: s_waitcnt vmcnt(0) 2494; GFX9-NEXT: 
v_med3_f32 v1, v1, 0, 1.0 2495; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 2496; GFX9-NEXT: s_endpgm 2497; 2498; GFX11-LABEL: v_clamp_med3_yab_f32_no_dx10_clamp: 2499; GFX11: ; %bb.0: 2500; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 2501; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2502; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 2503; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2504; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2505; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 2506; GFX11-NEXT: s_waitcnt vmcnt(0) 2507; GFX11-NEXT: v_med3_f32 v1, v1, 0, 1.0 2508; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 2509; GFX11-NEXT: s_endpgm 2510; 2511; GFX12-LABEL: v_clamp_med3_yab_f32_no_dx10_clamp: 2512; GFX12: ; %bb.0: 2513; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 2514; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2515; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 2516; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2517; GFX12-NEXT: s_wait_kmcnt 0x0 2518; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] 2519; GFX12-NEXT: s_wait_loadcnt 0x0 2520; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp 2521; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] 2522; GFX12-NEXT: s_endpgm 2523 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2524 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid 2525 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid 2526 %a = load float, ptr addrspace(1) %gep0 2527 %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 0.0, float 1.0) 2528 store float %med, ptr addrspace(1) %out.gep 2529 ret void 2530} 2531 2532define amdgpu_kernel void @v_clamp_med3_yba_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #2 { 2533; GFX6-LABEL: v_clamp_med3_yba_f32_no_dx10_clamp: 2534; GFX6: ; %bb.0: 2535; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 2536; GFX6-NEXT: s_mov_b32 s7, 0xf000 2537; GFX6-NEXT: s_mov_b32 s6, 0 2538; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2539; GFX6-NEXT: v_mov_b32_e32 v1, 0 2540; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2541; GFX6-NEXT: s_mov_b64 
s[4:5], s[2:3] 2542; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 2543; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] 2544; GFX6-NEXT: s_waitcnt vmcnt(0) 2545; GFX6-NEXT: v_med3_f32 v2, v2, 1.0, 0 2546; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 2547; GFX6-NEXT: s_endpgm 2548; 2549; GFX8-LABEL: v_clamp_med3_yba_f32_no_dx10_clamp: 2550; GFX8: ; %bb.0: 2551; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2552; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 2553; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2554; GFX8-NEXT: v_mov_b32_e32 v1, s3 2555; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 2556; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2557; GFX8-NEXT: flat_load_dword v3, v[0:1] 2558; GFX8-NEXT: v_mov_b32_e32 v1, s1 2559; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 2560; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2561; GFX8-NEXT: s_waitcnt vmcnt(0) 2562; GFX8-NEXT: v_med3_f32 v2, v3, 1.0, 0 2563; GFX8-NEXT: flat_store_dword v[0:1], v2 2564; GFX8-NEXT: s_endpgm 2565; 2566; GFX9-LABEL: v_clamp_med3_yba_f32_no_dx10_clamp: 2567; GFX9: ; %bb.0: 2568; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2569; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2570; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2571; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 2572; GFX9-NEXT: s_waitcnt vmcnt(0) 2573; GFX9-NEXT: v_med3_f32 v1, v1, 1.0, 0 2574; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 2575; GFX9-NEXT: s_endpgm 2576; 2577; GFX11-LABEL: v_clamp_med3_yba_f32_no_dx10_clamp: 2578; GFX11: ; %bb.0: 2579; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 2580; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2581; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 2582; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2583; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2584; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 2585; GFX11-NEXT: s_waitcnt vmcnt(0) 2586; GFX11-NEXT: v_med3_f32 v1, v1, 1.0, 0 2587; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 2588; GFX11-NEXT: s_endpgm 2589; 2590; GFX12-LABEL: v_clamp_med3_yba_f32_no_dx10_clamp: 2591; GFX12: ; 
%bb.0: 2592; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 2593; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2594; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 2595; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2596; GFX12-NEXT: s_wait_kmcnt 0x0 2597; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] 2598; GFX12-NEXT: s_wait_loadcnt 0x0 2599; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp 2600; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] 2601; GFX12-NEXT: s_endpgm 2602 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2603 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid 2604 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid 2605 %a = load float, ptr addrspace(1) %gep0 2606 %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 1.0, float 0.0) 2607 store float %med, ptr addrspace(1) %out.gep 2608 ret void 2609} 2610 2611define amdgpu_kernel void @v_clamp_med3_ayb_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #2 { 2612; GFX6-LABEL: v_clamp_med3_ayb_f32_no_dx10_clamp: 2613; GFX6: ; %bb.0: 2614; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 2615; GFX6-NEXT: s_mov_b32 s7, 0xf000 2616; GFX6-NEXT: s_mov_b32 s6, 0 2617; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2618; GFX6-NEXT: v_mov_b32_e32 v1, 0 2619; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2620; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] 2621; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 2622; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] 2623; GFX6-NEXT: s_waitcnt vmcnt(0) 2624; GFX6-NEXT: v_med3_f32 v2, 0, v2, 1.0 2625; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 2626; GFX6-NEXT: s_endpgm 2627; 2628; GFX8-LABEL: v_clamp_med3_ayb_f32_no_dx10_clamp: 2629; GFX8: ; %bb.0: 2630; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2631; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 2632; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2633; GFX8-NEXT: v_mov_b32_e32 v1, s3 2634; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 2635; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2636; GFX8-NEXT: flat_load_dword v3, v[0:1] 2637; 
GFX8-NEXT: v_mov_b32_e32 v1, s1 2638; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 2639; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2640; GFX8-NEXT: s_waitcnt vmcnt(0) 2641; GFX8-NEXT: v_med3_f32 v2, 0, v3, 1.0 2642; GFX8-NEXT: flat_store_dword v[0:1], v2 2643; GFX8-NEXT: s_endpgm 2644; 2645; GFX9-LABEL: v_clamp_med3_ayb_f32_no_dx10_clamp: 2646; GFX9: ; %bb.0: 2647; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2648; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2649; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2650; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 2651; GFX9-NEXT: s_waitcnt vmcnt(0) 2652; GFX9-NEXT: v_med3_f32 v1, 0, v1, 1.0 2653; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 2654; GFX9-NEXT: s_endpgm 2655; 2656; GFX11-LABEL: v_clamp_med3_ayb_f32_no_dx10_clamp: 2657; GFX11: ; %bb.0: 2658; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 2659; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2660; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 2661; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2662; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2663; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 2664; GFX11-NEXT: s_waitcnt vmcnt(0) 2665; GFX11-NEXT: v_med3_f32 v1, 0, v1, 1.0 2666; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 2667; GFX11-NEXT: s_endpgm 2668; 2669; GFX12-LABEL: v_clamp_med3_ayb_f32_no_dx10_clamp: 2670; GFX12: ; %bb.0: 2671; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 2672; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2673; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 2674; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2675; GFX12-NEXT: s_wait_kmcnt 0x0 2676; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] 2677; GFX12-NEXT: s_wait_loadcnt 0x0 2678; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp 2679; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] 2680; GFX12-NEXT: s_endpgm 2681 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2682 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid 2683 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid 2684 %a = load float, ptr addrspace(1) %gep0 2685 %med = 
call float @llvm.amdgcn.fmed3.f32(float 0.0, float %a, float 1.0) 2686 store float %med, ptr addrspace(1) %out.gep 2687 ret void 2688} 2689 2690define amdgpu_kernel void @v_clamp_med3_bya_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #2 { 2691; GFX6-LABEL: v_clamp_med3_bya_f32_no_dx10_clamp: 2692; GFX6: ; %bb.0: 2693; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 2694; GFX6-NEXT: s_mov_b32 s7, 0xf000 2695; GFX6-NEXT: s_mov_b32 s6, 0 2696; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2697; GFX6-NEXT: v_mov_b32_e32 v1, 0 2698; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2699; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] 2700; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 2701; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] 2702; GFX6-NEXT: s_waitcnt vmcnt(0) 2703; GFX6-NEXT: v_med3_f32 v2, 1.0, v2, 0 2704; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 2705; GFX6-NEXT: s_endpgm 2706; 2707; GFX8-LABEL: v_clamp_med3_bya_f32_no_dx10_clamp: 2708; GFX8: ; %bb.0: 2709; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2710; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 2711; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2712; GFX8-NEXT: v_mov_b32_e32 v1, s3 2713; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 2714; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2715; GFX8-NEXT: flat_load_dword v3, v[0:1] 2716; GFX8-NEXT: v_mov_b32_e32 v1, s1 2717; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 2718; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2719; GFX8-NEXT: s_waitcnt vmcnt(0) 2720; GFX8-NEXT: v_med3_f32 v2, 1.0, v3, 0 2721; GFX8-NEXT: flat_store_dword v[0:1], v2 2722; GFX8-NEXT: s_endpgm 2723; 2724; GFX9-LABEL: v_clamp_med3_bya_f32_no_dx10_clamp: 2725; GFX9: ; %bb.0: 2726; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2727; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2728; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2729; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 2730; GFX9-NEXT: s_waitcnt vmcnt(0) 2731; GFX9-NEXT: v_med3_f32 v1, 1.0, v1, 0 2732; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 2733; GFX9-NEXT: 
s_endpgm 2734; 2735; GFX11-LABEL: v_clamp_med3_bya_f32_no_dx10_clamp: 2736; GFX11: ; %bb.0: 2737; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 2738; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2739; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 2740; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2741; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2742; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 2743; GFX11-NEXT: s_waitcnt vmcnt(0) 2744; GFX11-NEXT: v_med3_f32 v1, 1.0, v1, 0 2745; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 2746; GFX11-NEXT: s_endpgm 2747; 2748; GFX12-LABEL: v_clamp_med3_bya_f32_no_dx10_clamp: 2749; GFX12: ; %bb.0: 2750; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 2751; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2752; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 2753; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2754; GFX12-NEXT: s_wait_kmcnt 0x0 2755; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] 2756; GFX12-NEXT: s_wait_loadcnt 0x0 2757; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp 2758; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] 2759; GFX12-NEXT: s_endpgm 2760 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2761 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid 2762 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid 2763 %a = load float, ptr addrspace(1) %gep0 2764 %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float %a, float 0.0) 2765 store float %med, ptr addrspace(1) %out.gep 2766 ret void 2767} 2768 2769define amdgpu_kernel void @v_clamp_constant_qnan_f32_no_dx10_clamp(ptr addrspace(1) %out) #2 { 2770; GFX6-LABEL: v_clamp_constant_qnan_f32_no_dx10_clamp: 2771; GFX6: ; %bb.0: 2772; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 2773; GFX6-NEXT: s_mov_b32 s3, 0xf000 2774; GFX6-NEXT: s_mov_b32 s2, 0 2775; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2776; GFX6-NEXT: v_mov_b32_e32 v1, 0 2777; GFX6-NEXT: v_mov_b32_e32 v2, 0x7fc00000 2778; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2779; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 2780; GFX6-NEXT: s_endpgm 
2781; 2782; GFX8-LABEL: v_clamp_constant_qnan_f32_no_dx10_clamp: 2783; GFX8: ; %bb.0: 2784; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 2785; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2786; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc00000 2787; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2788; GFX8-NEXT: v_mov_b32_e32 v1, s1 2789; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 2790; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2791; GFX8-NEXT: flat_store_dword v[0:1], v2 2792; GFX8-NEXT: s_endpgm 2793; 2794; GFX9-LABEL: v_clamp_constant_qnan_f32_no_dx10_clamp: 2795; GFX9: ; %bb.0: 2796; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 2797; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2798; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000 2799; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2800; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 2801; GFX9-NEXT: s_endpgm 2802; 2803; GFX11-LABEL: v_clamp_constant_qnan_f32_no_dx10_clamp: 2804; GFX11: ; %bb.0: 2805; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 2806; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2807; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 2808; GFX11-NEXT: v_dual_mov_b32 v1, 0x7fc00000 :: v_dual_lshlrev_b32 v0, 2, v0 2809; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2810; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 2811; GFX11-NEXT: s_endpgm 2812; 2813; GFX12-LABEL: v_clamp_constant_qnan_f32_no_dx10_clamp: 2814; GFX12: ; %bb.0: 2815; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 2816; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 2817; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 2818; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2819; GFX12-NEXT: s_wait_kmcnt 0x0 2820; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] 2821; GFX12-NEXT: s_endpgm 2822 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2823 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid 2824 %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 0x7FF8000000000000) 2825 store float %med, ptr addrspace(1) %out.gep 2826 ret void 2827} 2828 2829define amdgpu_kernel void 
@v_clamp_constant_snan_f32_no_dx10_clamp(ptr addrspace(1) %out) #2 { 2830; GFX6-LABEL: v_clamp_constant_snan_f32_no_dx10_clamp: 2831; GFX6: ; %bb.0: 2832; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 2833; GFX6-NEXT: s_mov_b32 s3, 0xf000 2834; GFX6-NEXT: s_mov_b32 s2, 0 2835; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2836; GFX6-NEXT: v_mov_b32_e32 v1, 0 2837; GFX6-NEXT: v_mov_b32_e32 v2, 0x7f800001 2838; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2839; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 2840; GFX6-NEXT: s_endpgm 2841; 2842; GFX8-LABEL: v_clamp_constant_snan_f32_no_dx10_clamp: 2843; GFX8: ; %bb.0: 2844; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 2845; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2846; GFX8-NEXT: v_mov_b32_e32 v2, 0x7f800001 2847; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2848; GFX8-NEXT: v_mov_b32_e32 v1, s1 2849; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 2850; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2851; GFX8-NEXT: flat_store_dword v[0:1], v2 2852; GFX8-NEXT: s_endpgm 2853; 2854; GFX9-LABEL: v_clamp_constant_snan_f32_no_dx10_clamp: 2855; GFX9: ; %bb.0: 2856; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 2857; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2858; GFX9-NEXT: v_mov_b32_e32 v1, 0x7f800001 2859; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2860; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 2861; GFX9-NEXT: s_endpgm 2862; 2863; GFX11-LABEL: v_clamp_constant_snan_f32_no_dx10_clamp: 2864; GFX11: ; %bb.0: 2865; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 2866; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2867; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 2868; GFX11-NEXT: v_dual_mov_b32 v1, 0x7f800001 :: v_dual_lshlrev_b32 v0, 2, v0 2869; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2870; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 2871; GFX11-NEXT: s_endpgm 2872; 2873; GFX12-LABEL: v_clamp_constant_snan_f32_no_dx10_clamp: 2874; GFX12: ; %bb.0: 2875; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 2876; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 2877; 
GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 2878; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2879; GFX12-NEXT: s_wait_kmcnt 0x0 2880; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] 2881; GFX12-NEXT: s_endpgm 2882 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2883 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid 2884 %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float bitcast (i32 2139095041 to float)) 2885 store float %med, ptr addrspace(1) %out.gep 2886 ret void 2887} 2888 2889define amdgpu_kernel void @v_clamp_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { 2890; GFX6-LABEL: v_clamp_v2f16: 2891; GFX6: ; %bb.0: 2892; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 2893; GFX6-NEXT: s_mov_b32 s7, 0xf000 2894; GFX6-NEXT: s_mov_b32 s6, 0 2895; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2896; GFX6-NEXT: v_mov_b32_e32 v1, 0 2897; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2898; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] 2899; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 2900; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] 2901; GFX6-NEXT: s_waitcnt vmcnt(0) 2902; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 2903; GFX6-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp 2904; GFX6-NEXT: v_cvt_f32_f16_e64 v2, v2 clamp 2905; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 2906; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 2907; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 2908; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 2909; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 2910; GFX6-NEXT: s_endpgm 2911; 2912; GFX8-LABEL: v_clamp_v2f16: 2913; GFX8: ; %bb.0: 2914; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2915; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 2916; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2917; GFX8-NEXT: v_mov_b32_e32 v1, s3 2918; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 2919; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2920; GFX8-NEXT: flat_load_dword v3, v[0:1] 2921; GFX8-NEXT: v_mov_b32_e32 v1, s1 2922; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 2923; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 
2924; GFX8-NEXT: s_waitcnt vmcnt(0) 2925; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 2926; GFX8-NEXT: v_max_f16_e64 v3, v3, v3 clamp 2927; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 2928; GFX8-NEXT: flat_store_dword v[0:1], v2 2929; GFX8-NEXT: s_endpgm 2930; 2931; GFX9-LABEL: v_clamp_v2f16: 2932; GFX9: ; %bb.0: 2933; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2934; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2935; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2936; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 2937; GFX9-NEXT: s_waitcnt vmcnt(0) 2938; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 clamp 2939; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 2940; GFX9-NEXT: s_endpgm 2941; 2942; GFX11-LABEL: v_clamp_v2f16: 2943; GFX11: ; %bb.0: 2944; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 2945; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2946; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 2947; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2948; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2949; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 2950; GFX11-NEXT: s_waitcnt vmcnt(0) 2951; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 clamp 2952; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 2953; GFX11-NEXT: s_endpgm 2954; 2955; GFX12-LABEL: v_clamp_v2f16: 2956; GFX12: ; %bb.0: 2957; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 2958; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2959; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 2960; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2961; GFX12-NEXT: s_wait_kmcnt 0x0 2962; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] 2963; GFX12-NEXT: s_wait_loadcnt 0x0 2964; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 clamp 2965; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] 2966; GFX12-NEXT: s_endpgm 2967 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2968 %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid 2969 %out.gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid 2970 %a = load <2 x half>, ptr addrspace(1) %gep0 2971 %max = call <2 x 
half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> zeroinitializer) 2972 %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>) 2973 2974 store <2 x half> %med, ptr addrspace(1) %out.gep 2975 ret void 2976} 2977 2978define amdgpu_kernel void @v_clamp_v2f16_undef_elt(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { 2979; GFX6-LABEL: v_clamp_v2f16_undef_elt: 2980; GFX6: ; %bb.0: 2981; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 2982; GFX6-NEXT: s_mov_b32 s7, 0xf000 2983; GFX6-NEXT: s_mov_b32 s6, 0 2984; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2985; GFX6-NEXT: v_mov_b32_e32 v1, 0 2986; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2987; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] 2988; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 2989; GFX6-NEXT: v_mov_b32_e32 v4, 0x7fc00000 2990; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] 2991; GFX6-NEXT: s_waitcnt vmcnt(0) 2992; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 2993; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 2994; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 2995; GFX6-NEXT: v_max_f32_e32 v2, 0x7fc00000, v2 2996; GFX6-NEXT: v_med3_f32 v3, v3, 0, v4 2997; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 2998; GFX6-NEXT: v_min_f32_e32 v2, 1.0, v2 2999; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 3000; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 3001; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 3002; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 3003; GFX6-NEXT: s_endpgm 3004; 3005; GFX8-LABEL: v_clamp_v2f16_undef_elt: 3006; GFX8: ; %bb.0: 3007; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3008; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 3009; GFX8-NEXT: v_mov_b32_e32 v4, 0x7e00 3010; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3011; GFX8-NEXT: v_mov_b32_e32 v1, s3 3012; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 3013; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3014; GFX8-NEXT: flat_load_dword v3, v[0:1] 3015; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 3016; GFX8-NEXT: v_mov_b32_e32 v1, s1 3017; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3018; 
GFX8-NEXT: s_waitcnt vmcnt(0) 3019; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3020; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 3021; GFX8-NEXT: v_max_f16_e32 v2, 0, v2 3022; GFX8-NEXT: v_max_f16_e32 v3, 0x7e00, v3 3023; GFX8-NEXT: v_min_f16_e32 v3, 1.0, v3 3024; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 3025; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 3026; GFX8-NEXT: flat_store_dword v[0:1], v2 3027; GFX8-NEXT: s_endpgm 3028; 3029; GFX9-LABEL: v_clamp_v2f16_undef_elt: 3030; GFX9: ; %bb.0: 3031; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3032; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3033; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3034; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 3035; GFX9-NEXT: s_waitcnt vmcnt(0) 3036; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 clamp 3037; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 3038; GFX9-NEXT: s_endpgm 3039; 3040; GFX11-LABEL: v_clamp_v2f16_undef_elt: 3041; GFX11: ; %bb.0: 3042; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 3043; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 3044; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 3045; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3046; GFX11-NEXT: s_waitcnt lgkmcnt(0) 3047; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 3048; GFX11-NEXT: s_waitcnt vmcnt(0) 3049; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 clamp 3050; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 3051; GFX11-NEXT: s_endpgm 3052; 3053; GFX12-LABEL: v_clamp_v2f16_undef_elt: 3054; GFX12: ; %bb.0: 3055; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 3056; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 3057; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 3058; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3059; GFX12-NEXT: s_wait_kmcnt 0x0 3060; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] 3061; GFX12-NEXT: s_wait_loadcnt 0x0 3062; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 clamp 3063; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] 3064; GFX12-NEXT: s_endpgm 3065 %tid = call 
i32 @llvm.amdgcn.workitem.id.x() 3066 %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid 3067 %out.gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid 3068 %a = load <2 x half>, ptr addrspace(1) %gep0 3069 %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half undef, half 0.0>) 3070 %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half undef>) 3071 3072 store <2 x half> %med, ptr addrspace(1) %out.gep 3073 ret void 3074} 3075 3076define amdgpu_kernel void @v_clamp_v2f16_not_zero(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { 3077; GFX6-LABEL: v_clamp_v2f16_not_zero: 3078; GFX6: ; %bb.0: 3079; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 3080; GFX6-NEXT: s_mov_b32 s7, 0xf000 3081; GFX6-NEXT: s_mov_b32 s6, 0 3082; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3083; GFX6-NEXT: v_mov_b32_e32 v1, 0 3084; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3085; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] 3086; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 3087; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] 3088; GFX6-NEXT: s_waitcnt vmcnt(0) 3089; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 3090; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 3091; GFX6-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp 3092; GFX6-NEXT: v_max_f32_e32 v2, 2.0, v2 3093; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 3094; GFX6-NEXT: v_min_f32_e32 v2, 1.0, v2 3095; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 3096; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 3097; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 3098; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 3099; GFX6-NEXT: s_endpgm 3100; 3101; GFX8-LABEL: v_clamp_v2f16_not_zero: 3102; GFX8: ; %bb.0: 3103; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3104; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 3105; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3106; GFX8-NEXT: v_mov_b32_e32 v1, s3 3107; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 3108; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3109; GFX8-NEXT: flat_load_dword v3, v[0:1] 3110; GFX8-NEXT: 
v_add_u32_e32 v0, vcc, s0, v2 3111; GFX8-NEXT: v_mov_b32_e32 v1, s1 3112; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3113; GFX8-NEXT: s_waitcnt vmcnt(0) 3114; GFX8-NEXT: v_max_f16_e32 v2, v3, v3 3115; GFX8-NEXT: v_max_f16_e32 v2, 2.0, v2 3116; GFX8-NEXT: v_max_f16_sdwa v3, v3, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3117; GFX8-NEXT: v_min_f16_e32 v2, 1.0, v2 3118; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 3119; GFX8-NEXT: flat_store_dword v[0:1], v2 3120; GFX8-NEXT: s_endpgm 3121; 3122; GFX9-LABEL: v_clamp_v2f16_not_zero: 3123; GFX9: ; %bb.0: 3124; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3125; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3126; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3127; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 3128; GFX9-NEXT: s_waitcnt vmcnt(0) 3129; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 3130; GFX9-NEXT: v_pk_max_f16 v1, v1, 2.0 3131; GFX9-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0] 3132; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 3133; GFX9-NEXT: s_endpgm 3134; 3135; GFX11-LABEL: v_clamp_v2f16_not_zero: 3136; GFX11: ; %bb.0: 3137; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 3138; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 3139; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) 3140; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3141; GFX11-NEXT: s_waitcnt lgkmcnt(0) 3142; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 3143; GFX11-NEXT: s_waitcnt vmcnt(0) 3144; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 3145; GFX11-NEXT: v_pk_max_f16 v1, v1, 2.0 3146; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 3147; GFX11-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0] 3148; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 3149; GFX11-NEXT: s_endpgm 3150; 3151; GFX12-LABEL: v_clamp_v2f16_not_zero: 3152; GFX12: ; %bb.0: 3153; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 3154; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 3155; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) 3156; 
GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3157; GFX12-NEXT: s_wait_kmcnt 0x0 3158; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] 3159; GFX12-NEXT: s_wait_loadcnt 0x0 3160; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 3161; GFX12-NEXT: v_pk_max_num_f16 v1, v1, 2.0 3162; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 3163; GFX12-NEXT: v_pk_min_num_f16 v1, v1, 1.0 op_sel_hi:[1,0] 3164; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] 3165; GFX12-NEXT: s_endpgm 3166 %tid = call i32 @llvm.amdgcn.workitem.id.x() 3167 %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid 3168 %out.gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid 3169 %a = load <2 x half>, ptr addrspace(1) %gep0 3170 %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half 2.0, half 0.0>) 3171 %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>) 3172 3173 store <2 x half> %med, ptr addrspace(1) %out.gep 3174 ret void 3175} 3176 3177define amdgpu_kernel void @v_clamp_v2f16_not_one(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { 3178; GFX6-LABEL: v_clamp_v2f16_not_one: 3179; GFX6: ; %bb.0: 3180; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 3181; GFX6-NEXT: s_mov_b32 s7, 0xf000 3182; GFX6-NEXT: s_mov_b32 s6, 0 3183; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3184; GFX6-NEXT: v_mov_b32_e32 v1, 0 3185; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3186; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] 3187; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 3188; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] 3189; GFX6-NEXT: s_waitcnt vmcnt(0) 3190; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 3191; GFX6-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp 3192; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 3193; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 3194; GFX6-NEXT: v_med3_f32 v2, v2, 0, 0 3195; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 3196; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 3197; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 3198; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 3199; GFX6-NEXT: s_endpgm 
3200; 3201; GFX8-LABEL: v_clamp_v2f16_not_one: 3202; GFX8: ; %bb.0: 3203; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3204; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 3205; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3206; GFX8-NEXT: v_mov_b32_e32 v1, s3 3207; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 3208; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3209; GFX8-NEXT: flat_load_dword v3, v[0:1] 3210; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 3211; GFX8-NEXT: v_mov_b32_e32 v1, s1 3212; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3213; GFX8-NEXT: s_waitcnt vmcnt(0) 3214; GFX8-NEXT: v_max_f16_e32 v2, v3, v3 3215; GFX8-NEXT: v_max_f16_e32 v2, 0, v2 3216; GFX8-NEXT: v_max_f16_sdwa v3, v3, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3217; GFX8-NEXT: v_min_f16_e32 v2, 0, v2 3218; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 3219; GFX8-NEXT: flat_store_dword v[0:1], v2 3220; GFX8-NEXT: s_endpgm 3221; 3222; GFX9-LABEL: v_clamp_v2f16_not_one: 3223; GFX9: ; %bb.0: 3224; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3225; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3226; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3227; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 3228; GFX9-NEXT: s_waitcnt vmcnt(0) 3229; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 3230; GFX9-NEXT: v_pk_max_f16 v1, v1, 0 3231; GFX9-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel:[0,1] op_sel_hi:[1,0] 3232; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 3233; GFX9-NEXT: s_endpgm 3234; 3235; GFX11-LABEL: v_clamp_v2f16_not_one: 3236; GFX11: ; %bb.0: 3237; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 3238; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 3239; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) 3240; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3241; GFX11-NEXT: s_waitcnt lgkmcnt(0) 3242; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 3243; GFX11-NEXT: s_waitcnt vmcnt(0) 3244; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 3245; GFX11-NEXT: v_pk_max_f16 v1, v1, 0 3246; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 3247; 
GFX11-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel:[0,1] op_sel_hi:[1,0] 3248; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 3249; GFX11-NEXT: s_endpgm 3250; 3251; GFX12-LABEL: v_clamp_v2f16_not_one: 3252; GFX12: ; %bb.0: 3253; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 3254; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 3255; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) 3256; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3257; GFX12-NEXT: s_wait_kmcnt 0x0 3258; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] 3259; GFX12-NEXT: s_wait_loadcnt 0x0 3260; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 3261; GFX12-NEXT: v_pk_max_num_f16 v1, v1, 0 3262; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 3263; GFX12-NEXT: v_pk_min_num_f16 v1, v1, 1.0 op_sel:[0,1] op_sel_hi:[1,0] 3264; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] 3265; GFX12-NEXT: s_endpgm 3266 %tid = call i32 @llvm.amdgcn.workitem.id.x() 3267 %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid 3268 %out.gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid 3269 %a = load <2 x half>, ptr addrspace(1) %gep0 3270 %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half 0.0, half 0.0>) 3271 %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 0.0, half 1.0>) 3272 3273 store <2 x half> %med, ptr addrspace(1) %out.gep 3274 ret void 3275} 3276 3277define amdgpu_kernel void @v_clamp_neg_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { 3278; GFX6-LABEL: v_clamp_neg_v2f16: 3279; GFX6: ; %bb.0: 3280; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 3281; GFX6-NEXT: s_mov_b32 s7, 0xf000 3282; GFX6-NEXT: s_mov_b32 s6, 0 3283; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3284; GFX6-NEXT: v_mov_b32_e32 v1, 0 3285; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3286; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] 3287; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 3288; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] 3289; GFX6-NEXT: s_waitcnt vmcnt(0) 3290; GFX6-NEXT: 
v_xor_b32_e32 v2, 0x80008000, v2 3291; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 3292; GFX6-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp 3293; GFX6-NEXT: v_cvt_f32_f16_e64 v2, v2 clamp 3294; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 3295; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 3296; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 3297; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 3298; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 3299; GFX6-NEXT: s_endpgm 3300; 3301; GFX8-LABEL: v_clamp_neg_v2f16: 3302; GFX8: ; %bb.0: 3303; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3304; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 3305; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3306; GFX8-NEXT: v_mov_b32_e32 v1, s3 3307; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 3308; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3309; GFX8-NEXT: flat_load_dword v3, v[0:1] 3310; GFX8-NEXT: v_mov_b32_e32 v1, s1 3311; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 3312; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3313; GFX8-NEXT: s_waitcnt vmcnt(0) 3314; GFX8-NEXT: v_max_f16_sdwa v2, -v3, -v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3315; GFX8-NEXT: v_max_f16_e64 v3, -v3, -v3 clamp 3316; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 3317; GFX8-NEXT: flat_store_dword v[0:1], v2 3318; GFX8-NEXT: s_endpgm 3319; 3320; GFX9-LABEL: v_clamp_neg_v2f16: 3321; GFX9: ; %bb.0: 3322; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3323; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3324; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3325; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 3326; GFX9-NEXT: s_waitcnt vmcnt(0) 3327; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp 3328; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 3329; GFX9-NEXT: s_endpgm 3330; 3331; GFX11-LABEL: v_clamp_neg_v2f16: 3332; GFX11: ; %bb.0: 3333; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 3334; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 3335; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 3336; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3337; GFX11-NEXT: s_waitcnt 
lgkmcnt(0) 3338; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 3339; GFX11-NEXT: s_waitcnt vmcnt(0) 3340; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp 3341; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 3342; GFX11-NEXT: s_endpgm 3343; 3344; GFX12-LABEL: v_clamp_neg_v2f16: 3345; GFX12: ; %bb.0: 3346; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 3347; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 3348; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 3349; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3350; GFX12-NEXT: s_wait_kmcnt 0x0 3351; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] 3352; GFX12-NEXT: s_wait_loadcnt 0x0 3353; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp 3354; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] 3355; GFX12-NEXT: s_endpgm 3356 %tid = call i32 @llvm.amdgcn.workitem.id.x() 3357 %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid 3358 %out.gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid 3359 %a = load <2 x half>, ptr addrspace(1) %gep0 3360 %fneg.a = fneg <2 x half> %a 3361 %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %fneg.a, <2 x half> zeroinitializer) 3362 %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>) 3363 3364 store <2 x half> %med, ptr addrspace(1) %out.gep 3365 ret void 3366} 3367 3368define amdgpu_kernel void @v_clamp_negabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { 3369; GFX6-LABEL: v_clamp_negabs_v2f16: 3370; GFX6: ; %bb.0: 3371; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 3372; GFX6-NEXT: s_mov_b32 s7, 0xf000 3373; GFX6-NEXT: s_mov_b32 s6, 0 3374; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3375; GFX6-NEXT: v_mov_b32_e32 v1, 0 3376; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3377; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] 3378; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 3379; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] 3380; GFX6-NEXT: s_waitcnt vmcnt(0) 3381; GFX6-NEXT: v_or_b32_e32 v2, 0x80008000, v2 3382; GFX6-NEXT: 
v_lshrrev_b32_e32 v3, 16, v2 3383; GFX6-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp 3384; GFX6-NEXT: v_cvt_f32_f16_e64 v2, v2 clamp 3385; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 3386; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 3387; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 3388; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 3389; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 3390; GFX6-NEXT: s_endpgm 3391; 3392; GFX8-LABEL: v_clamp_negabs_v2f16: 3393; GFX8: ; %bb.0: 3394; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3395; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 3396; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3397; GFX8-NEXT: v_mov_b32_e32 v1, s3 3398; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 3399; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3400; GFX8-NEXT: flat_load_dword v3, v[0:1] 3401; GFX8-NEXT: v_mov_b32_e32 v1, s1 3402; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 3403; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3404; GFX8-NEXT: s_waitcnt vmcnt(0) 3405; GFX8-NEXT: v_max_f16_sdwa v2, -|v3|, -|v3| clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3406; GFX8-NEXT: v_max_f16_e64 v3, -|v3|, -|v3| clamp 3407; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 3408; GFX8-NEXT: flat_store_dword v[0:1], v2 3409; GFX8-NEXT: s_endpgm 3410; 3411; GFX9-LABEL: v_clamp_negabs_v2f16: 3412; GFX9: ; %bb.0: 3413; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3414; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3415; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3416; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 3417; GFX9-NEXT: s_waitcnt vmcnt(0) 3418; GFX9-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1 3419; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp 3420; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 3421; GFX9-NEXT: s_endpgm 3422; 3423; GFX11-LABEL: v_clamp_negabs_v2f16: 3424; GFX11: ; %bb.0: 3425; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 3426; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 3427; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) 3428; GFX11-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 3429; GFX11-NEXT: s_waitcnt lgkmcnt(0) 3430; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 3431; GFX11-NEXT: s_waitcnt vmcnt(0) 3432; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1 3433; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp 3434; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 3435; GFX11-NEXT: s_endpgm 3436; 3437; GFX12-LABEL: v_clamp_negabs_v2f16: 3438; GFX12: ; %bb.0: 3439; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 3440; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 3441; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) 3442; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3443; GFX12-NEXT: s_wait_kmcnt 0x0 3444; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] 3445; GFX12-NEXT: s_wait_loadcnt 0x0 3446; GFX12-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1 3447; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp 3448; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] 3449; GFX12-NEXT: s_endpgm 3450 %tid = call i32 @llvm.amdgcn.workitem.id.x() 3451 %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid 3452 %out.gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid 3453 %a = load <2 x half>, ptr addrspace(1) %gep0 3454 %fabs.a = call <2 x half> @llvm.fabs.v2f16(<2 x half> %a) 3455 %fneg.fabs.a = fneg <2 x half> %fabs.a 3456 3457 %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %fneg.fabs.a, <2 x half> zeroinitializer) 3458 %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>) 3459 3460 store <2 x half> %med, ptr addrspace(1) %out.gep 3461 ret void 3462} 3463 3464define amdgpu_kernel void @v_clamp_neglo_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { 3465; GFX6-LABEL: v_clamp_neglo_v2f16: 3466; GFX6: ; %bb.0: 3467; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 3468; GFX6-NEXT: s_mov_b32 s7, 0xf000 3469; GFX6-NEXT: s_mov_b32 s6, 0 3470; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3471; GFX6-NEXT: v_mov_b32_e32 v1, 0 3472; 
GFX6-NEXT: s_waitcnt lgkmcnt(0) 3473; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] 3474; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 3475; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] 3476; GFX6-NEXT: s_waitcnt vmcnt(0) 3477; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 3478; GFX6-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp 3479; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 3480; GFX6-NEXT: v_cvt_f32_f16_e64 v2, -v2 clamp 3481; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 3482; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 3483; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 3484; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 3485; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 3486; GFX6-NEXT: s_endpgm 3487; 3488; GFX8-LABEL: v_clamp_neglo_v2f16: 3489; GFX8: ; %bb.0: 3490; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3491; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 3492; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3493; GFX8-NEXT: v_mov_b32_e32 v1, s3 3494; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 3495; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3496; GFX8-NEXT: flat_load_dword v3, v[0:1] 3497; GFX8-NEXT: v_mov_b32_e32 v1, s1 3498; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 3499; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3500; GFX8-NEXT: s_waitcnt vmcnt(0) 3501; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3502; GFX8-NEXT: v_max_f16_e64 v3, -v3, -v3 clamp 3503; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 3504; GFX8-NEXT: flat_store_dword v[0:1], v2 3505; GFX8-NEXT: s_endpgm 3506; 3507; GFX9-LABEL: v_clamp_neglo_v2f16: 3508; GFX9: ; %bb.0: 3509; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3510; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3511; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3512; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 3513; GFX9-NEXT: s_waitcnt vmcnt(0) 3514; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] clamp 3515; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 3516; GFX9-NEXT: s_endpgm 3517; 3518; GFX11-LABEL: v_clamp_neglo_v2f16: 3519; GFX11: ; %bb.0: 
3520; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 3521; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 3522; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 3523; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3524; GFX11-NEXT: s_waitcnt lgkmcnt(0) 3525; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 3526; GFX11-NEXT: s_waitcnt vmcnt(0) 3527; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] clamp 3528; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 3529; GFX11-NEXT: s_endpgm 3530; 3531; GFX12-LABEL: v_clamp_neglo_v2f16: 3532; GFX12: ; %bb.0: 3533; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 3534; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 3535; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 3536; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3537; GFX12-NEXT: s_wait_kmcnt 0x0 3538; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] 3539; GFX12-NEXT: s_wait_loadcnt 0x0 3540; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 neg_lo:[1,1] clamp 3541; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] 3542; GFX12-NEXT: s_endpgm 3543 %tid = call i32 @llvm.amdgcn.workitem.id.x() 3544 %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid 3545 %out.gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid 3546 %a = load <2 x half>, ptr addrspace(1) %gep0 3547 %lo = extractelement <2 x half> %a, i32 0 3548 %neg.lo = fneg half %lo 3549 %neg.lo.vec = insertelement <2 x half> %a, half %neg.lo, i32 0 3550 %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %neg.lo.vec, <2 x half> zeroinitializer) 3551 %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>) 3552 3553 store <2 x half> %med, ptr addrspace(1) %out.gep 3554 ret void 3555} 3556 3557define amdgpu_kernel void @v_clamp_neghi_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { 3558; GFX6-LABEL: v_clamp_neghi_v2f16: 3559; GFX6: ; %bb.0: 3560; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 3561; GFX6-NEXT: s_mov_b32 s7, 0xf000 3562; GFX6-NEXT: s_mov_b32 s6, 0 3563; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3564; GFX6-NEXT: 
v_mov_b32_e32 v1, 0 3565; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3566; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] 3567; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 3568; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] 3569; GFX6-NEXT: s_waitcnt vmcnt(0) 3570; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 3571; GFX6-NEXT: v_cvt_f32_f16_e64 v3, -v3 clamp 3572; GFX6-NEXT: v_cvt_f32_f16_e64 v2, v2 clamp 3573; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 3574; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 3575; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 3576; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 3577; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 3578; GFX6-NEXT: s_endpgm 3579; 3580; GFX8-LABEL: v_clamp_neghi_v2f16: 3581; GFX8: ; %bb.0: 3582; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3583; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 3584; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3585; GFX8-NEXT: v_mov_b32_e32 v1, s3 3586; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 3587; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3588; GFX8-NEXT: flat_load_dword v3, v[0:1] 3589; GFX8-NEXT: v_mov_b32_e32 v1, s1 3590; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 3591; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3592; GFX8-NEXT: s_waitcnt vmcnt(0) 3593; GFX8-NEXT: v_max_f16_sdwa v2, -v3, -v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3594; GFX8-NEXT: v_max_f16_e64 v3, v3, v3 clamp 3595; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 3596; GFX8-NEXT: flat_store_dword v[0:1], v2 3597; GFX8-NEXT: s_endpgm 3598; 3599; GFX9-LABEL: v_clamp_neghi_v2f16: 3600; GFX9: ; %bb.0: 3601; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3602; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3603; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3604; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 3605; GFX9-NEXT: s_waitcnt vmcnt(0) 3606; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 neg_hi:[1,1] clamp 3607; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 3608; GFX9-NEXT: s_endpgm 3609; 3610; GFX11-LABEL: v_clamp_neghi_v2f16: 3611; GFX11: ; %bb.0: 3612; GFX11-NEXT: 
s_load_b128 s[0:3], s[4:5], 0x24 3613; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 3614; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 3615; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3616; GFX11-NEXT: s_waitcnt lgkmcnt(0) 3617; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 3618; GFX11-NEXT: s_waitcnt vmcnt(0) 3619; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_hi:[1,1] clamp 3620; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 3621; GFX11-NEXT: s_endpgm 3622; 3623; GFX12-LABEL: v_clamp_neghi_v2f16: 3624; GFX12: ; %bb.0: 3625; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 3626; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 3627; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 3628; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3629; GFX12-NEXT: s_wait_kmcnt 0x0 3630; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] 3631; GFX12-NEXT: s_wait_loadcnt 0x0 3632; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 neg_hi:[1,1] clamp 3633; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] 3634; GFX12-NEXT: s_endpgm 3635 %tid = call i32 @llvm.amdgcn.workitem.id.x() 3636 %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid 3637 %out.gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid 3638 %a = load <2 x half>, ptr addrspace(1) %gep0 3639 %hi = extractelement <2 x half> %a, i32 1 3640 %neg.hi = fneg half %hi 3641 %neg.hi.vec = insertelement <2 x half> %a, half %neg.hi, i32 1 3642 %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %neg.hi.vec, <2 x half> zeroinitializer) 3643 %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>) 3644 3645 store <2 x half> %med, ptr addrspace(1) %out.gep 3646 ret void 3647} 3648 3649define amdgpu_kernel void @v_clamp_v2f16_shuffle(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { 3650; GFX6-LABEL: v_clamp_v2f16_shuffle: 3651; GFX6: ; %bb.0: 3652; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 3653; GFX6-NEXT: s_mov_b32 s7, 0xf000 3654; GFX6-NEXT: s_mov_b32 s6, 0 3655; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3656; GFX6-NEXT: v_mov_b32_e32 
v1, 0 3657; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3658; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] 3659; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 3660; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] 3661; GFX6-NEXT: s_waitcnt vmcnt(0) 3662; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 3663; GFX6-NEXT: v_cvt_f32_f16_e64 v2, v2 clamp 3664; GFX6-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp 3665; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 3666; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 3667; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 3668; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 3669; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 3670; GFX6-NEXT: s_endpgm 3671; 3672; GFX8-LABEL: v_clamp_v2f16_shuffle: 3673; GFX8: ; %bb.0: 3674; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3675; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 3676; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3677; GFX8-NEXT: v_mov_b32_e32 v1, s3 3678; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 3679; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3680; GFX8-NEXT: flat_load_dword v3, v[0:1] 3681; GFX8-NEXT: v_mov_b32_e32 v1, s1 3682; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 3683; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3684; GFX8-NEXT: s_waitcnt vmcnt(0) 3685; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 3686; GFX8-NEXT: v_max_f16_sdwa v3, v3, v3 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3687; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 3688; GFX8-NEXT: flat_store_dword v[0:1], v2 3689; GFX8-NEXT: s_endpgm 3690; 3691; GFX9-LABEL: v_clamp_v2f16_shuffle: 3692; GFX9: ; %bb.0: 3693; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3694; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3695; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3696; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 3697; GFX9-NEXT: s_waitcnt vmcnt(0) 3698; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 op_sel:[1,1] op_sel_hi:[0,0] clamp 3699; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 3700; GFX9-NEXT: s_endpgm 3701; 3702; 
GFX11-LABEL: v_clamp_v2f16_shuffle: 3703; GFX11: ; %bb.0: 3704; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 3705; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 3706; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 3707; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3708; GFX11-NEXT: s_waitcnt lgkmcnt(0) 3709; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 3710; GFX11-NEXT: s_waitcnt vmcnt(0) 3711; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 op_sel:[1,1] op_sel_hi:[0,0] clamp 3712; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 3713; GFX11-NEXT: s_endpgm 3714; 3715; GFX12-LABEL: v_clamp_v2f16_shuffle: 3716; GFX12: ; %bb.0: 3717; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 3718; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 3719; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 3720; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3721; GFX12-NEXT: s_wait_kmcnt 0x0 3722; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] 3723; GFX12-NEXT: s_wait_loadcnt 0x0 3724; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 op_sel:[1,1] op_sel_hi:[0,0] clamp 3725; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] 3726; GFX12-NEXT: s_endpgm 3727 %tid = call i32 @llvm.amdgcn.workitem.id.x() 3728 %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid 3729 %out.gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid 3730 %a = load <2 x half>, ptr addrspace(1) %gep0 3731 %shuf = shufflevector <2 x half> %a, <2 x half> poison, <2 x i32> <i32 1, i32 0> ; swaps the halves of %a; the second operand is a never-read placeholder 3732 %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %shuf, <2 x half> zeroinitializer) 3733 %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>) 3734 3735 store <2 x half> %med, ptr addrspace(1) %out.gep 3736 ret void 3737} 3738 3739define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { 3740; GFX6-LABEL: v_clamp_v2f16_undef_limit_elts0: 3741; GFX6: ; %bb.0: 3742; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 3743; GFX6-NEXT: s_mov_b32 s7, 0xf000 3744; GFX6-NEXT: s_mov_b32 s6, 0 3745; 
GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3746; GFX6-NEXT: v_mov_b32_e32 v1, 0 3747; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3748; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] 3749; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 3750; GFX6-NEXT: v_mov_b32_e32 v4, 0x7fc00000 3751; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] 3752; GFX6-NEXT: s_waitcnt vmcnt(0) 3753; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 3754; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 3755; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 3756; GFX6-NEXT: v_max_f32_e32 v3, 0x7fc00000, v3 3757; GFX6-NEXT: v_min_f32_e32 v3, 1.0, v3 3758; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 3759; GFX6-NEXT: v_med3_f32 v2, v2, 0, v4 3760; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 3761; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 3762; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 3763; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 3764; GFX6-NEXT: s_endpgm 3765; 3766; GFX8-LABEL: v_clamp_v2f16_undef_limit_elts0: 3767; GFX8: ; %bb.0: 3768; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3769; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 3770; GFX8-NEXT: v_mov_b32_e32 v4, 0x3c00 3771; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3772; GFX8-NEXT: v_mov_b32_e32 v1, s3 3773; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 3774; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3775; GFX8-NEXT: flat_load_dword v3, v[0:1] 3776; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 3777; GFX8-NEXT: v_mov_b32_e32 v1, s1 3778; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3779; GFX8-NEXT: s_waitcnt vmcnt(0) 3780; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3781; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 3782; GFX8-NEXT: v_max_f16_e32 v2, 0x7e00, v2 3783; GFX8-NEXT: v_max_f16_e32 v3, 0, v3 3784; GFX8-NEXT: v_min_f16_e32 v3, 0x7e00, v3 3785; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 3786; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 3787; GFX8-NEXT: flat_store_dword v[0:1], v2 3788; GFX8-NEXT: s_endpgm 3789; 
3790; GFX9-LABEL: v_clamp_v2f16_undef_limit_elts0: 3791; GFX9: ; %bb.0: 3792; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3793; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3794; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3795; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 3796; GFX9-NEXT: s_waitcnt vmcnt(0) 3797; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 clamp 3798; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 3799; GFX9-NEXT: s_endpgm 3800; 3801; GFX11-LABEL: v_clamp_v2f16_undef_limit_elts0: 3802; GFX11: ; %bb.0: 3803; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 3804; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 3805; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 3806; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3807; GFX11-NEXT: s_waitcnt lgkmcnt(0) 3808; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 3809; GFX11-NEXT: s_waitcnt vmcnt(0) 3810; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 clamp 3811; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 3812; GFX11-NEXT: s_endpgm 3813; 3814; GFX12-LABEL: v_clamp_v2f16_undef_limit_elts0: 3815; GFX12: ; %bb.0: 3816; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 3817; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 3818; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 3819; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3820; GFX12-NEXT: s_wait_kmcnt 0x0 3821; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] 3822; GFX12-NEXT: s_wait_loadcnt 0x0 3823; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 clamp 3824; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] 3825; GFX12-NEXT: s_endpgm 3826 %tid = call i32 @llvm.amdgcn.workitem.id.x() 3827 %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid 3828 %out.gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid 3829 %a = load <2 x half>, ptr addrspace(1) %gep0 3830 %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half 0.0, half undef>) 3831 %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half undef, half 1.0>) 3832 3833 store <2 x half> %med, ptr addrspace(1) %out.gep 3834 ret void 3835} 3836 
3837define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { 3838; GFX6-LABEL: v_clamp_v2f16_undef_limit_elts1: 3839; GFX6: ; %bb.0: 3840; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 3841; GFX6-NEXT: s_mov_b32 s7, 0xf000 3842; GFX6-NEXT: s_mov_b32 s6, 0 3843; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3844; GFX6-NEXT: v_mov_b32_e32 v1, 0 3845; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3846; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] 3847; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 3848; GFX6-NEXT: v_mov_b32_e32 v4, 0x7fc00000 3849; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] 3850; GFX6-NEXT: s_waitcnt vmcnt(0) 3851; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 3852; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 3853; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 3854; GFX6-NEXT: v_max_f32_e32 v2, 0x7fc00000, v2 3855; GFX6-NEXT: v_med3_f32 v3, v3, 0, v4 3856; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 3857; GFX6-NEXT: v_min_f32_e32 v2, 1.0, v2 3858; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 3859; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 3860; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 3861; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 3862; GFX6-NEXT: s_endpgm 3863; 3864; GFX8-LABEL: v_clamp_v2f16_undef_limit_elts1: 3865; GFX8: ; %bb.0: 3866; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3867; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 3868; GFX8-NEXT: v_mov_b32_e32 v4, 0x7e00 3869; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3870; GFX8-NEXT: v_mov_b32_e32 v1, s3 3871; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 3872; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3873; GFX8-NEXT: flat_load_dword v3, v[0:1] 3874; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 3875; GFX8-NEXT: v_mov_b32_e32 v1, s1 3876; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3877; GFX8-NEXT: s_waitcnt vmcnt(0) 3878; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3879; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 3880; GFX8-NEXT: v_max_f16_e32 v2, 0, v2 3881; 
GFX8-NEXT: v_max_f16_e32 v3, 0x7e00, v3 3882; GFX8-NEXT: v_min_f16_e32 v3, 1.0, v3 3883; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 3884; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 3885; GFX8-NEXT: flat_store_dword v[0:1], v2 3886; GFX8-NEXT: s_endpgm 3887; 3888; GFX9-LABEL: v_clamp_v2f16_undef_limit_elts1: 3889; GFX9: ; %bb.0: 3890; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3891; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3892; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3893; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 3894; GFX9-NEXT: s_waitcnt vmcnt(0) 3895; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 clamp 3896; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 3897; GFX9-NEXT: s_endpgm 3898; 3899; GFX11-LABEL: v_clamp_v2f16_undef_limit_elts1: 3900; GFX11: ; %bb.0: 3901; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 3902; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 3903; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 3904; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3905; GFX11-NEXT: s_waitcnt lgkmcnt(0) 3906; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 3907; GFX11-NEXT: s_waitcnt vmcnt(0) 3908; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 clamp 3909; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 3910; GFX11-NEXT: s_endpgm 3911; 3912; GFX12-LABEL: v_clamp_v2f16_undef_limit_elts1: 3913; GFX12: ; %bb.0: 3914; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 3915; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 3916; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 3917; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3918; GFX12-NEXT: s_wait_kmcnt 0x0 3919; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] 3920; GFX12-NEXT: s_wait_loadcnt 0x0 3921; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 clamp 3922; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] 3923; GFX12-NEXT: s_endpgm 3924 %tid = call i32 @llvm.amdgcn.workitem.id.x() 3925 %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid 3926 %out.gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid 3927 %a = load <2 x half>, 
ptr addrspace(1) %gep0 3928 %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half undef, half 0.0>) 3929 %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half undef>) 3930 3931 store <2 x half> %med, ptr addrspace(1) %out.gep 3932 ret void 3933} 3934 3935define amdgpu_kernel void @v_clamp_diff_source_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 3936; GFX6-LABEL: v_clamp_diff_source_f32: 3937; GFX6: ; %bb.0: 3938; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 3939; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3940; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 3941; GFX6-NEXT: s_load_dword s2, s[2:3], 0x2 3942; GFX6-NEXT: s_mov_b32 s3, 0xf000 3943; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3944; GFX6-NEXT: v_mov_b32_e32 v0, s5 3945; GFX6-NEXT: v_mov_b32_e32 v1, s2 3946; GFX6-NEXT: v_add_f32_e32 v0, s4, v0 3947; GFX6-NEXT: v_add_f32_e32 v1, s4, v1 3948; GFX6-NEXT: v_max_f32_e64 v0, v0, v1 clamp 3949; GFX6-NEXT: s_mov_b32 s2, -1 3950; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:12 3951; GFX6-NEXT: s_endpgm 3952; 3953; GFX8-LABEL: v_clamp_diff_source_f32: 3954; GFX8: ; %bb.0: 3955; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3956; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3957; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 3958; GFX8-NEXT: s_load_dword s2, s[2:3], 0x8 3959; GFX8-NEXT: s_add_u32 s0, s0, 12 3960; GFX8-NEXT: s_addc_u32 s1, s1, 0 3961; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3962; GFX8-NEXT: v_mov_b32_e32 v0, s5 3963; GFX8-NEXT: v_mov_b32_e32 v1, s2 3964; GFX8-NEXT: v_add_f32_e32 v0, s4, v0 3965; GFX8-NEXT: v_add_f32_e32 v1, s4, v1 3966; GFX8-NEXT: v_max_f32_e64 v2, v0, v1 clamp 3967; GFX8-NEXT: v_mov_b32_e32 v0, s0 3968; GFX8-NEXT: v_mov_b32_e32 v1, s1 3969; GFX8-NEXT: flat_store_dword v[0:1], v2 3970; GFX8-NEXT: s_endpgm 3971; 3972; GFX9-LABEL: v_clamp_diff_source_f32: 3973; GFX9: ; %bb.0: 3974; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3975; GFX9-NEXT: v_mov_b32_e32 v0, 0 3976; GFX9-NEXT: s_waitcnt lgkmcnt(0) 
3977; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 3978; GFX9-NEXT: s_load_dword s6, s[2:3], 0x8 3979; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3980; GFX9-NEXT: v_mov_b32_e32 v1, s5 3981; GFX9-NEXT: v_mov_b32_e32 v2, s6 3982; GFX9-NEXT: v_add_f32_e32 v1, s4, v1 3983; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 3984; GFX9-NEXT: v_max_f32_e64 v1, v1, v2 clamp 3985; GFX9-NEXT: global_store_dword v0, v1, s[0:1] offset:12 3986; GFX9-NEXT: s_endpgm 3987; 3988; GFX11-LABEL: v_clamp_diff_source_f32: 3989; GFX11: ; %bb.0: 3990; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 3991; GFX11-NEXT: v_mov_b32_e32 v2, 0 3992; GFX11-NEXT: s_waitcnt lgkmcnt(0) 3993; GFX11-NEXT: s_clause 0x1 3994; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x0 3995; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x8 3996; GFX11-NEXT: s_waitcnt lgkmcnt(0) 3997; GFX11-NEXT: v_add_f32_e64 v0, s4, s5 3998; GFX11-NEXT: v_add_f32_e64 v1, s4, s2 3999; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 4000; GFX11-NEXT: v_max_f32_e64 v0, v0, v1 clamp 4001; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] offset:12 4002; GFX11-NEXT: s_endpgm 4003; 4004; GFX12-LABEL: v_clamp_diff_source_f32: 4005; GFX12: ; %bb.0: 4006; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 4007; GFX12-NEXT: v_mov_b32_e32 v0, 0 4008; GFX12-NEXT: s_wait_kmcnt 0x0 4009; GFX12-NEXT: s_load_b96 s[4:6], s[2:3], 0x0 4010; GFX12-NEXT: s_wait_kmcnt 0x0 4011; GFX12-NEXT: s_add_f32 s2, s4, s5 4012; GFX12-NEXT: s_add_f32 s3, s4, s6 4013; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_3) 4014; GFX12-NEXT: s_max_num_f32 s2, s2, s3 4015; GFX12-NEXT: v_max_num_f32_e64 v1, s2, s2 clamp 4016; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] offset:12 4017; GFX12-NEXT: s_endpgm 4018{ 4019 %gep1 = getelementptr float, ptr addrspace(1) %aptr, i32 1 4020 %gep2 = getelementptr float, ptr addrspace(1) %aptr, i32 2 4021 %l0 = load float, ptr addrspace(1) %aptr 4022 %l1 = load float, ptr addrspace(1) %gep1 4023 %l2 = load float, ptr addrspace(1) %gep2 4024 %a = fadd nsz float 
%l0, %l1 4025 %b = fadd nsz float %l0, %l2 4026 %res = call nsz float @llvm.maxnum.f32(float %a, float %b) 4027 %max = call nsz float @llvm.maxnum.f32(float %res, float 0.0) 4028 %min = call nsz float @llvm.minnum.f32(float %max, float 1.0) 4029 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 3 4030 store float %min, ptr addrspace(1) %out.gep 4031 ret void 4032} 4033 4034declare i32 @llvm.amdgcn.workitem.id.x() #1 4035declare float @llvm.fabs.f32(float) #1 4036declare float @llvm.minnum.f32(float, float) #1 4037declare float @llvm.maxnum.f32(float, float) #1 4038declare float @llvm.amdgcn.fmed3.f32(float, float, float) #1 4039declare double @llvm.fabs.f64(double) #1 4040declare double @llvm.minnum.f64(double, double) #1 4041declare double @llvm.maxnum.f64(double, double) #1 4042declare half @llvm.fabs.f16(half) #1 4043declare half @llvm.minnum.f16(half, half) #1 4044declare half @llvm.maxnum.f16(half, half) #1 4045declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #1 4046declare <2 x half> @llvm.minnum.v2f16(<2 x half>, <2 x half>) #1 4047declare <2 x half> @llvm.maxnum.v2f16(<2 x half>, <2 x half>) #1 4048 4049attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } 4050attributes #1 = { nounwind readnone } 4051attributes #2 = { nounwind "amdgpu-dx10-clamp"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-nans-fp-math"="false" } 4052attributes #3 = { nounwind "amdgpu-dx10-clamp"="true" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-nans-fp-math"="false" } 4053attributes #4 = { nounwind "amdgpu-dx10-clamp"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-nans-fp-math"="false" } 4054