; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s

; f32 fadd whose only use is maxnum(x, 0.0) / minnum(x, 1.0). All four
; targets are expected to emit a single v_add_f32 carrying the clamp modifier.
define amdgpu_kernel void @v_clamp_add_src_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
; SI-LABEL: v_clamp_add_src_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_add_f32_e64 v2, v2, 1.0 clamp
; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; GFX8-LABEL: v_clamp_add_src_f32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v3, v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_add_f32_e64 v2, v3, 1.0 clamp
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: v_clamp_add_src_f32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_add_f32_e64 v1, v1, 1.0 clamp
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: v_clamp_add_src_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_add_f32_e64 v1, v1, 1.0 clamp
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
  %a = load float, ptr addrspace(1) %gep0
  %add = fadd float %a, 1.0
  %max = call float @llvm.maxnum.f32(float %add, float 0.0)
  %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
  store float %clamp, ptr addrspace(1) %out.gep
  ret void
}

; Same pattern, but %add has a second use (a volatile store). The expected
; output keeps a plain v_add_f32 and applies the clamp in a separate
; v_max_f32 x, x clamp, so the unclamped value can still be stored.
define amdgpu_kernel void @v_clamp_multi_use_src_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
; SI-LABEL: v_clamp_multi_use_src_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
; SI-NEXT:    v_max_f32_e64 v3, v2, v2 clamp
; SI-NEXT:    buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    buffer_store_dword v2, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; GFX8-LABEL: v_clamp_multi_use_src_f32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v3, v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_add_f32_e32 v2, 1.0, v3
; GFX8-NEXT:    v_max_f32_e64 v3, v2, v2 clamp
; GFX8-NEXT:    flat_store_dword v[0:1], v3
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: v_clamp_multi_use_src_f32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_add_f32_e32 v1, 1.0, v1
; GFX9-NEXT:    v_max_f32_e64 v2, v1, v1 clamp
; GFX9-NEXT:    global_store_dword v0, v2, s[0:1]
; GFX9-NEXT:    global_store_dword v[0:1], v1, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: v_clamp_multi_use_src_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_add_f32_e32 v1, 1.0, v1
; GFX11-NEXT:    v_max_f32_e64 v2, v1, v1 clamp
; GFX11-NEXT:    global_store_b32 v0, v2, s[0:1]
; GFX11-NEXT:    global_store_b32 v[0:1], v1, off dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
  %a = load float, ptr addrspace(1) %gep0
  %add = fadd float %a, 1.0
  %max = call float @llvm.maxnum.f32(float %add, float 0.0)
  %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
  store float %clamp, ptr addrspace(1) %out.gep
  store volatile float %add, ptr addrspace(1) undef
  ret void
}

; %add additionally feeds an llvm.dbg.value call. The expected output is the
; same single clamped v_add_f32 as in v_clamp_add_src_f32, i.e. the debug use
; does not count as an extra use.
define amdgpu_kernel void @v_clamp_dbg_use_src_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
; SI-LABEL: v_clamp_dbg_use_src_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_add_f32_e64 v2, v2, 1.0 clamp
; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; GFX8-LABEL: v_clamp_dbg_use_src_f32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v3, v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_add_f32_e64 v2, v3, 1.0 clamp
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: v_clamp_dbg_use_src_f32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_add_f32_e64 v1, v1, 1.0 clamp
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: v_clamp_dbg_use_src_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_add_f32_e64 v1, v1, 1.0 clamp
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
  %a = load float, ptr addrspace(1) %gep0
  %add = fadd float %a, 1.0
  call void @llvm.dbg.value(metadata float %add, i64 0, metadata !4, metadata !9), !dbg !10
  %max = call float @llvm.maxnum.f32(float %add, float 0.0)
  %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
  store float %clamp, ptr addrspace(1) %out.gep
  ret void
}

; The clamped value is fneg(floor(%a)) (despite the "add" in the name there
; is no fadd here). The expected output keeps v_floor_f32 unclamped and
; applies the clamp in a v_max_f32 with negated source operands.
define amdgpu_kernel void @v_clamp_add_neg_src_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
; SI-LABEL: v_clamp_add_neg_src_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_floor_f32_e32 v2, v2
; SI-NEXT:    v_max_f32_e64 v2, -v2, -v2 clamp
; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; GFX8-LABEL: v_clamp_add_neg_src_f32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v3, v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_floor_f32_e32 v2, v3
; GFX8-NEXT:    v_max_f32_e64 v2, -v2, -v2 clamp
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: v_clamp_add_neg_src_f32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_floor_f32_e32 v1, v1
; GFX9-NEXT:    v_max_f32_e64 v1, -v1, -v1 clamp
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: v_clamp_add_neg_src_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_floor_f32_e32 v1, v1
; GFX11-NEXT:    v_max_f32_e64 v1, -v1, -v1 clamp
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
  %a = load float, ptr addrspace(1) %gep0
  %floor = call float @llvm.floor.f32(float %a)
  %neg.floor = fneg float %floor
  %max = call float @llvm.maxnum.f32(float %neg.floor, float 0.0)
  %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
  store float %clamp, ptr addrspace(1) %out.gep
  ret void
}

; Negative test: only maxnum(x, 0.0) with no minnum(x, 1.0). The expected
; output is a plain v_add_f32 followed by v_max_f32 with 0, and no clamp
; modifier anywhere.
define amdgpu_kernel void @v_non_clamp_max_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
; SI-LABEL: v_non_clamp_max_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
; SI-NEXT:    v_max_f32_e32 v2, 0, v2
; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; GFX8-LABEL: v_non_clamp_max_f32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v3, v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_add_f32_e32 v2, 1.0, v3
; GFX8-NEXT:    v_max_f32_e32 v2, 0, v2
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: v_non_clamp_max_f32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_add_f32_e32 v1, 1.0, v1
; GFX9-NEXT:    v_max_f32_e32 v1, 0, v1
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: v_non_clamp_max_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_add_f32_e32 v1, 1.0, v1
; GFX11-NEXT:    v_max_f32_e32 v1, 0, v1
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
  %a = load float, ptr addrspace(1) %gep0
  %add = fadd float %a, 1.0
  %max = call float @llvm.maxnum.f32(float %add, float 0.0)
  store float %max, ptr addrspace(1) %out.gep
  ret void
}

; Same IR as v_clamp_add_src_f32 but built with attribute group #2
; (presumably the f32-denormals-enabled variant, going by the function name;
; the attribute definition is not in this part of the file). The clamp is
; still expected on the v_add_f32.
define amdgpu_kernel void @v_clamp_add_src_f32_denormals(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #2 {
; SI-LABEL: v_clamp_add_src_f32_denormals:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_add_f32_e64 v2, v2, 1.0 clamp
; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; GFX8-LABEL: v_clamp_add_src_f32_denormals:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v3, v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_add_f32_e64 v2, v3, 1.0 clamp
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: v_clamp_add_src_f32_denormals:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_add_f32_e64 v1, v1, 1.0 clamp
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: v_clamp_add_src_f32_denormals:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_add_f32_e64 v1, v1, 1.0 clamp
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
  %a = load float, ptr addrspace(1) %gep0
  %add = fadd float %a, 1.0
  %max = call float @llvm.maxnum.f32(float %add, float 0.0)
  %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
  store float %clamp, ptr addrspace(1) %out.gep
  ret void
}

; f16 version of the add+clamp pattern. The tahiti run converts to f32 and
; clamps the v_add_f32; the other targets are expected to emit a clamped
; v_add_f16.
define amdgpu_kernel void @v_clamp_add_src_f16_denorm(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
; SI-LABEL: v_clamp_add_src_f16_denorm:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_add_f32_e64 v2, v2, 1.0 clamp
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; GFX8-LABEL: v_clamp_add_src_f16_denorm:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_ushort v3, v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_add_f16_e64 v2, v3, 1.0 clamp
; GFX8-NEXT:    flat_store_short v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: v_clamp_add_src_f16_denorm:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_add_f16_e64 v1, v1, 1.0 clamp
; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: v_clamp_add_src_f16_denorm:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_add_f16_e64 v1, v1, 1.0 clamp
; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr half, ptr addrspace(1) %aptr, i32 %tid
  %out.gep = getelementptr half, ptr addrspace(1) %out, i32 %tid
  %a = load half, ptr addrspace(1) %gep0
  %add = fadd half %a, 1.0
  %max = call half @llvm.maxnum.f16(half %add, half 0.0)
  %clamp = call half @llvm.minnum.f16(half %max, half 1.0)
  store half %clamp, ptr addrspace(1) %out.gep
  ret void
}

; Same f16 IR built with attribute group #3 (presumably the f16-denormals-off
; variant, going by the function name; the attribute definition is not in
; this part of the file). Expected output matches v_clamp_add_src_f16_denorm.
define amdgpu_kernel void @v_clamp_add_src_f16_no_denormals(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #3 {
; SI-LABEL: v_clamp_add_src_f16_no_denormals:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_add_f32_e64 v2, v2, 1.0 clamp
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; GFX8-LABEL: v_clamp_add_src_f16_no_denormals:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_ushort v3, v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_add_f16_e64 v2, v3, 1.0 clamp
; GFX8-NEXT:    flat_store_short v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: v_clamp_add_src_f16_no_denormals:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_add_f16_e64 v1, v1, 1.0 clamp
; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: v_clamp_add_src_f16_no_denormals:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_add_f16_e64 v1, v1, 1.0 clamp
; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr half, ptr addrspace(1) %aptr, i32 %tid
  %out.gep = getelementptr half, ptr addrspace(1) %out, i32 %tid
  %a = load half, ptr addrspace(1) %gep0
  %add = fadd half %a, 1.0
  %max = call half @llvm.maxnum.f16(half %add, half 0.0)
  %clamp = call half @llvm.minnum.f16(half %max, half 1.0)
  store half %clamp, ptr addrspace(1) %out.gep
  ret void
}

; <2 x float> version: the expected output has one clamped v_add_f32 per
; vector element.
define amdgpu_kernel void @v_clamp_add_src_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
; SI-LABEL: v_clamp_add_src_v2f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_add_f32_e64 v2, v2, 1.0 clamp
; SI-NEXT:    v_add_f32_e64 v3, v3, 1.0 clamp
; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; GFX8-LABEL: v_clamp_add_src_v2f32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v3, s1
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_add_f32_e64 v0, v0, 1.0 clamp
; GFX8-NEXT:    v_add_f32_e64 v1, v1, 1.0 clamp
; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: v_clamp_add_src_v2f32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_add_f32_e64 v0, v0, 1.0 clamp
; GFX9-NEXT:    v_add_f32_e64 v1, v1, 1.0 clamp
; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: v_clamp_add_src_v2f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_add_f32_e64 v0, v0, 1.0 clamp
; GFX11-NEXT:    v_add_f32_e64 v1, v1, 1.0 clamp
; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr <2 x float>, ptr addrspace(1) %aptr, i32 %tid
  %out.gep = getelementptr <2 x float>, ptr addrspace(1) %out, i32 %tid
  %a = load <2 x float>, ptr addrspace(1) %gep0
  %add = fadd <2 x float> %a, <float 1.0, float 1.0>
  %max = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %add, <2 x float> zeroinitializer)
  %clamp = call <2 x float> @llvm.minnum.v2f32(<2 x float> %max, <2 x float> <float 1.0, float 1.0>)
  store <2 x float> %clamp, ptr addrspace(1) %out.gep
  ret void
}

; f64 version: the expected output is a single clamped v_add_f64.
define amdgpu_kernel void @v_clamp_add_src_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
; SI-LABEL: v_clamp_add_src_f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_add_f64 v[2:3], v[2:3], 1.0 clamp
; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; GFX8-LABEL: v_clamp_add_src_f64:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v3, s1
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0 clamp
; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: v_clamp_add_src_f64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0 clamp
; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: v_clamp_add_src_f64:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0 clamp
; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr double, ptr addrspace(1) %aptr, i32 %tid
  %out.gep = getelementptr double, ptr addrspace(1) %out, i32 %tid
  %a = load double, ptr addrspace(1) %gep0
  %add = fadd double %a, 1.0
  %max = call double @llvm.maxnum.f64(double %add, double 0.0)
  %clamp = call double @llvm.minnum.f64(double %max, double 1.0)
  store double %clamp, ptr addrspace(1) %out.gep
  ret void
}

; Clamp of (a * a + b) where b is reused by a following fadd. The tahiti,
; fiji and gfx900 runs expect a three-address v_mad_f32 with clamp; the
; gfx1100 run expects a separate v_mul_f32 and a clamped v_add_f32.
define amdgpu_kernel void @v_clamp_mac_to_mad(ptr addrspace(1) %out, ptr addrspace(1) %aptr, float %a) #0 {
; SI-LABEL: v_clamp_mac_to_mad:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_load_dword s8, s[4:5], 0xd
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_mad_f32 v3, s8, s8, v2 clamp
; SI-NEXT:    v_add_f32_e32 v2, v3, v2
; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; GFX8-LABEL: v_clamp_mac_to_mad:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    s_load_dword s4, s[4:5], 0x34
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v3, v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_mad_f32 v2, s4, s4, v3 clamp
; GFX8-NEXT:    v_add_f32_e32 v2, v2, v3
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: v_clamp_mac_to_mad:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mad_f32 v2, s6, s6, v1 clamp
; GFX9-NEXT:    v_add_f32_e32 v1, v2, v1
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: v_clamp_mac_to_mad:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x34
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT:    v_mul_f32_e64 v2, s4, s4
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_add_f32_e64 v2, v2, v1 clamp
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add_f32_e32 v1, v2, v1
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
  %b = load float, ptr addrspace(1) %gep0

  %mul = fmul float %a, %a
  %add = fadd float %mul, %b
  %max = call float @llvm.maxnum.f32(float %add, float 0.0)
  %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
  %res = fadd float %clamp, %b
  store float %res, ptr addrspace(1) %out.gep
  ret void
}

; <2 x half> version. The tahiti run converts each half to f32 and clamps two
; v_add_f32; fiji clamps a v_add_f16 plus an SDWA v_add_f16 for the high
; half; gfx900 and gfx1100 expect a single clamped v_pk_add_f16.
define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
; SI-LABEL: v_clamp_add_src_v2f16_denorm:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_add_f32_e64 v3, v3, 1.0 clamp
; SI-NEXT:    v_add_f32_e64 v2, v2, 1.0 clamp
; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT:    v_or_b32_e32 v2, v2, v3
; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; GFX8-LABEL: v_clamp_add_src_v2f16_denorm:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT:    v_mov_b32_e32 v4, 0x3c00
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v3, v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_add_f16_e64 v2, v3, 1.0 clamp
; GFX8-NEXT:    v_add_f16_sdwa v3, v3, v4 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT:    v_or_b32_e32 v2, v2, v3
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: v_clamp_add_src_v2f16_denorm:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] clamp
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: v_clamp_add_src_v2f16_denorm:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] clamp
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid
  %out.gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid
  %a = load <2 x half>, ptr addrspace(1) %gep0
  %add = fadd <2 x half> %a, <half 1.0, half 1.0>
  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %add, <2 x half> zeroinitializer)
  %clamp = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
  store <2 x half> %clamp, ptr addrspace(1) %out.gep
  ret void
}

; Same <2 x half> IR built with attribute group #3 (presumably the
; f16-denormals-off variant, going by the function name; the attribute
; definition is not in this part of the file). Expected output matches
; v_clamp_add_src_v2f16_denorm.
define amdgpu_kernel void @v_clamp_add_src_v2f16_no_denormals(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #3 {
; SI-LABEL: v_clamp_add_src_v2f16_no_denormals:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_add_f32_e64 v3, v3, 1.0 clamp
; SI-NEXT:    v_add_f32_e64 v2, v2, 1.0 clamp
; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT:    v_or_b32_e32 v2, v2, v3
; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; GFX8-LABEL: v_clamp_add_src_v2f16_no_denormals:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT:    v_mov_b32_e32 v4, 0x3c00
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v3, v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_add_f16_e64 v2, v3, 1.0 clamp
; GFX8-NEXT:    v_add_f16_sdwa v3, v3, v4 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT:    v_or_b32_e32 v2, v2, v3
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: v_clamp_add_src_v2f16_no_denormals:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] clamp
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: v_clamp_add_src_v2f16_no_denormals:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] clamp
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid
  %out.gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid
  %a = load <2 x half>, ptr addrspace(1) %gep0
  %add = fadd <2 x half> %a, <half 1.0, half 1.0>
  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %add, <2 x half> zeroinitializer)
  %clamp = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
  store <2 x half> %clamp, ptr addrspace(1) %out.gep
  ret void
}

define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
; SI-LABEL: v_clamp_add_src_v2f16_denorm_neg:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
;
SI-NEXT: s_mov_b64 s[4:5], s[2:3] 967; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 968; SI-NEXT: s_mov_b64 s[2:3], s[6:7] 969; SI-NEXT: s_waitcnt vmcnt(0) 970; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 971; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 972; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 973; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 974; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 975; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 976; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 977; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 978; SI-NEXT: v_or_b32_e32 v2, v2, v3 979; SI-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 980; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 981; SI-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp 982; SI-NEXT: v_cvt_f32_f16_e64 v2, v2 clamp 983; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 984; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 985; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 986; SI-NEXT: v_or_b32_e32 v2, v2, v3 987; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 988; SI-NEXT: s_endpgm 989; 990; GFX8-LABEL: v_clamp_add_src_v2f16_denorm_neg: 991; GFX8: ; %bb.0: 992; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 993; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 994; GFX8-NEXT: v_mov_b32_e32 v4, 0x3c00 995; GFX8-NEXT: s_waitcnt lgkmcnt(0) 996; GFX8-NEXT: v_mov_b32_e32 v1, s3 997; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 998; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 999; GFX8-NEXT: flat_load_dword v3, v[0:1] 1000; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 1001; GFX8-NEXT: v_mov_b32_e32 v1, s1 1002; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1003; GFX8-NEXT: s_waitcnt vmcnt(0) 1004; GFX8-NEXT: v_add_f16_e32 v2, 1.0, v3 1005; GFX8-NEXT: v_add_f16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 1006; GFX8-NEXT: v_max_f16_sdwa v3, -v3, -v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 1007; GFX8-NEXT: v_max_f16_e64 v2, -v2, -v2 clamp 1008; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 1009; GFX8-NEXT: flat_store_dword v[0:1], v2 1010; GFX8-NEXT: s_endpgm 1011; 1012; 
GFX9-LABEL: v_clamp_add_src_v2f16_denorm_neg: 1013; GFX9: ; %bb.0: 1014; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1015; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1016; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1017; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 1018; GFX9-NEXT: s_waitcnt vmcnt(0) 1019; GFX9-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] 1020; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp 1021; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1022; GFX9-NEXT: s_endpgm 1023; 1024; GFX11-LABEL: v_clamp_add_src_v2f16_denorm_neg: 1025; GFX11: ; %bb.0: 1026; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1027; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1028; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) 1029; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1030; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1031; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 1032; GFX11-NEXT: s_waitcnt vmcnt(0) 1033; GFX11-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] 1034; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp 1035; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1036; GFX11-NEXT: s_endpgm 1037 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1038 %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid 1039 %out.gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid 1040 %a = load <2 x half>, ptr addrspace(1) %gep0 1041 %add = fadd <2 x half> %a, <half 1.0, half 1.0> 1042 %neg.add = fsub <2 x half> <half -0.0, half -0.0>, %add 1043 %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %neg.add, <2 x half> zeroinitializer) 1044 %clamp = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>) 1045 store <2 x half> %clamp, ptr addrspace(1) %out.gep 1046 ret void 1047} 1048 1049define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_lo(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { 1050; SI-LABEL: v_clamp_add_src_v2f16_denorm_neg_lo: 1051; SI: ; %bb.0: 1052; SI-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x9 1053; SI-NEXT: s_mov_b32 s7, 0xf000 1054; SI-NEXT: s_mov_b32 s6, 0 1055; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1056; SI-NEXT: v_mov_b32_e32 v1, 0 1057; SI-NEXT: s_waitcnt lgkmcnt(0) 1058; SI-NEXT: s_mov_b64 s[4:5], s[2:3] 1059; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 1060; SI-NEXT: s_mov_b64 s[2:3], s[6:7] 1061; SI-NEXT: s_waitcnt vmcnt(0) 1062; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 1063; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 1064; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 1065; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 1066; SI-NEXT: v_add_f32_e64 v3, v3, 1.0 clamp 1067; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 1068; SI-NEXT: v_max_f32_e64 v2, -v2, -v2 clamp 1069; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 1070; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 1071; SI-NEXT: v_or_b32_e32 v2, v2, v3 1072; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 1073; SI-NEXT: s_endpgm 1074; 1075; GFX8-LABEL: v_clamp_add_src_v2f16_denorm_neg_lo: 1076; GFX8: ; %bb.0: 1077; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1078; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1079; GFX8-NEXT: v_mov_b32_e32 v4, 0x3c00 1080; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1081; GFX8-NEXT: v_mov_b32_e32 v1, s3 1082; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1083; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1084; GFX8-NEXT: flat_load_dword v3, v[0:1] 1085; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 1086; GFX8-NEXT: v_mov_b32_e32 v1, s1 1087; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1088; GFX8-NEXT: s_waitcnt vmcnt(0) 1089; GFX8-NEXT: v_add_f16_e32 v2, 1.0, v3 1090; GFX8-NEXT: v_add_f16_sdwa v3, v3, v4 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 1091; GFX8-NEXT: v_max_f16_e64 v2, -v2, -v2 clamp 1092; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 1093; GFX8-NEXT: flat_store_dword v[0:1], v2 1094; GFX8-NEXT: s_endpgm 1095; 1096; GFX9-LABEL: v_clamp_add_src_v2f16_denorm_neg_lo: 1097; GFX9: ; %bb.0: 1098; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1099; GFX9-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 1100; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1101; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 1102; GFX9-NEXT: s_waitcnt vmcnt(0) 1103; GFX9-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] 1104; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] clamp 1105; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1106; GFX9-NEXT: s_endpgm 1107; 1108; GFX11-LABEL: v_clamp_add_src_v2f16_denorm_neg_lo: 1109; GFX11: ; %bb.0: 1110; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1111; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1112; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) 1113; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1114; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1115; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 1116; GFX11-NEXT: s_waitcnt vmcnt(0) 1117; GFX11-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] 1118; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] clamp 1119; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1120; GFX11-NEXT: s_endpgm 1121 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1122 %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid 1123 %out.gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid 1124 %a = load <2 x half>, ptr addrspace(1) %gep0 1125 %add = fadd <2 x half> %a, <half 1.0, half 1.0> 1126 %lo = extractelement <2 x half> %add, i32 0 1127 %neg.lo = fsub half -0.0, %lo 1128 %neg.lo.add = insertelement <2 x half> %add, half %neg.lo, i32 0 1129 %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %neg.lo.add, <2 x half> zeroinitializer) 1130 %clamp = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>) 1131 store <2 x half> %clamp, ptr addrspace(1) %out.gep 1132 ret void 1133} 1134 1135define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_hi(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { 1136; SI-LABEL: v_clamp_add_src_v2f16_denorm_neg_hi: 1137; SI: ; %bb.0: 1138; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1139; SI-NEXT: s_mov_b32 s7, 0xf000 
1140; SI-NEXT: s_mov_b32 s6, 0 1141; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1142; SI-NEXT: v_mov_b32_e32 v1, 0 1143; SI-NEXT: s_waitcnt lgkmcnt(0) 1144; SI-NEXT: s_mov_b64 s[4:5], s[2:3] 1145; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 1146; SI-NEXT: s_mov_b64 s[2:3], s[6:7] 1147; SI-NEXT: s_waitcnt vmcnt(0) 1148; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 1149; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 1150; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 1151; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 1152; SI-NEXT: v_max_f32_e64 v3, -v3, -v3 clamp 1153; SI-NEXT: v_add_f32_e64 v2, v2, 1.0 clamp 1154; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 1155; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 1156; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 1157; SI-NEXT: v_or_b32_e32 v2, v2, v3 1158; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 1159; SI-NEXT: s_endpgm 1160; 1161; GFX8-LABEL: v_clamp_add_src_v2f16_denorm_neg_hi: 1162; GFX8: ; %bb.0: 1163; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1164; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1165; GFX8-NEXT: v_mov_b32_e32 v4, 0x3c00 1166; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1167; GFX8-NEXT: v_mov_b32_e32 v1, s3 1168; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1169; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1170; GFX8-NEXT: flat_load_dword v3, v[0:1] 1171; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 1172; GFX8-NEXT: v_mov_b32_e32 v1, s1 1173; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1174; GFX8-NEXT: s_waitcnt vmcnt(0) 1175; GFX8-NEXT: v_add_f16_e64 v2, v3, 1.0 clamp 1176; GFX8-NEXT: v_add_f16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 1177; GFX8-NEXT: v_max_f16_sdwa v3, -v3, -v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 1178; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 1179; GFX8-NEXT: flat_store_dword v[0:1], v2 1180; GFX8-NEXT: s_endpgm 1181; 1182; GFX9-LABEL: v_clamp_add_src_v2f16_denorm_neg_hi: 1183; GFX9: ; %bb.0: 1184; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1185; GFX9-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 1186; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1187; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 1188; GFX9-NEXT: s_waitcnt vmcnt(0) 1189; GFX9-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] 1190; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 neg_hi:[1,1] clamp 1191; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1192; GFX9-NEXT: s_endpgm 1193; 1194; GFX11-LABEL: v_clamp_add_src_v2f16_denorm_neg_hi: 1195; GFX11: ; %bb.0: 1196; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1197; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1198; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) 1199; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1200; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1201; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 1202; GFX11-NEXT: s_waitcnt vmcnt(0) 1203; GFX11-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] 1204; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_hi:[1,1] clamp 1205; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1206; GFX11-NEXT: s_endpgm 1207 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1208 %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid 1209 %out.gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid 1210 %a = load <2 x half>, ptr addrspace(1) %gep0 1211 %add = fadd <2 x half> %a, <half 1.0, half 1.0> 1212 %hi = extractelement <2 x half> %add, i32 1 1213 %neg.hi = fsub half -0.0, %hi 1214 %neg.hi.add = insertelement <2 x half> %add, half %neg.hi, i32 1 1215 %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %neg.hi.add, <2 x half> zeroinitializer) 1216 %clamp = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>) 1217 store <2 x half> %clamp, ptr addrspace(1) %out.gep 1218 ret void 1219} 1220 1221define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_shuf(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { 1222; SI-LABEL: v_clamp_add_src_v2f16_denorm_shuf: 1223; SI: ; %bb.0: 1224; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1225; SI-NEXT: s_mov_b32 s7, 0xf000 1226; 
SI-NEXT: s_mov_b32 s6, 0 1227; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1228; SI-NEXT: v_mov_b32_e32 v1, 0 1229; SI-NEXT: s_waitcnt lgkmcnt(0) 1230; SI-NEXT: s_mov_b64 s[4:5], s[2:3] 1231; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 1232; SI-NEXT: s_mov_b64 s[2:3], s[6:7] 1233; SI-NEXT: s_waitcnt vmcnt(0) 1234; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 1235; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 1236; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 1237; SI-NEXT: v_add_f32_e64 v3, v3, 1.0 clamp 1238; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 1239; SI-NEXT: v_add_f32_e64 v2, v2, 1.0 clamp 1240; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 1241; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 1242; SI-NEXT: v_or_b32_e32 v2, v2, v3 1243; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 1244; SI-NEXT: s_endpgm 1245; 1246; GFX8-LABEL: v_clamp_add_src_v2f16_denorm_shuf: 1247; GFX8: ; %bb.0: 1248; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1249; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1250; GFX8-NEXT: v_mov_b32_e32 v4, 0x3c00 1251; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1252; GFX8-NEXT: v_mov_b32_e32 v1, s3 1253; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1254; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1255; GFX8-NEXT: flat_load_dword v3, v[0:1] 1256; GFX8-NEXT: v_mov_b32_e32 v1, s1 1257; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 1258; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1259; GFX8-NEXT: s_waitcnt vmcnt(0) 1260; GFX8-NEXT: v_add_f16_sdwa v2, v3, v4 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 1261; GFX8-NEXT: v_add_f16_sdwa v3, v3, v4 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 1262; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 1263; GFX8-NEXT: flat_store_dword v[0:1], v2 1264; GFX8-NEXT: s_endpgm 1265; 1266; GFX9-LABEL: v_clamp_add_src_v2f16_denorm_shuf: 1267; GFX9: ; %bb.0: 1268; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1269; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1270; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1271; GFX9-NEXT: global_load_dword v1, v0, 
s[2:3] 1272; GFX9-NEXT: s_waitcnt vmcnt(0) 1273; GFX9-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] 1274; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 op_sel:[1,1] op_sel_hi:[0,0] clamp 1275; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1276; GFX9-NEXT: s_endpgm 1277; 1278; GFX11-LABEL: v_clamp_add_src_v2f16_denorm_shuf: 1279; GFX11: ; %bb.0: 1280; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1281; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1282; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) 1283; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1284; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1285; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 1286; GFX11-NEXT: s_waitcnt vmcnt(0) 1287; GFX11-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] 1288; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 op_sel:[1,1] op_sel_hi:[0,0] clamp 1289; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1290; GFX11-NEXT: s_endpgm 1291 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1292 %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid 1293 %out.gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid 1294 %a = load <2 x half>, ptr addrspace(1) %gep0 1295 %add = fadd <2 x half> %a, <half 1.0, half 1.0> 1296 %shuf = shufflevector <2 x half> %add, <2 x half> undef, <2 x i32> <i32 1, i32 0> 1297 1298 %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %shuf, <2 x half> zeroinitializer) 1299 %clamp = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>) 1300 store <2 x half> %clamp, ptr addrspace(1) %out.gep 1301 ret void 1302} 1303 1304define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f32_src(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { 1305; SI-LABEL: v_no_clamp_add_src_v2f16_f32_src: 1306; SI: ; %bb.0: 1307; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1308; SI-NEXT: s_mov_b32 s7, 0xf000 1309; SI-NEXT: s_mov_b32 s6, 0 1310; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1311; SI-NEXT: v_mov_b32_e32 v1, 0 1312; SI-NEXT: s_waitcnt lgkmcnt(0) 1313; 
SI-NEXT: s_mov_b64 s[4:5], s[2:3] 1314; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 1315; SI-NEXT: s_mov_b64 s[2:3], s[6:7] 1316; SI-NEXT: s_waitcnt vmcnt(0) 1317; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 1318; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 1319; SI-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp 1320; SI-NEXT: v_cvt_f32_f16_e64 v2, v2 clamp 1321; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 1322; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 1323; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 1324; SI-NEXT: v_or_b32_e32 v2, v2, v3 1325; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 1326; SI-NEXT: s_endpgm 1327; 1328; GFX8-LABEL: v_no_clamp_add_src_v2f16_f32_src: 1329; GFX8: ; %bb.0: 1330; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1331; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1332; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1333; GFX8-NEXT: v_mov_b32_e32 v1, s3 1334; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1335; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1336; GFX8-NEXT: flat_load_dword v3, v[0:1] 1337; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 1338; GFX8-NEXT: v_mov_b32_e32 v1, s1 1339; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1340; GFX8-NEXT: s_waitcnt vmcnt(0) 1341; GFX8-NEXT: v_add_f32_e32 v2, 1.0, v3 1342; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 1343; GFX8-NEXT: v_max_f16_e64 v2, v2, v2 clamp 1344; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 1345; GFX8-NEXT: flat_store_dword v[0:1], v2 1346; GFX8-NEXT: s_endpgm 1347; 1348; GFX9-LABEL: v_no_clamp_add_src_v2f16_f32_src: 1349; GFX9: ; %bb.0: 1350; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1351; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1352; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1353; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 1354; GFX9-NEXT: s_waitcnt vmcnt(0) 1355; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 1356; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 clamp 1357; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1358; GFX9-NEXT: s_endpgm 1359; 1360; GFX11-LABEL: 
v_no_clamp_add_src_v2f16_f32_src: 1361; GFX11: ; %bb.0: 1362; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1363; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1364; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) 1365; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1366; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1367; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 1368; GFX11-NEXT: s_waitcnt vmcnt(0) 1369; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1 1370; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 clamp 1371; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1372; GFX11-NEXT: s_endpgm 1373 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1374 %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid 1375 %out.gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid 1376 %a = load <2 x half>, ptr addrspace(1) %gep0 1377 %bc = bitcast <2 x half> %a to float 1378 %f32.op = fadd float %bc, 1.0 1379 %f32.op.cast = bitcast float %f32.op to <2 x half> 1380 %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %f32.op.cast, <2 x half> zeroinitializer) 1381 %clamp = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>) 1382 store <2 x half> %clamp, ptr addrspace(1) %out.gep 1383 ret void 1384} 1385 1386define amdgpu_kernel void @v_no_clamp_add_packed_src_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { 1387; SI-LABEL: v_no_clamp_add_packed_src_f32: 1388; SI: ; %bb.0: 1389; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1390; SI-NEXT: s_mov_b32 s7, 0xf000 1391; SI-NEXT: s_mov_b32 s6, 0 1392; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1393; SI-NEXT: v_mov_b32_e32 v1, 0 1394; SI-NEXT: s_waitcnt lgkmcnt(0) 1395; SI-NEXT: s_mov_b64 s[4:5], s[2:3] 1396; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 1397; SI-NEXT: s_mov_b64 s[2:3], s[6:7] 1398; SI-NEXT: s_waitcnt vmcnt(0) 1399; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 1400; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 1401; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 1402; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 1403; 
SI-NEXT: v_cvt_f16_f32_e32 v3, v3 1404; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 1405; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 1406; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 1407; SI-NEXT: v_or_b32_e32 v2, v2, v3 1408; SI-NEXT: v_max_f32_e64 v2, v2, v2 clamp 1409; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 1410; SI-NEXT: s_endpgm 1411; 1412; GFX8-LABEL: v_no_clamp_add_packed_src_f32: 1413; GFX8: ; %bb.0: 1414; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1415; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1416; GFX8-NEXT: v_mov_b32_e32 v4, 0x3c00 1417; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1418; GFX8-NEXT: v_mov_b32_e32 v1, s3 1419; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1420; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1421; GFX8-NEXT: flat_load_dword v3, v[0:1] 1422; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 1423; GFX8-NEXT: v_mov_b32_e32 v1, s1 1424; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1425; GFX8-NEXT: s_waitcnt vmcnt(0) 1426; GFX8-NEXT: v_add_f16_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 1427; GFX8-NEXT: v_add_f16_e32 v3, 1.0, v3 1428; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 1429; GFX8-NEXT: v_max_f32_e64 v2, v2, v2 clamp 1430; GFX8-NEXT: flat_store_dword v[0:1], v2 1431; GFX8-NEXT: s_endpgm 1432; 1433; GFX9-LABEL: v_no_clamp_add_packed_src_f32: 1434; GFX9: ; %bb.0: 1435; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1436; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1437; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1438; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 1439; GFX9-NEXT: s_waitcnt vmcnt(0) 1440; GFX9-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] 1441; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp 1442; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1443; GFX9-NEXT: s_endpgm 1444; 1445; GFX11-LABEL: v_no_clamp_add_packed_src_f32: 1446; GFX11: ; %bb.0: 1447; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1448; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1449; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) 
1450; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1451; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1452; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 1453; GFX11-NEXT: s_waitcnt vmcnt(0) 1454; GFX11-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] 1455; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp 1456; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1457; GFX11-NEXT: s_endpgm 1458 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1459 %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid 1460 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid 1461 %a = load <2 x half>, ptr addrspace(1) %gep0 1462 %add = fadd <2 x half> %a, <half 1.0, half 1.0> 1463 %bc.add = bitcast <2 x half> %add to float 1464 %max = call float @llvm.maxnum.f32(float %bc.add, float 0.0) 1465 %clamp = call float @llvm.minnum.f32(float %max, float 1.0) 1466 store float %clamp, ptr addrspace(1) %out.gep 1467 ret void 1468} 1469 1470; Since the high bits are zeroed, it probably would be OK in this case 1471; to use clamp. 1472define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f16_src(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { 1473; SI-LABEL: v_no_clamp_add_src_v2f16_f16_src: 1474; SI: ; %bb.0: 1475; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1476; SI-NEXT: s_mov_b32 s7, 0xf000 1477; SI-NEXT: s_mov_b32 s6, 0 1478; SI-NEXT: v_lshlrev_b32_e32 v1, 1, v0 1479; SI-NEXT: v_mov_b32_e32 v2, 0 1480; SI-NEXT: s_waitcnt lgkmcnt(0) 1481; SI-NEXT: s_mov_b64 s[4:5], s[2:3] 1482; SI-NEXT: buffer_load_ushort v1, v[1:2], s[4:7], 0 addr64 1483; SI-NEXT: v_cvt_f16_f32_e32 v3, 0 1484; SI-NEXT: s_mov_b64 s[2:3], s[6:7] 1485; SI-NEXT: s_waitcnt vmcnt(0) 1486; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 1487; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 1488; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 1489; SI-NEXT: v_cvt_f32_f16_e64 v1, v1 clamp 1490; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 1491; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 1492; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 1493; SI-NEXT: v_or_b32_e32 v0, v4, v0 1494; SI-NEXT: buffer_store_dword 
v0, v[1:2], s[0:3], 0 addr64 1495; SI-NEXT: s_endpgm 1496; 1497; GFX8-LABEL: v_no_clamp_add_src_v2f16_f16_src: 1498; GFX8: ; %bb.0: 1499; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1500; GFX8-NEXT: v_lshlrev_b32_e32 v1, 1, v0 1501; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1502; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1503; GFX8-NEXT: v_mov_b32_e32 v2, s3 1504; GFX8-NEXT: v_add_u32_e32 v1, vcc, s2, v1 1505; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc 1506; GFX8-NEXT: flat_load_ushort v2, v[1:2] 1507; GFX8-NEXT: v_mov_b32_e32 v1, s1 1508; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 1509; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1510; GFX8-NEXT: s_waitcnt vmcnt(0) 1511; GFX8-NEXT: v_add_f16_e64 v2, v2, 1.0 clamp 1512; GFX8-NEXT: flat_store_dword v[0:1], v2 1513; GFX8-NEXT: s_endpgm 1514; 1515; GFX9-LABEL: v_no_clamp_add_src_v2f16_f16_src: 1516; GFX9: ; %bb.0: 1517; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1518; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0 1519; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1520; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1521; GFX9-NEXT: global_load_ushort v1, v1, s[2:3] 1522; GFX9-NEXT: s_waitcnt vmcnt(0) 1523; GFX9-NEXT: v_add_f16_e32 v1, 1.0, v1 1524; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 clamp 1525; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1526; GFX9-NEXT: s_endpgm 1527; 1528; GFX11-LABEL: v_no_clamp_add_src_v2f16_f16_src: 1529; GFX11: ; %bb.0: 1530; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1531; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1532; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1533; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v0 1534; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1535; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1536; GFX11-NEXT: global_load_u16 v1, v1, s[2:3] 1537; GFX11-NEXT: s_waitcnt vmcnt(0) 1538; GFX11-NEXT: v_add_f16_e32 v1, 1.0, v1 1539; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1540; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 1541; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 clamp 1542; GFX11-NEXT: 
global_store_b32 v0, v1, s[0:1] 1543; GFX11-NEXT: s_endpgm 1544 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1545 %gep0 = getelementptr half, ptr addrspace(1) %aptr, i32 %tid 1546 %out.gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid 1547 %a = load half, ptr addrspace(1) %gep0 1548 %add = fadd half %a, 1.0 1549 %bc = bitcast half %add to i16 1550 %zext = zext i16 %bc to i32 1551 %v2f16 = bitcast i32 %zext to <2 x half> 1552 %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %v2f16, <2 x half> zeroinitializer) 1553 %clamp = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>) 1554 store <2 x half> %clamp, ptr addrspace(1) %out.gep 1555 ret void 1556} 1557 1558; FIXME: Worse code pre-gfx9 1559 1560define <2 x half> @v_clamp_cvt_pkrtz_src_v2f16_denorm(float %a, float %b) #0 { 1561; SI-LABEL: v_clamp_cvt_pkrtz_src_v2f16_denorm: 1562; SI: ; %bb.0: 1563; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1564; SI-NEXT: v_cvt_pkrtz_f16_f32_e32 v0, v0, v1 1565; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 1566; SI-NEXT: v_cvt_f32_f16_e64 v0, v0 clamp 1567; SI-NEXT: v_cvt_f32_f16_e64 v1, v1 clamp 1568; SI-NEXT: s_setpc_b64 s[30:31] 1569; 1570; GFX8-LABEL: v_clamp_cvt_pkrtz_src_v2f16_denorm: 1571; GFX8: ; %bb.0: 1572; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1573; GFX8-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, v1 1574; GFX8-NEXT: v_max_f16_sdwa v1, v0, v0 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 1575; GFX8-NEXT: v_max_f16_e64 v0, v0, v0 clamp 1576; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 1577; GFX8-NEXT: s_setpc_b64 s[30:31] 1578; 1579; GFX9-LABEL: v_clamp_cvt_pkrtz_src_v2f16_denorm: 1580; GFX9: ; %bb.0: 1581; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1582; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, v1 clamp 1583; GFX9-NEXT: s_setpc_b64 s[30:31] 1584; 1585; GFX11-LABEL: v_clamp_cvt_pkrtz_src_v2f16_denorm: 1586; GFX11: ; %bb.0: 1587; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1588; 
GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v0, v0, v1 clamp 1589; GFX11-NEXT: s_setpc_b64 s[30:31] 1590 %add = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float %b) 1591 %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %add, <2 x half> zeroinitializer) 1592 %clamp = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>) 1593 ret <2 x half> %clamp 1594} 1595 1596declare i32 @llvm.amdgcn.workitem.id.x() #1 1597declare float @llvm.fabs.f32(float) #1 1598declare float @llvm.floor.f32(float) #1 1599declare float @llvm.minnum.f32(float, float) #1 1600declare float @llvm.maxnum.f32(float, float) #1 1601declare float @llvm.amdgcn.fmed3.f32(float, float, float) #1 1602declare double @llvm.fabs.f64(double) #1 1603declare double @llvm.minnum.f64(double, double) #1 1604declare double @llvm.maxnum.f64(double, double) #1 1605declare half @llvm.fabs.f16(half) #1 1606declare half @llvm.minnum.f16(half, half) #1 1607declare half @llvm.maxnum.f16(half, half) #1 1608declare <2 x half> @llvm.minnum.v2f16(<2 x half>, <2 x half>) #1 1609declare <2 x half> @llvm.maxnum.v2f16(<2 x half>, <2 x half>) #1 1610declare <2 x float> @llvm.minnum.v2f32(<2 x float>, <2 x float>) #1 1611declare <2 x float> @llvm.maxnum.v2f32(<2 x float>, <2 x float>) #1 1612declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1 1613 1614 1615declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1 1616 1617attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } 1618attributes #1 = { nounwind readnone } 1619attributes #2 = { nounwind "denormal-fp-math-f32"="ieee,ieee" } 1620attributes #3 = { nounwind "denormal-fp-math-f32"="ieee,ieee" "denormal-fp-math"="preserve-sign,preserve-sign" } 1621 1622!llvm.dbg.cu = !{!0} 1623!llvm.module.flags = !{!2, !3} 1624 1625!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug) 1626!1 = !DIFile(filename: "/tmp/foo.cl", directory: "/dev/null") 
; Module flags: the two nodes listed by !llvm.module.flags.
!2 = !{i32 2, !"Dwarf Version", i32 4}
!3 = !{i32 2, !"Debug Info Version", i32 3}

; Debug-info variable "add": argument 1 in the scope of subprogram !5.
!4 = !DILocalVariable(name: "add", arg: 1, scope: !5, file: !1, line: 1)

; Subprogram "foo", located in file !1 and attached to compile unit !0.
!5 = distinct !DISubprogram(
  name: "foo", scope: !1, file: !1, line: 1, type: !6,
  isLocal: false, isDefinition: true, scopeLine: 2,
  flags: DIFlagPrototyped, isOptimized: true, unit: !0)

; Type of !5: a subroutine type whose type list !7 is { null, !8 },
; where !8 is a 32-bit basic type named "float".
!6 = !DISubroutineType(types: !7)
!7 = !{null, !8}
!8 = !DIBasicType(name: "float", size: 32, align: 32)

; An empty expression and a source location (line 1, column 42) in scope !5.
; NOTE(review): the users of !4, !9 and !10 are not visible in this part of
; the file -- presumably an llvm.dbg.value call in an earlier test; confirm.
!9 = !DIExpression()
!10 = !DILocation(line: 1, column: 42, scope: !5)