1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s 3; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10 %s 4; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11 %s 5; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12 %s 6 7; Test using saddr addressing mode of global_* flat atomic instructions. 8 9; -------------------------------------------------------------------------------- 10; atomicrmw max 11; -------------------------------------------------------------------------------- 12 13define amdgpu_ps float @global_max_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) { 14; GFX9-LABEL: global_max_saddr_i32_rtn: 15; GFX9: ; %bb.0: 16; GFX9-NEXT: v_mov_b32_e32 v2, v0 17; GFX9-NEXT: global_load_dword v0, v0, s[2:3] 18; GFX9-NEXT: v_mov_b32_e32 v3, s3 19; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2 20; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 21; GFX9-NEXT: s_mov_b64 s[0:1], 0 22; GFX9-NEXT: .LBB0_1: ; %atomicrmw.start 23; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 24; GFX9-NEXT: s_waitcnt vmcnt(0) 25; GFX9-NEXT: v_mov_b32_e32 v5, v0 26; GFX9-NEXT: v_max_i32_e32 v4, v5, v1 27; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc 28; GFX9-NEXT: s_waitcnt vmcnt(0) 29; GFX9-NEXT: buffer_wbinvl1 30; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 31; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 32; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] 33; GFX9-NEXT: s_cbranch_execnz .LBB0_1 34; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 35; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 36; GFX9-NEXT: ; return to shader part epilog 37; 38; GFX10-LABEL: global_max_saddr_i32_rtn: 39; GFX10: ; %bb.0: 40; GFX10-NEXT: v_mov_b32_e32 v2, v0 41; GFX10-NEXT: 
global_load_dword v0, v0, s[2:3] 42; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 43; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] 44; GFX10-NEXT: s_mov_b64 s[0:1], 0 45; GFX10-NEXT: .LBB0_1: ; %atomicrmw.start 46; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 47; GFX10-NEXT: s_waitcnt vmcnt(0) 48; GFX10-NEXT: v_mov_b32_e32 v5, v0 49; GFX10-NEXT: v_max_i32_e32 v4, v5, v1 50; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc 51; GFX10-NEXT: s_waitcnt vmcnt(0) 52; GFX10-NEXT: buffer_gl1_inv 53; GFX10-NEXT: buffer_gl0_inv 54; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 55; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 56; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] 57; GFX10-NEXT: s_cbranch_execnz .LBB0_1 58; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 59; GFX10-NEXT: s_or_b64 exec, exec, s[0:1] 60; GFX10-NEXT: ; return to shader part epilog 61; 62; GFX11-LABEL: global_max_saddr_i32_rtn: 63; GFX11: ; %bb.0: 64; GFX11-NEXT: v_mov_b32_e32 v2, v0 65; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] 66; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 67; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 68; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] 69; GFX11-NEXT: s_mov_b64 s[0:1], 0 70; GFX11-NEXT: s_waitcnt_depctr 0xfffe 71; GFX11-NEXT: .LBB0_1: ; %atomicrmw.start 72; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 73; GFX11-NEXT: s_waitcnt vmcnt(0) 74; GFX11-NEXT: v_mov_b32_e32 v5, v0 75; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 76; GFX11-NEXT: v_max_i32_e32 v4, v5, v1 77; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off glc 78; GFX11-NEXT: s_waitcnt vmcnt(0) 79; GFX11-NEXT: buffer_gl1_inv 80; GFX11-NEXT: buffer_gl0_inv 81; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 82; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 83; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 84; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] 85; GFX11-NEXT: s_cbranch_execnz .LBB0_1 86; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 87; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] 88; 
GFX11-NEXT: ; return to shader part epilog 89; 90; GFX12-LABEL: global_max_saddr_i32_rtn: 91; GFX12: ; %bb.0: 92; GFX12-NEXT: v_mov_b32_e32 v2, v0 93; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] 94; GFX12-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 95; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 96; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] 97; GFX12-NEXT: s_mov_b64 s[0:1], 0 98; GFX12-NEXT: .LBB0_1: ; %atomicrmw.start 99; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 100; GFX12-NEXT: s_wait_loadcnt 0x0 101; GFX12-NEXT: v_mov_b32_e32 v5, v0 102; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 103; GFX12-NEXT: v_max_i32_e32 v4, v5, v1 104; GFX12-NEXT: global_wb scope:SCOPE_SYS 105; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS 106; GFX12-NEXT: s_wait_loadcnt 0x0 107; GFX12-NEXT: global_inv scope:SCOPE_SYS 108; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 109; GFX12-NEXT: s_wait_alu 0xfffe 110; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 111; GFX12-NEXT: s_wait_alu 0xfffe 112; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] 113; GFX12-NEXT: s_cbranch_execnz .LBB0_1 114; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 115; GFX12-NEXT: s_or_b64 exec, exec, s[0:1] 116; GFX12-NEXT: ; return to shader part epilog 117 %zext.offset = zext i32 %voffset to i64 118 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset 119 %rtn = atomicrmw max ptr addrspace(1) %gep0, i32 %data seq_cst 120 %cast.rtn = bitcast i32 %rtn to float 121 ret float %cast.rtn 122} 123 124define amdgpu_ps float @global_max_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) { 125; GFX9-LABEL: global_max_saddr_i32_rtn_neg128: 126; GFX9: ; %bb.0: 127; GFX9-NEXT: v_mov_b32_e32 v2, v0 128; GFX9-NEXT: global_load_dword v0, v0, s[2:3] offset:-128 129; GFX9-NEXT: v_mov_b32_e32 v3, s3 130; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2 131; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 132; GFX9-NEXT: s_mov_b64 s[0:1], 0 133; 
GFX9-NEXT: .LBB1_1: ; %atomicrmw.start 134; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 135; GFX9-NEXT: s_waitcnt vmcnt(0) 136; GFX9-NEXT: v_mov_b32_e32 v5, v0 137; GFX9-NEXT: v_max_i32_e32 v4, v5, v1 138; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc 139; GFX9-NEXT: s_waitcnt vmcnt(0) 140; GFX9-NEXT: buffer_wbinvl1 141; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 142; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 143; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] 144; GFX9-NEXT: s_cbranch_execnz .LBB1_1 145; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 146; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 147; GFX9-NEXT: ; return to shader part epilog 148; 149; GFX10-LABEL: global_max_saddr_i32_rtn_neg128: 150; GFX10: ; %bb.0: 151; GFX10-NEXT: v_mov_b32_e32 v2, v0 152; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:-128 153; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 154; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] 155; GFX10-NEXT: s_mov_b64 s[0:1], 0 156; GFX10-NEXT: .LBB1_1: ; %atomicrmw.start 157; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 158; GFX10-NEXT: s_waitcnt vmcnt(0) 159; GFX10-NEXT: v_mov_b32_e32 v5, v0 160; GFX10-NEXT: v_max_i32_e32 v4, v5, v1 161; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc 162; GFX10-NEXT: s_waitcnt vmcnt(0) 163; GFX10-NEXT: buffer_gl1_inv 164; GFX10-NEXT: buffer_gl0_inv 165; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 166; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 167; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] 168; GFX10-NEXT: s_cbranch_execnz .LBB1_1 169; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 170; GFX10-NEXT: s_or_b64 exec, exec, s[0:1] 171; GFX10-NEXT: ; return to shader part epilog 172; 173; GFX11-LABEL: global_max_saddr_i32_rtn_neg128: 174; GFX11: ; %bb.0: 175; GFX11-NEXT: v_mov_b32_e32 v2, v0 176; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128 177; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 178; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 179; GFX11-NEXT: 
v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] 180; GFX11-NEXT: s_mov_b64 s[0:1], 0 181; GFX11-NEXT: s_waitcnt_depctr 0xfffe 182; GFX11-NEXT: .LBB1_1: ; %atomicrmw.start 183; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 184; GFX11-NEXT: s_waitcnt vmcnt(0) 185; GFX11-NEXT: v_mov_b32_e32 v5, v0 186; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 187; GFX11-NEXT: v_max_i32_e32 v4, v5, v1 188; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 glc 189; GFX11-NEXT: s_waitcnt vmcnt(0) 190; GFX11-NEXT: buffer_gl1_inv 191; GFX11-NEXT: buffer_gl0_inv 192; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 193; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 194; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 195; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] 196; GFX11-NEXT: s_cbranch_execnz .LBB1_1 197; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 198; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] 199; GFX11-NEXT: ; return to shader part epilog 200; 201; GFX12-LABEL: global_max_saddr_i32_rtn_neg128: 202; GFX12: ; %bb.0: 203; GFX12-NEXT: v_mov_b32_e32 v2, v0 204; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128 205; GFX12-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 206; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 207; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] 208; GFX12-NEXT: s_mov_b64 s[0:1], 0 209; GFX12-NEXT: .LBB1_1: ; %atomicrmw.start 210; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 211; GFX12-NEXT: s_wait_loadcnt 0x0 212; GFX12-NEXT: v_mov_b32_e32 v5, v0 213; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 214; GFX12-NEXT: v_max_i32_e32 v4, v5, v1 215; GFX12-NEXT: global_wb scope:SCOPE_SYS 216; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS 217; GFX12-NEXT: s_wait_loadcnt 0x0 218; GFX12-NEXT: global_inv scope:SCOPE_SYS 219; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 220; GFX12-NEXT: s_wait_alu 0xfffe 221; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 222; GFX12-NEXT: s_wait_alu 0xfffe 223; GFX12-NEXT: 
s_and_not1_b64 exec, exec, s[0:1] 224; GFX12-NEXT: s_cbranch_execnz .LBB1_1 225; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 226; GFX12-NEXT: s_or_b64 exec, exec, s[0:1] 227; GFX12-NEXT: ; return to shader part epilog 228 %zext.offset = zext i32 %voffset to i64 229 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset 230 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 231 %rtn = atomicrmw max ptr addrspace(1) %gep1, i32 %data seq_cst 232 %cast.rtn = bitcast i32 %rtn to float 233 ret float %cast.rtn 234} 235 236define amdgpu_ps void @global_max_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) { 237; GFX9-LABEL: global_max_saddr_i32_nortn: 238; GFX9: ; %bb.0: 239; GFX9-NEXT: global_load_dword v5, v0, s[2:3] 240; GFX9-NEXT: v_mov_b32_e32 v3, s3 241; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 242; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 243; GFX9-NEXT: s_mov_b64 s[0:1], 0 244; GFX9-NEXT: .LBB2_1: ; %atomicrmw.start 245; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 246; GFX9-NEXT: s_waitcnt vmcnt(0) 247; GFX9-NEXT: v_max_i32_e32 v4, v5, v1 248; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc 249; GFX9-NEXT: s_waitcnt vmcnt(0) 250; GFX9-NEXT: buffer_wbinvl1 251; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 252; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 253; GFX9-NEXT: v_mov_b32_e32 v5, v0 254; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] 255; GFX9-NEXT: s_cbranch_execnz .LBB2_1 256; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 257; GFX9-NEXT: s_endpgm 258; 259; GFX10-LABEL: global_max_saddr_i32_nortn: 260; GFX10: ; %bb.0: 261; GFX10-NEXT: global_load_dword v5, v0, s[2:3] 262; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 263; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] 264; GFX10-NEXT: s_mov_b64 s[0:1], 0 265; GFX10-NEXT: .LBB2_1: ; %atomicrmw.start 266; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 267; GFX10-NEXT: s_waitcnt vmcnt(0) 268; GFX10-NEXT: v_max_i32_e32 v4, v5, v1 269; 
GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc 270; GFX10-NEXT: s_waitcnt vmcnt(0) 271; GFX10-NEXT: buffer_gl1_inv 272; GFX10-NEXT: buffer_gl0_inv 273; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 274; GFX10-NEXT: v_mov_b32_e32 v5, v0 275; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 276; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] 277; GFX10-NEXT: s_cbranch_execnz .LBB2_1 278; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 279; GFX10-NEXT: s_endpgm 280; 281; GFX11-LABEL: global_max_saddr_i32_nortn: 282; GFX11: ; %bb.0: 283; GFX11-NEXT: global_load_b32 v5, v0, s[2:3] 284; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 285; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 286; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] 287; GFX11-NEXT: s_mov_b64 s[0:1], 0 288; GFX11-NEXT: s_waitcnt_depctr 0xfffe 289; GFX11-NEXT: .LBB2_1: ; %atomicrmw.start 290; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 291; GFX11-NEXT: s_waitcnt vmcnt(0) 292; GFX11-NEXT: v_max_i32_e32 v4, v5, v1 293; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off glc 294; GFX11-NEXT: s_waitcnt vmcnt(0) 295; GFX11-NEXT: buffer_gl1_inv 296; GFX11-NEXT: buffer_gl0_inv 297; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 298; GFX11-NEXT: v_mov_b32_e32 v5, v0 299; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 300; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 301; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] 302; GFX11-NEXT: s_cbranch_execnz .LBB2_1 303; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 304; GFX11-NEXT: s_endpgm 305; 306; GFX12-LABEL: global_max_saddr_i32_nortn: 307; GFX12: ; %bb.0: 308; GFX12-NEXT: global_load_b32 v5, v0, s[2:3] 309; GFX12-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 310; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 311; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] 312; GFX12-NEXT: s_mov_b64 s[0:1], 0 313; GFX12-NEXT: .LBB2_1: ; %atomicrmw.start 314; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 315; GFX12-NEXT: s_wait_loadcnt 0x0 316; GFX12-NEXT: v_max_i32_e32 v4, v5, v1 317; 
GFX12-NEXT: global_wb scope:SCOPE_SYS 318; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS 319; GFX12-NEXT: s_wait_loadcnt 0x0 320; GFX12-NEXT: global_inv scope:SCOPE_SYS 321; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 322; GFX12-NEXT: v_mov_b32_e32 v5, v0 323; GFX12-NEXT: s_wait_alu 0xfffe 324; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 325; GFX12-NEXT: s_wait_alu 0xfffe 326; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] 327; GFX12-NEXT: s_cbranch_execnz .LBB2_1 328; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 329; GFX12-NEXT: s_endpgm 330 %zext.offset = zext i32 %voffset to i64 331 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset 332 %unused = atomicrmw max ptr addrspace(1) %gep0, i32 %data seq_cst 333 ret void 334} 335 336define amdgpu_ps void @global_max_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) { 337; GFX9-LABEL: global_max_saddr_i32_nortn_neg128: 338; GFX9: ; %bb.0: 339; GFX9-NEXT: global_load_dword v5, v0, s[2:3] offset:-128 340; GFX9-NEXT: v_mov_b32_e32 v3, s3 341; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 342; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 343; GFX9-NEXT: s_mov_b64 s[0:1], 0 344; GFX9-NEXT: .LBB3_1: ; %atomicrmw.start 345; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 346; GFX9-NEXT: s_waitcnt vmcnt(0) 347; GFX9-NEXT: v_max_i32_e32 v4, v5, v1 348; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc 349; GFX9-NEXT: s_waitcnt vmcnt(0) 350; GFX9-NEXT: buffer_wbinvl1 351; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 352; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 353; GFX9-NEXT: v_mov_b32_e32 v5, v0 354; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] 355; GFX9-NEXT: s_cbranch_execnz .LBB3_1 356; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 357; GFX9-NEXT: s_endpgm 358; 359; GFX10-LABEL: global_max_saddr_i32_nortn_neg128: 360; GFX10: ; %bb.0: 361; GFX10-NEXT: global_load_dword v5, v0, s[2:3] offset:-128 362; GFX10-NEXT: v_add_co_u32 v2, 
s[0:1], s2, v0 363; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] 364; GFX10-NEXT: s_mov_b64 s[0:1], 0 365; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start 366; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 367; GFX10-NEXT: s_waitcnt vmcnt(0) 368; GFX10-NEXT: v_max_i32_e32 v4, v5, v1 369; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc 370; GFX10-NEXT: s_waitcnt vmcnt(0) 371; GFX10-NEXT: buffer_gl1_inv 372; GFX10-NEXT: buffer_gl0_inv 373; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 374; GFX10-NEXT: v_mov_b32_e32 v5, v0 375; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 376; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] 377; GFX10-NEXT: s_cbranch_execnz .LBB3_1 378; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 379; GFX10-NEXT: s_endpgm 380; 381; GFX11-LABEL: global_max_saddr_i32_nortn_neg128: 382; GFX11: ; %bb.0: 383; GFX11-NEXT: global_load_b32 v5, v0, s[2:3] offset:-128 384; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 385; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 386; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] 387; GFX11-NEXT: s_mov_b64 s[0:1], 0 388; GFX11-NEXT: s_waitcnt_depctr 0xfffe 389; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start 390; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 391; GFX11-NEXT: s_waitcnt vmcnt(0) 392; GFX11-NEXT: v_max_i32_e32 v4, v5, v1 393; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 glc 394; GFX11-NEXT: s_waitcnt vmcnt(0) 395; GFX11-NEXT: buffer_gl1_inv 396; GFX11-NEXT: buffer_gl0_inv 397; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 398; GFX11-NEXT: v_mov_b32_e32 v5, v0 399; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 400; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 401; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] 402; GFX11-NEXT: s_cbranch_execnz .LBB3_1 403; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 404; GFX11-NEXT: s_endpgm 405; 406; GFX12-LABEL: global_max_saddr_i32_nortn_neg128: 407; GFX12: ; %bb.0: 408; GFX12-NEXT: global_load_b32 v5, v0, s[2:3] offset:-128 409; GFX12-NEXT: 
v_add_co_u32 v2, s[0:1], s2, v0 410; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 411; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] 412; GFX12-NEXT: s_mov_b64 s[0:1], 0 413; GFX12-NEXT: .LBB3_1: ; %atomicrmw.start 414; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 415; GFX12-NEXT: s_wait_loadcnt 0x0 416; GFX12-NEXT: v_max_i32_e32 v4, v5, v1 417; GFX12-NEXT: global_wb scope:SCOPE_SYS 418; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS 419; GFX12-NEXT: s_wait_loadcnt 0x0 420; GFX12-NEXT: global_inv scope:SCOPE_SYS 421; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 422; GFX12-NEXT: v_mov_b32_e32 v5, v0 423; GFX12-NEXT: s_wait_alu 0xfffe 424; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 425; GFX12-NEXT: s_wait_alu 0xfffe 426; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] 427; GFX12-NEXT: s_cbranch_execnz .LBB3_1 428; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 429; GFX12-NEXT: s_endpgm 430 %zext.offset = zext i32 %voffset to i64 431 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset 432 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 433 %unused = atomicrmw max ptr addrspace(1) %gep1, i32 %data seq_cst 434 ret void 435} 436 437define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) { 438; GFX9-LABEL: global_max_saddr_i64_rtn: 439; GFX9: ; %bb.0: 440; GFX9-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3] 441; GFX9-NEXT: v_mov_b32_e32 v6, s3 442; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 443; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc 444; GFX9-NEXT: s_mov_b64 s[0:1], 0 445; GFX9-NEXT: .LBB4_1: ; %atomicrmw.start 446; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 447; GFX9-NEXT: s_waitcnt vmcnt(0) 448; GFX9-NEXT: v_mov_b32_e32 v10, v4 449; GFX9-NEXT: v_mov_b32_e32 v9, v3 450; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2] 451; GFX9-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc 452; GFX9-NEXT: v_cndmask_b32_e32 
v7, v1, v9, vcc 453; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off glc 454; GFX9-NEXT: s_waitcnt vmcnt(0) 455; GFX9-NEXT: buffer_wbinvl1 456; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] 457; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 458; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] 459; GFX9-NEXT: s_cbranch_execnz .LBB4_1 460; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 461; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 462; GFX9-NEXT: v_mov_b32_e32 v0, v3 463; GFX9-NEXT: v_mov_b32_e32 v1, v4 464; GFX9-NEXT: ; return to shader part epilog 465; 466; GFX10-LABEL: global_max_saddr_i64_rtn: 467; GFX10: ; %bb.0: 468; GFX10-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3] 469; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 470; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1] 471; GFX10-NEXT: s_mov_b64 s[0:1], 0 472; GFX10-NEXT: .LBB4_1: ; %atomicrmw.start 473; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 474; GFX10-NEXT: s_waitcnt vmcnt(0) 475; GFX10-NEXT: v_mov_b32_e32 v10, v4 476; GFX10-NEXT: v_mov_b32_e32 v9, v3 477; GFX10-NEXT: v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2] 478; GFX10-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc 479; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc 480; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off glc 481; GFX10-NEXT: s_waitcnt vmcnt(0) 482; GFX10-NEXT: buffer_gl1_inv 483; GFX10-NEXT: buffer_gl0_inv 484; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] 485; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 486; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] 487; GFX10-NEXT: s_cbranch_execnz .LBB4_1 488; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 489; GFX10-NEXT: s_or_b64 exec, exec, s[0:1] 490; GFX10-NEXT: v_mov_b32_e32 v0, v3 491; GFX10-NEXT: v_mov_b32_e32 v1, v4 492; GFX10-NEXT: ; return to shader part epilog 493; 494; GFX11-LABEL: global_max_saddr_i64_rtn: 495; GFX11: ; %bb.0: 496; GFX11-NEXT: global_load_b64 v[3:4], v0, s[2:3] 497; GFX11-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 498; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 499; 
GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] 500; GFX11-NEXT: s_mov_b64 s[0:1], 0 501; GFX11-NEXT: s_waitcnt_depctr 0xfffe 502; GFX11-NEXT: .LBB4_1: ; %atomicrmw.start 503; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 504; GFX11-NEXT: s_waitcnt vmcnt(0) 505; GFX11-NEXT: v_mov_b32_e32 v10, v4 506; GFX11-NEXT: v_mov_b32_e32 v9, v3 507; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 508; GFX11-NEXT: v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2] 509; GFX11-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc 510; GFX11-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc 511; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off glc 512; GFX11-NEXT: s_waitcnt vmcnt(0) 513; GFX11-NEXT: buffer_gl1_inv 514; GFX11-NEXT: buffer_gl0_inv 515; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] 516; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 517; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 518; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] 519; GFX11-NEXT: s_cbranch_execnz .LBB4_1 520; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 521; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] 522; GFX11-NEXT: v_mov_b32_e32 v0, v3 523; GFX11-NEXT: v_mov_b32_e32 v1, v4 524; GFX11-NEXT: ; return to shader part epilog 525; 526; GFX12-LABEL: global_max_saddr_i64_rtn: 527; GFX12: ; %bb.0: 528; GFX12-NEXT: global_load_b64 v[3:4], v0, s[2:3] 529; GFX12-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 530; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 531; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] 532; GFX12-NEXT: s_mov_b64 s[0:1], 0 533; GFX12-NEXT: .LBB4_1: ; %atomicrmw.start 534; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 535; GFX12-NEXT: s_wait_loadcnt 0x0 536; GFX12-NEXT: v_mov_b32_e32 v10, v4 537; GFX12-NEXT: v_mov_b32_e32 v9, v3 538; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 539; GFX12-NEXT: v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2] 540; GFX12-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc 541; GFX12-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc 542; GFX12-NEXT: global_wb scope:SCOPE_SYS 543; GFX12-NEXT: 
global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS 544; GFX12-NEXT: s_wait_loadcnt 0x0 545; GFX12-NEXT: global_inv scope:SCOPE_SYS 546; GFX12-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] 547; GFX12-NEXT: s_wait_alu 0xfffe 548; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 549; GFX12-NEXT: s_wait_alu 0xfffe 550; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] 551; GFX12-NEXT: s_cbranch_execnz .LBB4_1 552; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 553; GFX12-NEXT: s_or_b64 exec, exec, s[0:1] 554; GFX12-NEXT: v_mov_b32_e32 v0, v3 555; GFX12-NEXT: v_mov_b32_e32 v1, v4 556; GFX12-NEXT: ; return to shader part epilog 557 %zext.offset = zext i32 %voffset to i64 558 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset 559 %rtn = atomicrmw max ptr addrspace(1) %gep0, i64 %data seq_cst 560 %cast.rtn = bitcast i64 %rtn to <2 x float> 561 ret <2 x float> %cast.rtn 562} 563 564define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) { 565; GFX9-LABEL: global_max_saddr_i64_rtn_neg128: 566; GFX9: ; %bb.0: 567; GFX9-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3] offset:-128 568; GFX9-NEXT: v_mov_b32_e32 v6, s3 569; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 570; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc 571; GFX9-NEXT: s_mov_b64 s[0:1], 0 572; GFX9-NEXT: .LBB5_1: ; %atomicrmw.start 573; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 574; GFX9-NEXT: s_waitcnt vmcnt(0) 575; GFX9-NEXT: v_mov_b32_e32 v10, v4 576; GFX9-NEXT: v_mov_b32_e32 v9, v3 577; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2] 578; GFX9-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc 579; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc 580; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off offset:-128 glc 581; GFX9-NEXT: s_waitcnt vmcnt(0) 582; GFX9-NEXT: buffer_wbinvl1 583; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] 584; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 585; GFX9-NEXT: 
s_andn2_b64 exec, exec, s[0:1] 586; GFX9-NEXT: s_cbranch_execnz .LBB5_1 587; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 588; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 589; GFX9-NEXT: v_mov_b32_e32 v0, v3 590; GFX9-NEXT: v_mov_b32_e32 v1, v4 591; GFX9-NEXT: ; return to shader part epilog 592; 593; GFX10-LABEL: global_max_saddr_i64_rtn_neg128: 594; GFX10: ; %bb.0: 595; GFX10-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3] offset:-128 596; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 597; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1] 598; GFX10-NEXT: s_mov_b64 s[0:1], 0 599; GFX10-NEXT: .LBB5_1: ; %atomicrmw.start 600; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 601; GFX10-NEXT: s_waitcnt vmcnt(0) 602; GFX10-NEXT: v_mov_b32_e32 v10, v4 603; GFX10-NEXT: v_mov_b32_e32 v9, v3 604; GFX10-NEXT: v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2] 605; GFX10-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc 606; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc 607; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off offset:-128 glc 608; GFX10-NEXT: s_waitcnt vmcnt(0) 609; GFX10-NEXT: buffer_gl1_inv 610; GFX10-NEXT: buffer_gl0_inv 611; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] 612; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 613; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] 614; GFX10-NEXT: s_cbranch_execnz .LBB5_1 615; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 616; GFX10-NEXT: s_or_b64 exec, exec, s[0:1] 617; GFX10-NEXT: v_mov_b32_e32 v0, v3 618; GFX10-NEXT: v_mov_b32_e32 v1, v4 619; GFX10-NEXT: ; return to shader part epilog 620; 621; GFX11-LABEL: global_max_saddr_i64_rtn_neg128: 622; GFX11: ; %bb.0: 623; GFX11-NEXT: global_load_b64 v[3:4], v0, s[2:3] offset:-128 624; GFX11-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 625; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 626; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] 627; GFX11-NEXT: s_mov_b64 s[0:1], 0 628; GFX11-NEXT: s_waitcnt_depctr 0xfffe 629; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start 630; GFX11-NEXT: ; =>This Inner Loop Header: 
Depth=1 631; GFX11-NEXT: s_waitcnt vmcnt(0) 632; GFX11-NEXT: v_mov_b32_e32 v10, v4 633; GFX11-NEXT: v_mov_b32_e32 v9, v3 634; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 635; GFX11-NEXT: v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2] 636; GFX11-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc 637; GFX11-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc 638; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off offset:-128 glc 639; GFX11-NEXT: s_waitcnt vmcnt(0) 640; GFX11-NEXT: buffer_gl1_inv 641; GFX11-NEXT: buffer_gl0_inv 642; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] 643; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 644; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 645; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] 646; GFX11-NEXT: s_cbranch_execnz .LBB5_1 647; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 648; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] 649; GFX11-NEXT: v_mov_b32_e32 v0, v3 650; GFX11-NEXT: v_mov_b32_e32 v1, v4 651; GFX11-NEXT: ; return to shader part epilog 652; 653; GFX12-LABEL: global_max_saddr_i64_rtn_neg128: 654; GFX12: ; %bb.0: 655; GFX12-NEXT: global_load_b64 v[3:4], v0, s[2:3] offset:-128 656; GFX12-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 657; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 658; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] 659; GFX12-NEXT: s_mov_b64 s[0:1], 0 660; GFX12-NEXT: .LBB5_1: ; %atomicrmw.start 661; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 662; GFX12-NEXT: s_wait_loadcnt 0x0 663; GFX12-NEXT: v_mov_b32_e32 v10, v4 664; GFX12-NEXT: v_mov_b32_e32 v9, v3 665; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 666; GFX12-NEXT: v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2] 667; GFX12-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc 668; GFX12-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc 669; GFX12-NEXT: global_wb scope:SCOPE_SYS 670; GFX12-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS 671; GFX12-NEXT: s_wait_loadcnt 0x0 672; GFX12-NEXT: global_inv scope:SCOPE_SYS 673; GFX12-NEXT: 
v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] 674; GFX12-NEXT: s_wait_alu 0xfffe 675; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 676; GFX12-NEXT: s_wait_alu 0xfffe 677; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] 678; GFX12-NEXT: s_cbranch_execnz .LBB5_1 679; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 680; GFX12-NEXT: s_or_b64 exec, exec, s[0:1] 681; GFX12-NEXT: v_mov_b32_e32 v0, v3 682; GFX12-NEXT: v_mov_b32_e32 v1, v4 683; GFX12-NEXT: ; return to shader part epilog 684 %zext.offset = zext i32 %voffset to i64 685 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset 686 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 687 %rtn = atomicrmw max ptr addrspace(1) %gep1, i64 %data seq_cst 688 %cast.rtn = bitcast i64 %rtn to <2 x float> 689 ret <2 x float> %cast.rtn 690} 691 692define amdgpu_ps void @global_max_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) { 693; GFX9-LABEL: global_max_saddr_i64_nortn: 694; GFX9: ; %bb.0: 695; GFX9-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3] 696; GFX9-NEXT: v_mov_b32_e32 v3, s3 697; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0 698; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc 699; GFX9-NEXT: s_mov_b64 s[0:1], 0 700; GFX9-NEXT: .LBB6_1: ; %atomicrmw.start 701; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 702; GFX9-NEXT: s_waitcnt vmcnt(0) 703; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2] 704; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 705; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc 706; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off glc 707; GFX9-NEXT: s_waitcnt vmcnt(0) 708; GFX9-NEXT: buffer_wbinvl1 709; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] 710; GFX9-NEXT: v_mov_b32_e32 v6, v4 711; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 712; GFX9-NEXT: v_mov_b32_e32 v5, v3 713; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] 714; GFX9-NEXT: s_cbranch_execnz .LBB6_1 715; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 716; GFX9-NEXT: s_endpgm 717; 718; GFX10-LABEL: 
global_max_saddr_i64_nortn: 719; GFX10: ; %bb.0: 720; GFX10-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3] 721; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 722; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1] 723; GFX10-NEXT: s_mov_b64 s[0:1], 0 724; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start 725; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 726; GFX10-NEXT: s_waitcnt vmcnt(0) 727; GFX10-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2] 728; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 729; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc 730; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off glc 731; GFX10-NEXT: s_waitcnt vmcnt(0) 732; GFX10-NEXT: buffer_gl1_inv 733; GFX10-NEXT: buffer_gl0_inv 734; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] 735; GFX10-NEXT: v_mov_b32_e32 v6, v4 736; GFX10-NEXT: v_mov_b32_e32 v5, v3 737; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 738; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] 739; GFX10-NEXT: s_cbranch_execnz .LBB6_1 740; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 741; GFX10-NEXT: s_endpgm 742; 743; GFX11-LABEL: global_max_saddr_i64_nortn: 744; GFX11: ; %bb.0: 745; GFX11-NEXT: global_load_b64 v[5:6], v0, s[2:3] 746; GFX11-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 747; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 748; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] 749; GFX11-NEXT: s_mov_b64 s[0:1], 0 750; GFX11-NEXT: s_waitcnt_depctr 0xfffe 751; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start 752; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 753; GFX11-NEXT: s_waitcnt vmcnt(0) 754; GFX11-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2] 755; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 756; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc 757; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off glc 758; GFX11-NEXT: s_waitcnt vmcnt(0) 759; GFX11-NEXT: buffer_gl1_inv 760; GFX11-NEXT: buffer_gl0_inv 761; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] 762; GFX11-NEXT: v_mov_b32_e32 v6, v4 763; GFX11-NEXT: 
v_mov_b32_e32 v5, v3 764; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 765; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 766; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] 767; GFX11-NEXT: s_cbranch_execnz .LBB6_1 768; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 769; GFX11-NEXT: s_endpgm 770; 771; GFX12-LABEL: global_max_saddr_i64_nortn: 772; GFX12: ; %bb.0: 773; GFX12-NEXT: global_load_b64 v[5:6], v0, s[2:3] 774; GFX12-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 775; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 776; GFX12-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] 777; GFX12-NEXT: s_mov_b64 s[0:1], 0 778; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start 779; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 780; GFX12-NEXT: s_wait_loadcnt 0x0 781; GFX12-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2] 782; GFX12-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 783; GFX12-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc 784; GFX12-NEXT: global_wb scope:SCOPE_SYS 785; GFX12-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS 786; GFX12-NEXT: s_wait_loadcnt 0x0 787; GFX12-NEXT: global_inv scope:SCOPE_SYS 788; GFX12-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] 789; GFX12-NEXT: v_mov_b32_e32 v6, v4 790; GFX12-NEXT: v_mov_b32_e32 v5, v3 791; GFX12-NEXT: s_wait_alu 0xfffe 792; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 793; GFX12-NEXT: s_wait_alu 0xfffe 794; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] 795; GFX12-NEXT: s_cbranch_execnz .LBB6_1 796; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 797; GFX12-NEXT: s_endpgm 798 %zext.offset = zext i32 %voffset to i64 799 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset 800 %unused = atomicrmw max ptr addrspace(1) %gep0, i64 %data seq_cst 801 ret void 802} 803 804define amdgpu_ps void @global_max_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) { 805; GFX9-LABEL: global_max_saddr_i64_nortn_neg128: 806; GFX9: ; %bb.0: 807; GFX9-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3] 
offset:-128 808; GFX9-NEXT: v_mov_b32_e32 v3, s3 809; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0 810; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc 811; GFX9-NEXT: s_mov_b64 s[0:1], 0 812; GFX9-NEXT: .LBB7_1: ; %atomicrmw.start 813; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 814; GFX9-NEXT: s_waitcnt vmcnt(0) 815; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2] 816; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 817; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc 818; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off offset:-128 glc 819; GFX9-NEXT: s_waitcnt vmcnt(0) 820; GFX9-NEXT: buffer_wbinvl1 821; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] 822; GFX9-NEXT: v_mov_b32_e32 v6, v4 823; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 824; GFX9-NEXT: v_mov_b32_e32 v5, v3 825; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] 826; GFX9-NEXT: s_cbranch_execnz .LBB7_1 827; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 828; GFX9-NEXT: s_endpgm 829; 830; GFX10-LABEL: global_max_saddr_i64_nortn_neg128: 831; GFX10: ; %bb.0: 832; GFX10-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3] offset:-128 833; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 834; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1] 835; GFX10-NEXT: s_mov_b64 s[0:1], 0 836; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start 837; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 838; GFX10-NEXT: s_waitcnt vmcnt(0) 839; GFX10-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2] 840; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 841; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc 842; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off offset:-128 glc 843; GFX10-NEXT: s_waitcnt vmcnt(0) 844; GFX10-NEXT: buffer_gl1_inv 845; GFX10-NEXT: buffer_gl0_inv 846; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] 847; GFX10-NEXT: v_mov_b32_e32 v6, v4 848; GFX10-NEXT: v_mov_b32_e32 v5, v3 849; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 850; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] 851; GFX10-NEXT: s_cbranch_execnz .LBB7_1 852; 
GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 853; GFX10-NEXT: s_endpgm 854; 855; GFX11-LABEL: global_max_saddr_i64_nortn_neg128: 856; GFX11: ; %bb.0: 857; GFX11-NEXT: global_load_b64 v[5:6], v0, s[2:3] offset:-128 858; GFX11-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 859; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 860; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] 861; GFX11-NEXT: s_mov_b64 s[0:1], 0 862; GFX11-NEXT: s_waitcnt_depctr 0xfffe 863; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start 864; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 865; GFX11-NEXT: s_waitcnt vmcnt(0) 866; GFX11-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2] 867; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 868; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc 869; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off offset:-128 glc 870; GFX11-NEXT: s_waitcnt vmcnt(0) 871; GFX11-NEXT: buffer_gl1_inv 872; GFX11-NEXT: buffer_gl0_inv 873; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] 874; GFX11-NEXT: v_mov_b32_e32 v6, v4 875; GFX11-NEXT: v_mov_b32_e32 v5, v3 876; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 877; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 878; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] 879; GFX11-NEXT: s_cbranch_execnz .LBB7_1 880; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 881; GFX11-NEXT: s_endpgm 882; 883; GFX12-LABEL: global_max_saddr_i64_nortn_neg128: 884; GFX12: ; %bb.0: 885; GFX12-NEXT: global_load_b64 v[5:6], v0, s[2:3] offset:-128 886; GFX12-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 887; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 888; GFX12-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] 889; GFX12-NEXT: s_mov_b64 s[0:1], 0 890; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start 891; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 892; GFX12-NEXT: s_wait_loadcnt 0x0 893; GFX12-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2] 894; GFX12-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 895; GFX12-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc 896; GFX12-NEXT: global_wb scope:SCOPE_SYS 897; GFX12-NEXT: 
global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS 898; GFX12-NEXT: s_wait_loadcnt 0x0 899; GFX12-NEXT: global_inv scope:SCOPE_SYS 900; GFX12-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] 901; GFX12-NEXT: v_mov_b32_e32 v6, v4 902; GFX12-NEXT: v_mov_b32_e32 v5, v3 903; GFX12-NEXT: s_wait_alu 0xfffe 904; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 905; GFX12-NEXT: s_wait_alu 0xfffe 906; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] 907; GFX12-NEXT: s_cbranch_execnz .LBB7_1 908; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 909; GFX12-NEXT: s_endpgm 910 %zext.offset = zext i32 %voffset to i64 911 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset 912 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 913 %unused = atomicrmw max ptr addrspace(1) %gep1, i64 %data seq_cst 914 ret void 915} 916 917; -------------------------------------------------------------------------------- 918; atomicrmw min 919; -------------------------------------------------------------------------------- 920 921define amdgpu_ps float @global_min_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) { 922; GFX9-LABEL: global_min_saddr_i32_rtn: 923; GFX9: ; %bb.0: 924; GFX9-NEXT: v_mov_b32_e32 v2, v0 925; GFX9-NEXT: global_load_dword v0, v0, s[2:3] 926; GFX9-NEXT: v_mov_b32_e32 v3, s3 927; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2 928; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 929; GFX9-NEXT: s_mov_b64 s[0:1], 0 930; GFX9-NEXT: .LBB8_1: ; %atomicrmw.start 931; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 932; GFX9-NEXT: s_waitcnt vmcnt(0) 933; GFX9-NEXT: v_mov_b32_e32 v5, v0 934; GFX9-NEXT: v_min_i32_e32 v4, v5, v1 935; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc 936; GFX9-NEXT: s_waitcnt vmcnt(0) 937; GFX9-NEXT: buffer_wbinvl1 938; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 939; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 940; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] 941; 
GFX9-NEXT: s_cbranch_execnz .LBB8_1 942; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 943; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 944; GFX9-NEXT: ; return to shader part epilog 945; 946; GFX10-LABEL: global_min_saddr_i32_rtn: 947; GFX10: ; %bb.0: 948; GFX10-NEXT: v_mov_b32_e32 v2, v0 949; GFX10-NEXT: global_load_dword v0, v0, s[2:3] 950; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 951; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] 952; GFX10-NEXT: s_mov_b64 s[0:1], 0 953; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start 954; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 955; GFX10-NEXT: s_waitcnt vmcnt(0) 956; GFX10-NEXT: v_mov_b32_e32 v5, v0 957; GFX10-NEXT: v_min_i32_e32 v4, v5, v1 958; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc 959; GFX10-NEXT: s_waitcnt vmcnt(0) 960; GFX10-NEXT: buffer_gl1_inv 961; GFX10-NEXT: buffer_gl0_inv 962; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 963; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 964; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] 965; GFX10-NEXT: s_cbranch_execnz .LBB8_1 966; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 967; GFX10-NEXT: s_or_b64 exec, exec, s[0:1] 968; GFX10-NEXT: ; return to shader part epilog 969; 970; GFX11-LABEL: global_min_saddr_i32_rtn: 971; GFX11: ; %bb.0: 972; GFX11-NEXT: v_mov_b32_e32 v2, v0 973; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] 974; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 975; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 976; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] 977; GFX11-NEXT: s_mov_b64 s[0:1], 0 978; GFX11-NEXT: s_waitcnt_depctr 0xfffe 979; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start 980; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 981; GFX11-NEXT: s_waitcnt vmcnt(0) 982; GFX11-NEXT: v_mov_b32_e32 v5, v0 983; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 984; GFX11-NEXT: v_min_i32_e32 v4, v5, v1 985; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off glc 986; GFX11-NEXT: s_waitcnt vmcnt(0) 987; GFX11-NEXT: buffer_gl1_inv 988; GFX11-NEXT: 
buffer_gl0_inv 989; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 990; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 991; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 992; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] 993; GFX11-NEXT: s_cbranch_execnz .LBB8_1 994; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 995; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] 996; GFX11-NEXT: ; return to shader part epilog 997; 998; GFX12-LABEL: global_min_saddr_i32_rtn: 999; GFX12: ; %bb.0: 1000; GFX12-NEXT: v_mov_b32_e32 v2, v0 1001; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] 1002; GFX12-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 1003; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 1004; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] 1005; GFX12-NEXT: s_mov_b64 s[0:1], 0 1006; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start 1007; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 1008; GFX12-NEXT: s_wait_loadcnt 0x0 1009; GFX12-NEXT: v_mov_b32_e32 v5, v0 1010; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 1011; GFX12-NEXT: v_min_i32_e32 v4, v5, v1 1012; GFX12-NEXT: global_wb scope:SCOPE_SYS 1013; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS 1014; GFX12-NEXT: s_wait_loadcnt 0x0 1015; GFX12-NEXT: global_inv scope:SCOPE_SYS 1016; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 1017; GFX12-NEXT: s_wait_alu 0xfffe 1018; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 1019; GFX12-NEXT: s_wait_alu 0xfffe 1020; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] 1021; GFX12-NEXT: s_cbranch_execnz .LBB8_1 1022; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 1023; GFX12-NEXT: s_or_b64 exec, exec, s[0:1] 1024; GFX12-NEXT: ; return to shader part epilog 1025 %zext.offset = zext i32 %voffset to i64 1026 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset 1027 %rtn = atomicrmw min ptr addrspace(1) %gep0, i32 %data seq_cst 1028 %cast.rtn = bitcast i32 %rtn to float 1029 ret float %cast.rtn 1030} 1031 1032define amdgpu_ps float @global_min_saddr_i32_rtn_neg128(ptr addrspace(1) 
inreg %sbase, i32 %voffset, i32 %data) { 1033; GFX9-LABEL: global_min_saddr_i32_rtn_neg128: 1034; GFX9: ; %bb.0: 1035; GFX9-NEXT: v_mov_b32_e32 v2, v0 1036; GFX9-NEXT: global_load_dword v0, v0, s[2:3] offset:-128 1037; GFX9-NEXT: v_mov_b32_e32 v3, s3 1038; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2 1039; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 1040; GFX9-NEXT: s_mov_b64 s[0:1], 0 1041; GFX9-NEXT: .LBB9_1: ; %atomicrmw.start 1042; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 1043; GFX9-NEXT: s_waitcnt vmcnt(0) 1044; GFX9-NEXT: v_mov_b32_e32 v5, v0 1045; GFX9-NEXT: v_min_i32_e32 v4, v5, v1 1046; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc 1047; GFX9-NEXT: s_waitcnt vmcnt(0) 1048; GFX9-NEXT: buffer_wbinvl1 1049; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 1050; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 1051; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] 1052; GFX9-NEXT: s_cbranch_execnz .LBB9_1 1053; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 1054; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 1055; GFX9-NEXT: ; return to shader part epilog 1056; 1057; GFX10-LABEL: global_min_saddr_i32_rtn_neg128: 1058; GFX10: ; %bb.0: 1059; GFX10-NEXT: v_mov_b32_e32 v2, v0 1060; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:-128 1061; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 1062; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] 1063; GFX10-NEXT: s_mov_b64 s[0:1], 0 1064; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start 1065; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 1066; GFX10-NEXT: s_waitcnt vmcnt(0) 1067; GFX10-NEXT: v_mov_b32_e32 v5, v0 1068; GFX10-NEXT: v_min_i32_e32 v4, v5, v1 1069; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc 1070; GFX10-NEXT: s_waitcnt vmcnt(0) 1071; GFX10-NEXT: buffer_gl1_inv 1072; GFX10-NEXT: buffer_gl0_inv 1073; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 1074; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 1075; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] 1076; GFX10-NEXT: s_cbranch_execnz .LBB9_1 1077; 
GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 1078; GFX10-NEXT: s_or_b64 exec, exec, s[0:1] 1079; GFX10-NEXT: ; return to shader part epilog 1080; 1081; GFX11-LABEL: global_min_saddr_i32_rtn_neg128: 1082; GFX11: ; %bb.0: 1083; GFX11-NEXT: v_mov_b32_e32 v2, v0 1084; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128 1085; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 1086; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1087; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] 1088; GFX11-NEXT: s_mov_b64 s[0:1], 0 1089; GFX11-NEXT: s_waitcnt_depctr 0xfffe 1090; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start 1091; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 1092; GFX11-NEXT: s_waitcnt vmcnt(0) 1093; GFX11-NEXT: v_mov_b32_e32 v5, v0 1094; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1095; GFX11-NEXT: v_min_i32_e32 v4, v5, v1 1096; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 glc 1097; GFX11-NEXT: s_waitcnt vmcnt(0) 1098; GFX11-NEXT: buffer_gl1_inv 1099; GFX11-NEXT: buffer_gl0_inv 1100; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 1101; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 1102; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1103; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] 1104; GFX11-NEXT: s_cbranch_execnz .LBB9_1 1105; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 1106; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] 1107; GFX11-NEXT: ; return to shader part epilog 1108; 1109; GFX12-LABEL: global_min_saddr_i32_rtn_neg128: 1110; GFX12: ; %bb.0: 1111; GFX12-NEXT: v_mov_b32_e32 v2, v0 1112; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128 1113; GFX12-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 1114; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 1115; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] 1116; GFX12-NEXT: s_mov_b64 s[0:1], 0 1117; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start 1118; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 1119; GFX12-NEXT: s_wait_loadcnt 0x0 1120; GFX12-NEXT: v_mov_b32_e32 v5, v0 1121; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 
1122; GFX12-NEXT: v_min_i32_e32 v4, v5, v1 1123; GFX12-NEXT: global_wb scope:SCOPE_SYS 1124; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS 1125; GFX12-NEXT: s_wait_loadcnt 0x0 1126; GFX12-NEXT: global_inv scope:SCOPE_SYS 1127; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 1128; GFX12-NEXT: s_wait_alu 0xfffe 1129; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 1130; GFX12-NEXT: s_wait_alu 0xfffe 1131; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] 1132; GFX12-NEXT: s_cbranch_execnz .LBB9_1 1133; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 1134; GFX12-NEXT: s_or_b64 exec, exec, s[0:1] 1135; GFX12-NEXT: ; return to shader part epilog 1136 %zext.offset = zext i32 %voffset to i64 1137 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset 1138 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 1139 %rtn = atomicrmw min ptr addrspace(1) %gep1, i32 %data seq_cst 1140 %cast.rtn = bitcast i32 %rtn to float 1141 ret float %cast.rtn 1142} 1143 1144define amdgpu_ps void @global_min_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) { 1145; GFX9-LABEL: global_min_saddr_i32_nortn: 1146; GFX9: ; %bb.0: 1147; GFX9-NEXT: global_load_dword v5, v0, s[2:3] 1148; GFX9-NEXT: v_mov_b32_e32 v3, s3 1149; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 1150; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 1151; GFX9-NEXT: s_mov_b64 s[0:1], 0 1152; GFX9-NEXT: .LBB10_1: ; %atomicrmw.start 1153; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 1154; GFX9-NEXT: s_waitcnt vmcnt(0) 1155; GFX9-NEXT: v_min_i32_e32 v4, v5, v1 1156; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc 1157; GFX9-NEXT: s_waitcnt vmcnt(0) 1158; GFX9-NEXT: buffer_wbinvl1 1159; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 1160; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 1161; GFX9-NEXT: v_mov_b32_e32 v5, v0 1162; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] 1163; GFX9-NEXT: s_cbranch_execnz .LBB10_1 1164; GFX9-NEXT: ; %bb.2: 
; %atomicrmw.end 1165; GFX9-NEXT: s_endpgm 1166; 1167; GFX10-LABEL: global_min_saddr_i32_nortn: 1168; GFX10: ; %bb.0: 1169; GFX10-NEXT: global_load_dword v5, v0, s[2:3] 1170; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 1171; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] 1172; GFX10-NEXT: s_mov_b64 s[0:1], 0 1173; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start 1174; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 1175; GFX10-NEXT: s_waitcnt vmcnt(0) 1176; GFX10-NEXT: v_min_i32_e32 v4, v5, v1 1177; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc 1178; GFX10-NEXT: s_waitcnt vmcnt(0) 1179; GFX10-NEXT: buffer_gl1_inv 1180; GFX10-NEXT: buffer_gl0_inv 1181; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 1182; GFX10-NEXT: v_mov_b32_e32 v5, v0 1183; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 1184; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] 1185; GFX10-NEXT: s_cbranch_execnz .LBB10_1 1186; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 1187; GFX10-NEXT: s_endpgm 1188; 1189; GFX11-LABEL: global_min_saddr_i32_nortn: 1190; GFX11: ; %bb.0: 1191; GFX11-NEXT: global_load_b32 v5, v0, s[2:3] 1192; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 1193; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1194; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] 1195; GFX11-NEXT: s_mov_b64 s[0:1], 0 1196; GFX11-NEXT: s_waitcnt_depctr 0xfffe 1197; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start 1198; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 1199; GFX11-NEXT: s_waitcnt vmcnt(0) 1200; GFX11-NEXT: v_min_i32_e32 v4, v5, v1 1201; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off glc 1202; GFX11-NEXT: s_waitcnt vmcnt(0) 1203; GFX11-NEXT: buffer_gl1_inv 1204; GFX11-NEXT: buffer_gl0_inv 1205; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 1206; GFX11-NEXT: v_mov_b32_e32 v5, v0 1207; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 1208; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1209; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] 1210; GFX11-NEXT: s_cbranch_execnz .LBB10_1 1211; GFX11-NEXT: ; %bb.2: 
; %atomicrmw.end 1212; GFX11-NEXT: s_endpgm 1213; 1214; GFX12-LABEL: global_min_saddr_i32_nortn: 1215; GFX12: ; %bb.0: 1216; GFX12-NEXT: global_load_b32 v5, v0, s[2:3] 1217; GFX12-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 1218; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 1219; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] 1220; GFX12-NEXT: s_mov_b64 s[0:1], 0 1221; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start 1222; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 1223; GFX12-NEXT: s_wait_loadcnt 0x0 1224; GFX12-NEXT: v_min_i32_e32 v4, v5, v1 1225; GFX12-NEXT: global_wb scope:SCOPE_SYS 1226; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS 1227; GFX12-NEXT: s_wait_loadcnt 0x0 1228; GFX12-NEXT: global_inv scope:SCOPE_SYS 1229; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 1230; GFX12-NEXT: v_mov_b32_e32 v5, v0 1231; GFX12-NEXT: s_wait_alu 0xfffe 1232; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 1233; GFX12-NEXT: s_wait_alu 0xfffe 1234; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] 1235; GFX12-NEXT: s_cbranch_execnz .LBB10_1 1236; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 1237; GFX12-NEXT: s_endpgm 1238 %zext.offset = zext i32 %voffset to i64 1239 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset 1240 %unused = atomicrmw min ptr addrspace(1) %gep0, i32 %data seq_cst 1241 ret void 1242} 1243 1244define amdgpu_ps void @global_min_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) { 1245; GFX9-LABEL: global_min_saddr_i32_nortn_neg128: 1246; GFX9: ; %bb.0: 1247; GFX9-NEXT: global_load_dword v5, v0, s[2:3] offset:-128 1248; GFX9-NEXT: v_mov_b32_e32 v3, s3 1249; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 1250; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 1251; GFX9-NEXT: s_mov_b64 s[0:1], 0 1252; GFX9-NEXT: .LBB11_1: ; %atomicrmw.start 1253; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 1254; GFX9-NEXT: s_waitcnt vmcnt(0) 1255; GFX9-NEXT: v_min_i32_e32 v4, v5, v1 1256; 
GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc 1257; GFX9-NEXT: s_waitcnt vmcnt(0) 1258; GFX9-NEXT: buffer_wbinvl1 1259; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 1260; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 1261; GFX9-NEXT: v_mov_b32_e32 v5, v0 1262; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] 1263; GFX9-NEXT: s_cbranch_execnz .LBB11_1 1264; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 1265; GFX9-NEXT: s_endpgm 1266; 1267; GFX10-LABEL: global_min_saddr_i32_nortn_neg128: 1268; GFX10: ; %bb.0: 1269; GFX10-NEXT: global_load_dword v5, v0, s[2:3] offset:-128 1270; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 1271; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] 1272; GFX10-NEXT: s_mov_b64 s[0:1], 0 1273; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start 1274; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 1275; GFX10-NEXT: s_waitcnt vmcnt(0) 1276; GFX10-NEXT: v_min_i32_e32 v4, v5, v1 1277; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc 1278; GFX10-NEXT: s_waitcnt vmcnt(0) 1279; GFX10-NEXT: buffer_gl1_inv 1280; GFX10-NEXT: buffer_gl0_inv 1281; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 1282; GFX10-NEXT: v_mov_b32_e32 v5, v0 1283; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 1284; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] 1285; GFX10-NEXT: s_cbranch_execnz .LBB11_1 1286; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 1287; GFX10-NEXT: s_endpgm 1288; 1289; GFX11-LABEL: global_min_saddr_i32_nortn_neg128: 1290; GFX11: ; %bb.0: 1291; GFX11-NEXT: global_load_b32 v5, v0, s[2:3] offset:-128 1292; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 1293; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1294; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] 1295; GFX11-NEXT: s_mov_b64 s[0:1], 0 1296; GFX11-NEXT: s_waitcnt_depctr 0xfffe 1297; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start 1298; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 1299; GFX11-NEXT: s_waitcnt vmcnt(0) 1300; GFX11-NEXT: v_min_i32_e32 v4, v5, v1 1301; GFX11-NEXT: global_atomic_cmpswap_b32 
v0, v[2:3], v[4:5], off offset:-128 glc 1302; GFX11-NEXT: s_waitcnt vmcnt(0) 1303; GFX11-NEXT: buffer_gl1_inv 1304; GFX11-NEXT: buffer_gl0_inv 1305; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 1306; GFX11-NEXT: v_mov_b32_e32 v5, v0 1307; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 1308; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1309; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] 1310; GFX11-NEXT: s_cbranch_execnz .LBB11_1 1311; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 1312; GFX11-NEXT: s_endpgm 1313; 1314; GFX12-LABEL: global_min_saddr_i32_nortn_neg128: 1315; GFX12: ; %bb.0: 1316; GFX12-NEXT: global_load_b32 v5, v0, s[2:3] offset:-128 1317; GFX12-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 1318; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 1319; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] 1320; GFX12-NEXT: s_mov_b64 s[0:1], 0 1321; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start 1322; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 1323; GFX12-NEXT: s_wait_loadcnt 0x0 1324; GFX12-NEXT: v_min_i32_e32 v4, v5, v1 1325; GFX12-NEXT: global_wb scope:SCOPE_SYS 1326; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS 1327; GFX12-NEXT: s_wait_loadcnt 0x0 1328; GFX12-NEXT: global_inv scope:SCOPE_SYS 1329; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 1330; GFX12-NEXT: v_mov_b32_e32 v5, v0 1331; GFX12-NEXT: s_wait_alu 0xfffe 1332; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 1333; GFX12-NEXT: s_wait_alu 0xfffe 1334; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] 1335; GFX12-NEXT: s_cbranch_execnz .LBB11_1 1336; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 1337; GFX12-NEXT: s_endpgm 1338 %zext.offset = zext i32 %voffset to i64 1339 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset 1340 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 1341 %unused = atomicrmw min ptr addrspace(1) %gep1, i32 %data seq_cst 1342 ret void 1343} 1344 1345define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn(ptr 
addrspace(1) inreg %sbase, i32 %voffset, i64 %data) { 1346; GFX9-LABEL: global_min_saddr_i64_rtn: 1347; GFX9: ; %bb.0: 1348; GFX9-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3] 1349; GFX9-NEXT: v_mov_b32_e32 v6, s3 1350; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 1351; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc 1352; GFX9-NEXT: s_mov_b64 s[0:1], 0 1353; GFX9-NEXT: .LBB12_1: ; %atomicrmw.start 1354; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 1355; GFX9-NEXT: s_waitcnt vmcnt(0) 1356; GFX9-NEXT: v_mov_b32_e32 v10, v4 1357; GFX9-NEXT: v_mov_b32_e32 v9, v3 1358; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[9:10], v[1:2] 1359; GFX9-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc 1360; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc 1361; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off glc 1362; GFX9-NEXT: s_waitcnt vmcnt(0) 1363; GFX9-NEXT: buffer_wbinvl1 1364; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] 1365; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 1366; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] 1367; GFX9-NEXT: s_cbranch_execnz .LBB12_1 1368; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 1369; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 1370; GFX9-NEXT: v_mov_b32_e32 v0, v3 1371; GFX9-NEXT: v_mov_b32_e32 v1, v4 1372; GFX9-NEXT: ; return to shader part epilog 1373; 1374; GFX10-LABEL: global_min_saddr_i64_rtn: 1375; GFX10: ; %bb.0: 1376; GFX10-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3] 1377; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 1378; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1] 1379; GFX10-NEXT: s_mov_b64 s[0:1], 0 1380; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start 1381; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 1382; GFX10-NEXT: s_waitcnt vmcnt(0) 1383; GFX10-NEXT: v_mov_b32_e32 v10, v4 1384; GFX10-NEXT: v_mov_b32_e32 v9, v3 1385; GFX10-NEXT: v_cmp_le_i64_e32 vcc, v[9:10], v[1:2] 1386; GFX10-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc 1387; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc 1388; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], 
off glc 1389; GFX10-NEXT: s_waitcnt vmcnt(0) 1390; GFX10-NEXT: buffer_gl1_inv 1391; GFX10-NEXT: buffer_gl0_inv 1392; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] 1393; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 1394; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] 1395; GFX10-NEXT: s_cbranch_execnz .LBB12_1 1396; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 1397; GFX10-NEXT: s_or_b64 exec, exec, s[0:1] 1398; GFX10-NEXT: v_mov_b32_e32 v0, v3 1399; GFX10-NEXT: v_mov_b32_e32 v1, v4 1400; GFX10-NEXT: ; return to shader part epilog 1401; 1402; GFX11-LABEL: global_min_saddr_i64_rtn: 1403; GFX11: ; %bb.0: 1404; GFX11-NEXT: global_load_b64 v[3:4], v0, s[2:3] 1405; GFX11-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 1406; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1407; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] 1408; GFX11-NEXT: s_mov_b64 s[0:1], 0 1409; GFX11-NEXT: s_waitcnt_depctr 0xfffe 1410; GFX11-NEXT: .LBB12_1: ; %atomicrmw.start 1411; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 1412; GFX11-NEXT: s_waitcnt vmcnt(0) 1413; GFX11-NEXT: v_mov_b32_e32 v10, v4 1414; GFX11-NEXT: v_mov_b32_e32 v9, v3 1415; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1416; GFX11-NEXT: v_cmp_le_i64_e32 vcc, v[9:10], v[1:2] 1417; GFX11-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc 1418; GFX11-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc 1419; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off glc 1420; GFX11-NEXT: s_waitcnt vmcnt(0) 1421; GFX11-NEXT: buffer_gl1_inv 1422; GFX11-NEXT: buffer_gl0_inv 1423; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] 1424; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 1425; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1426; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] 1427; GFX11-NEXT: s_cbranch_execnz .LBB12_1 1428; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 1429; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] 1430; GFX11-NEXT: v_mov_b32_e32 v0, v3 1431; GFX11-NEXT: v_mov_b32_e32 v1, v4 1432; GFX11-NEXT: ; return to shader part epilog 1433; 1434; GFX12-LABEL: 
global_min_saddr_i64_rtn: 1435; GFX12: ; %bb.0: 1436; GFX12-NEXT: global_load_b64 v[3:4], v0, s[2:3] 1437; GFX12-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 1438; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 1439; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] 1440; GFX12-NEXT: s_mov_b64 s[0:1], 0 1441; GFX12-NEXT: .LBB12_1: ; %atomicrmw.start 1442; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 1443; GFX12-NEXT: s_wait_loadcnt 0x0 1444; GFX12-NEXT: v_mov_b32_e32 v10, v4 1445; GFX12-NEXT: v_mov_b32_e32 v9, v3 1446; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 1447; GFX12-NEXT: v_cmp_le_i64_e32 vcc, v[9:10], v[1:2] 1448; GFX12-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc 1449; GFX12-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc 1450; GFX12-NEXT: global_wb scope:SCOPE_SYS 1451; GFX12-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS 1452; GFX12-NEXT: s_wait_loadcnt 0x0 1453; GFX12-NEXT: global_inv scope:SCOPE_SYS 1454; GFX12-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] 1455; GFX12-NEXT: s_wait_alu 0xfffe 1456; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 1457; GFX12-NEXT: s_wait_alu 0xfffe 1458; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] 1459; GFX12-NEXT: s_cbranch_execnz .LBB12_1 1460; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 1461; GFX12-NEXT: s_or_b64 exec, exec, s[0:1] 1462; GFX12-NEXT: v_mov_b32_e32 v0, v3 1463; GFX12-NEXT: v_mov_b32_e32 v1, v4 1464; GFX12-NEXT: ; return to shader part epilog 1465 %zext.offset = zext i32 %voffset to i64 1466 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset 1467 %rtn = atomicrmw min ptr addrspace(1) %gep0, i64 %data seq_cst 1468 %cast.rtn = bitcast i64 %rtn to <2 x float> 1469 ret <2 x float> %cast.rtn 1470} 1471 1472define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) { 1473; GFX9-LABEL: global_min_saddr_i64_rtn_neg128: 1474; GFX9: ; %bb.0: 1475; GFX9-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3] 
offset:-128 1476; GFX9-NEXT: v_mov_b32_e32 v6, s3 1477; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 1478; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc 1479; GFX9-NEXT: s_mov_b64 s[0:1], 0 1480; GFX9-NEXT: .LBB13_1: ; %atomicrmw.start 1481; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 1482; GFX9-NEXT: s_waitcnt vmcnt(0) 1483; GFX9-NEXT: v_mov_b32_e32 v10, v4 1484; GFX9-NEXT: v_mov_b32_e32 v9, v3 1485; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[9:10], v[1:2] 1486; GFX9-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc 1487; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc 1488; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off offset:-128 glc 1489; GFX9-NEXT: s_waitcnt vmcnt(0) 1490; GFX9-NEXT: buffer_wbinvl1 1491; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] 1492; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 1493; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] 1494; GFX9-NEXT: s_cbranch_execnz .LBB13_1 1495; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 1496; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 1497; GFX9-NEXT: v_mov_b32_e32 v0, v3 1498; GFX9-NEXT: v_mov_b32_e32 v1, v4 1499; GFX9-NEXT: ; return to shader part epilog 1500; 1501; GFX10-LABEL: global_min_saddr_i64_rtn_neg128: 1502; GFX10: ; %bb.0: 1503; GFX10-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3] offset:-128 1504; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 1505; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1] 1506; GFX10-NEXT: s_mov_b64 s[0:1], 0 1507; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start 1508; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 1509; GFX10-NEXT: s_waitcnt vmcnt(0) 1510; GFX10-NEXT: v_mov_b32_e32 v10, v4 1511; GFX10-NEXT: v_mov_b32_e32 v9, v3 1512; GFX10-NEXT: v_cmp_le_i64_e32 vcc, v[9:10], v[1:2] 1513; GFX10-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc 1514; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc 1515; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off offset:-128 glc 1516; GFX10-NEXT: s_waitcnt vmcnt(0) 1517; GFX10-NEXT: buffer_gl1_inv 1518; GFX10-NEXT: buffer_gl0_inv 1519; 
GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] 1520; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 1521; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] 1522; GFX10-NEXT: s_cbranch_execnz .LBB13_1 1523; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 1524; GFX10-NEXT: s_or_b64 exec, exec, s[0:1] 1525; GFX10-NEXT: v_mov_b32_e32 v0, v3 1526; GFX10-NEXT: v_mov_b32_e32 v1, v4 1527; GFX10-NEXT: ; return to shader part epilog 1528; 1529; GFX11-LABEL: global_min_saddr_i64_rtn_neg128: 1530; GFX11: ; %bb.0: 1531; GFX11-NEXT: global_load_b64 v[3:4], v0, s[2:3] offset:-128 1532; GFX11-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 1533; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1534; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] 1535; GFX11-NEXT: s_mov_b64 s[0:1], 0 1536; GFX11-NEXT: s_waitcnt_depctr 0xfffe 1537; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start 1538; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 1539; GFX11-NEXT: s_waitcnt vmcnt(0) 1540; GFX11-NEXT: v_mov_b32_e32 v10, v4 1541; GFX11-NEXT: v_mov_b32_e32 v9, v3 1542; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1543; GFX11-NEXT: v_cmp_le_i64_e32 vcc, v[9:10], v[1:2] 1544; GFX11-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc 1545; GFX11-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc 1546; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off offset:-128 glc 1547; GFX11-NEXT: s_waitcnt vmcnt(0) 1548; GFX11-NEXT: buffer_gl1_inv 1549; GFX11-NEXT: buffer_gl0_inv 1550; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] 1551; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 1552; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1553; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] 1554; GFX11-NEXT: s_cbranch_execnz .LBB13_1 1555; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 1556; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] 1557; GFX11-NEXT: v_mov_b32_e32 v0, v3 1558; GFX11-NEXT: v_mov_b32_e32 v1, v4 1559; GFX11-NEXT: ; return to shader part epilog 1560; 1561; GFX12-LABEL: global_min_saddr_i64_rtn_neg128: 1562; GFX12: ; %bb.0: 1563; GFX12-NEXT: global_load_b64 
v[3:4], v0, s[2:3] offset:-128 1564; GFX12-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 1565; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 1566; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] 1567; GFX12-NEXT: s_mov_b64 s[0:1], 0 1568; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start 1569; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 1570; GFX12-NEXT: s_wait_loadcnt 0x0 1571; GFX12-NEXT: v_mov_b32_e32 v10, v4 1572; GFX12-NEXT: v_mov_b32_e32 v9, v3 1573; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 1574; GFX12-NEXT: v_cmp_le_i64_e32 vcc, v[9:10], v[1:2] 1575; GFX12-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc 1576; GFX12-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc 1577; GFX12-NEXT: global_wb scope:SCOPE_SYS 1578; GFX12-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS 1579; GFX12-NEXT: s_wait_loadcnt 0x0 1580; GFX12-NEXT: global_inv scope:SCOPE_SYS 1581; GFX12-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] 1582; GFX12-NEXT: s_wait_alu 0xfffe 1583; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 1584; GFX12-NEXT: s_wait_alu 0xfffe 1585; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] 1586; GFX12-NEXT: s_cbranch_execnz .LBB13_1 1587; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 1588; GFX12-NEXT: s_or_b64 exec, exec, s[0:1] 1589; GFX12-NEXT: v_mov_b32_e32 v0, v3 1590; GFX12-NEXT: v_mov_b32_e32 v1, v4 1591; GFX12-NEXT: ; return to shader part epilog 1592 %zext.offset = zext i32 %voffset to i64 1593 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset 1594 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 1595 %rtn = atomicrmw min ptr addrspace(1) %gep1, i64 %data seq_cst 1596 %cast.rtn = bitcast i64 %rtn to <2 x float> 1597 ret <2 x float> %cast.rtn 1598} 1599 1600define amdgpu_ps void @global_min_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) { 1601; GFX9-LABEL: global_min_saddr_i64_nortn: 1602; GFX9: ; %bb.0: 1603; GFX9-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3] 1604; 
GFX9-NEXT: v_mov_b32_e32 v3, s3 1605; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0 1606; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc 1607; GFX9-NEXT: s_mov_b64 s[0:1], 0 1608; GFX9-NEXT: .LBB14_1: ; %atomicrmw.start 1609; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 1610; GFX9-NEXT: s_waitcnt vmcnt(0) 1611; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[5:6], v[1:2] 1612; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 1613; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc 1614; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off glc 1615; GFX9-NEXT: s_waitcnt vmcnt(0) 1616; GFX9-NEXT: buffer_wbinvl1 1617; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] 1618; GFX9-NEXT: v_mov_b32_e32 v6, v4 1619; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 1620; GFX9-NEXT: v_mov_b32_e32 v5, v3 1621; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] 1622; GFX9-NEXT: s_cbranch_execnz .LBB14_1 1623; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 1624; GFX9-NEXT: s_endpgm 1625; 1626; GFX10-LABEL: global_min_saddr_i64_nortn: 1627; GFX10: ; %bb.0: 1628; GFX10-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3] 1629; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 1630; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1] 1631; GFX10-NEXT: s_mov_b64 s[0:1], 0 1632; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start 1633; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 1634; GFX10-NEXT: s_waitcnt vmcnt(0) 1635; GFX10-NEXT: v_cmp_le_i64_e32 vcc, v[5:6], v[1:2] 1636; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 1637; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc 1638; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off glc 1639; GFX10-NEXT: s_waitcnt vmcnt(0) 1640; GFX10-NEXT: buffer_gl1_inv 1641; GFX10-NEXT: buffer_gl0_inv 1642; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] 1643; GFX10-NEXT: v_mov_b32_e32 v6, v4 1644; GFX10-NEXT: v_mov_b32_e32 v5, v3 1645; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 1646; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] 1647; GFX10-NEXT: s_cbranch_execnz .LBB14_1 1648; GFX10-NEXT: ; 
%bb.2: ; %atomicrmw.end 1649; GFX10-NEXT: s_endpgm 1650; 1651; GFX11-LABEL: global_min_saddr_i64_nortn: 1652; GFX11: ; %bb.0: 1653; GFX11-NEXT: global_load_b64 v[5:6], v0, s[2:3] 1654; GFX11-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 1655; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1656; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] 1657; GFX11-NEXT: s_mov_b64 s[0:1], 0 1658; GFX11-NEXT: s_waitcnt_depctr 0xfffe 1659; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start 1660; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 1661; GFX11-NEXT: s_waitcnt vmcnt(0) 1662; GFX11-NEXT: v_cmp_le_i64_e32 vcc, v[5:6], v[1:2] 1663; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 1664; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc 1665; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off glc 1666; GFX11-NEXT: s_waitcnt vmcnt(0) 1667; GFX11-NEXT: buffer_gl1_inv 1668; GFX11-NEXT: buffer_gl0_inv 1669; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] 1670; GFX11-NEXT: v_mov_b32_e32 v6, v4 1671; GFX11-NEXT: v_mov_b32_e32 v5, v3 1672; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 1673; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1674; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] 1675; GFX11-NEXT: s_cbranch_execnz .LBB14_1 1676; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 1677; GFX11-NEXT: s_endpgm 1678; 1679; GFX12-LABEL: global_min_saddr_i64_nortn: 1680; GFX12: ; %bb.0: 1681; GFX12-NEXT: global_load_b64 v[5:6], v0, s[2:3] 1682; GFX12-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 1683; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 1684; GFX12-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] 1685; GFX12-NEXT: s_mov_b64 s[0:1], 0 1686; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start 1687; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 1688; GFX12-NEXT: s_wait_loadcnt 0x0 1689; GFX12-NEXT: v_cmp_le_i64_e32 vcc, v[5:6], v[1:2] 1690; GFX12-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 1691; GFX12-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc 1692; GFX12-NEXT: global_wb scope:SCOPE_SYS 1693; GFX12-NEXT: 
global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS 1694; GFX12-NEXT: s_wait_loadcnt 0x0 1695; GFX12-NEXT: global_inv scope:SCOPE_SYS 1696; GFX12-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] 1697; GFX12-NEXT: v_mov_b32_e32 v6, v4 1698; GFX12-NEXT: v_mov_b32_e32 v5, v3 1699; GFX12-NEXT: s_wait_alu 0xfffe 1700; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 1701; GFX12-NEXT: s_wait_alu 0xfffe 1702; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] 1703; GFX12-NEXT: s_cbranch_execnz .LBB14_1 1704; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 1705; GFX12-NEXT: s_endpgm 1706 %zext.offset = zext i32 %voffset to i64 1707 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset 1708 %unused = atomicrmw min ptr addrspace(1) %gep0, i64 %data seq_cst 1709 ret void 1710} 1711 1712define amdgpu_ps void @global_min_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) { 1713; GFX9-LABEL: global_min_saddr_i64_nortn_neg128: 1714; GFX9: ; %bb.0: 1715; GFX9-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3] offset:-128 1716; GFX9-NEXT: v_mov_b32_e32 v3, s3 1717; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0 1718; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc 1719; GFX9-NEXT: s_mov_b64 s[0:1], 0 1720; GFX9-NEXT: .LBB15_1: ; %atomicrmw.start 1721; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 1722; GFX9-NEXT: s_waitcnt vmcnt(0) 1723; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[5:6], v[1:2] 1724; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 1725; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc 1726; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off offset:-128 glc 1727; GFX9-NEXT: s_waitcnt vmcnt(0) 1728; GFX9-NEXT: buffer_wbinvl1 1729; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] 1730; GFX9-NEXT: v_mov_b32_e32 v6, v4 1731; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 1732; GFX9-NEXT: v_mov_b32_e32 v5, v3 1733; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] 1734; GFX9-NEXT: s_cbranch_execnz .LBB15_1 1735; GFX9-NEXT: ; %bb.2: ; 
%atomicrmw.end 1736; GFX9-NEXT: s_endpgm 1737; 1738; GFX10-LABEL: global_min_saddr_i64_nortn_neg128: 1739; GFX10: ; %bb.0: 1740; GFX10-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3] offset:-128 1741; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 1742; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1] 1743; GFX10-NEXT: s_mov_b64 s[0:1], 0 1744; GFX10-NEXT: .LBB15_1: ; %atomicrmw.start 1745; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 1746; GFX10-NEXT: s_waitcnt vmcnt(0) 1747; GFX10-NEXT: v_cmp_le_i64_e32 vcc, v[5:6], v[1:2] 1748; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 1749; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc 1750; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off offset:-128 glc 1751; GFX10-NEXT: s_waitcnt vmcnt(0) 1752; GFX10-NEXT: buffer_gl1_inv 1753; GFX10-NEXT: buffer_gl0_inv 1754; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] 1755; GFX10-NEXT: v_mov_b32_e32 v6, v4 1756; GFX10-NEXT: v_mov_b32_e32 v5, v3 1757; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 1758; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] 1759; GFX10-NEXT: s_cbranch_execnz .LBB15_1 1760; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 1761; GFX10-NEXT: s_endpgm 1762; 1763; GFX11-LABEL: global_min_saddr_i64_nortn_neg128: 1764; GFX11: ; %bb.0: 1765; GFX11-NEXT: global_load_b64 v[5:6], v0, s[2:3] offset:-128 1766; GFX11-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 1767; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1768; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] 1769; GFX11-NEXT: s_mov_b64 s[0:1], 0 1770; GFX11-NEXT: s_waitcnt_depctr 0xfffe 1771; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start 1772; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 1773; GFX11-NEXT: s_waitcnt vmcnt(0) 1774; GFX11-NEXT: v_cmp_le_i64_e32 vcc, v[5:6], v[1:2] 1775; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 1776; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc 1777; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off offset:-128 glc 1778; GFX11-NEXT: s_waitcnt vmcnt(0) 1779; 
GFX11-NEXT: buffer_gl1_inv 1780; GFX11-NEXT: buffer_gl0_inv 1781; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] 1782; GFX11-NEXT: v_mov_b32_e32 v6, v4 1783; GFX11-NEXT: v_mov_b32_e32 v5, v3 1784; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 1785; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1786; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] 1787; GFX11-NEXT: s_cbranch_execnz .LBB15_1 1788; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 1789; GFX11-NEXT: s_endpgm 1790; 1791; GFX12-LABEL: global_min_saddr_i64_nortn_neg128: 1792; GFX12: ; %bb.0: 1793; GFX12-NEXT: global_load_b64 v[5:6], v0, s[2:3] offset:-128 1794; GFX12-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 1795; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 1796; GFX12-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] 1797; GFX12-NEXT: s_mov_b64 s[0:1], 0 1798; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start 1799; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 1800; GFX12-NEXT: s_wait_loadcnt 0x0 1801; GFX12-NEXT: v_cmp_le_i64_e32 vcc, v[5:6], v[1:2] 1802; GFX12-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 1803; GFX12-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc 1804; GFX12-NEXT: global_wb scope:SCOPE_SYS 1805; GFX12-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS 1806; GFX12-NEXT: s_wait_loadcnt 0x0 1807; GFX12-NEXT: global_inv scope:SCOPE_SYS 1808; GFX12-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] 1809; GFX12-NEXT: v_mov_b32_e32 v6, v4 1810; GFX12-NEXT: v_mov_b32_e32 v5, v3 1811; GFX12-NEXT: s_wait_alu 0xfffe 1812; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 1813; GFX12-NEXT: s_wait_alu 0xfffe 1814; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] 1815; GFX12-NEXT: s_cbranch_execnz .LBB15_1 1816; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 1817; GFX12-NEXT: s_endpgm 1818 %zext.offset = zext i32 %voffset to i64 1819 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset 1820 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 1821 %unused = atomicrmw min 
ptr addrspace(1) %gep1, i64 %data seq_cst 1822 ret void 1823} 1824 1825; -------------------------------------------------------------------------------- 1826; atomicrmw umax 1827; -------------------------------------------------------------------------------- 1828 1829define amdgpu_ps float @global_umax_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) { 1830; GFX9-LABEL: global_umax_saddr_i32_rtn: 1831; GFX9: ; %bb.0: 1832; GFX9-NEXT: v_mov_b32_e32 v2, v0 1833; GFX9-NEXT: global_load_dword v0, v0, s[2:3] 1834; GFX9-NEXT: v_mov_b32_e32 v3, s3 1835; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2 1836; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 1837; GFX9-NEXT: s_mov_b64 s[0:1], 0 1838; GFX9-NEXT: .LBB16_1: ; %atomicrmw.start 1839; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 1840; GFX9-NEXT: s_waitcnt vmcnt(0) 1841; GFX9-NEXT: v_mov_b32_e32 v5, v0 1842; GFX9-NEXT: v_max_u32_e32 v4, v5, v1 1843; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc 1844; GFX9-NEXT: s_waitcnt vmcnt(0) 1845; GFX9-NEXT: buffer_wbinvl1 1846; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 1847; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 1848; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] 1849; GFX9-NEXT: s_cbranch_execnz .LBB16_1 1850; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 1851; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 1852; GFX9-NEXT: ; return to shader part epilog 1853; 1854; GFX10-LABEL: global_umax_saddr_i32_rtn: 1855; GFX10: ; %bb.0: 1856; GFX10-NEXT: v_mov_b32_e32 v2, v0 1857; GFX10-NEXT: global_load_dword v0, v0, s[2:3] 1858; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 1859; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] 1860; GFX10-NEXT: s_mov_b64 s[0:1], 0 1861; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start 1862; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 1863; GFX10-NEXT: s_waitcnt vmcnt(0) 1864; GFX10-NEXT: v_mov_b32_e32 v5, v0 1865; GFX10-NEXT: v_max_u32_e32 v4, v5, v1 1866; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc 1867; 
GFX10-NEXT: s_waitcnt vmcnt(0) 1868; GFX10-NEXT: buffer_gl1_inv 1869; GFX10-NEXT: buffer_gl0_inv 1870; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 1871; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 1872; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] 1873; GFX10-NEXT: s_cbranch_execnz .LBB16_1 1874; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 1875; GFX10-NEXT: s_or_b64 exec, exec, s[0:1] 1876; GFX10-NEXT: ; return to shader part epilog 1877; 1878; GFX11-LABEL: global_umax_saddr_i32_rtn: 1879; GFX11: ; %bb.0: 1880; GFX11-NEXT: v_mov_b32_e32 v2, v0 1881; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] 1882; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 1883; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1884; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] 1885; GFX11-NEXT: s_mov_b64 s[0:1], 0 1886; GFX11-NEXT: s_waitcnt_depctr 0xfffe 1887; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start 1888; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 1889; GFX11-NEXT: s_waitcnt vmcnt(0) 1890; GFX11-NEXT: v_mov_b32_e32 v5, v0 1891; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1892; GFX11-NEXT: v_max_u32_e32 v4, v5, v1 1893; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off glc 1894; GFX11-NEXT: s_waitcnt vmcnt(0) 1895; GFX11-NEXT: buffer_gl1_inv 1896; GFX11-NEXT: buffer_gl0_inv 1897; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 1898; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 1899; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1900; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] 1901; GFX11-NEXT: s_cbranch_execnz .LBB16_1 1902; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 1903; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] 1904; GFX11-NEXT: ; return to shader part epilog 1905; 1906; GFX12-LABEL: global_umax_saddr_i32_rtn: 1907; GFX12: ; %bb.0: 1908; GFX12-NEXT: v_mov_b32_e32 v2, v0 1909; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] 1910; GFX12-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 1911; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 1912; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] 1913; GFX12-NEXT: 
s_mov_b64 s[0:1], 0 1914; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start 1915; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 1916; GFX12-NEXT: s_wait_loadcnt 0x0 1917; GFX12-NEXT: v_mov_b32_e32 v5, v0 1918; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 1919; GFX12-NEXT: v_max_u32_e32 v4, v5, v1 1920; GFX12-NEXT: global_wb scope:SCOPE_SYS 1921; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS 1922; GFX12-NEXT: s_wait_loadcnt 0x0 1923; GFX12-NEXT: global_inv scope:SCOPE_SYS 1924; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 1925; GFX12-NEXT: s_wait_alu 0xfffe 1926; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 1927; GFX12-NEXT: s_wait_alu 0xfffe 1928; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] 1929; GFX12-NEXT: s_cbranch_execnz .LBB16_1 1930; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 1931; GFX12-NEXT: s_or_b64 exec, exec, s[0:1] 1932; GFX12-NEXT: ; return to shader part epilog 1933 %zext.offset = zext i32 %voffset to i64 1934 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset 1935 %rtn = atomicrmw umax ptr addrspace(1) %gep0, i32 %data seq_cst 1936 %cast.rtn = bitcast i32 %rtn to float 1937 ret float %cast.rtn 1938} 1939 1940define amdgpu_ps float @global_umax_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) { 1941; GFX9-LABEL: global_umax_saddr_i32_rtn_neg128: 1942; GFX9: ; %bb.0: 1943; GFX9-NEXT: v_mov_b32_e32 v2, v0 1944; GFX9-NEXT: global_load_dword v0, v0, s[2:3] offset:-128 1945; GFX9-NEXT: v_mov_b32_e32 v3, s3 1946; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2 1947; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 1948; GFX9-NEXT: s_mov_b64 s[0:1], 0 1949; GFX9-NEXT: .LBB17_1: ; %atomicrmw.start 1950; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 1951; GFX9-NEXT: s_waitcnt vmcnt(0) 1952; GFX9-NEXT: v_mov_b32_e32 v5, v0 1953; GFX9-NEXT: v_max_u32_e32 v4, v5, v1 1954; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc 1955; GFX9-NEXT: s_waitcnt vmcnt(0) 
1956; GFX9-NEXT: buffer_wbinvl1 1957; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 1958; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 1959; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] 1960; GFX9-NEXT: s_cbranch_execnz .LBB17_1 1961; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 1962; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 1963; GFX9-NEXT: ; return to shader part epilog 1964; 1965; GFX10-LABEL: global_umax_saddr_i32_rtn_neg128: 1966; GFX10: ; %bb.0: 1967; GFX10-NEXT: v_mov_b32_e32 v2, v0 1968; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:-128 1969; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 1970; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] 1971; GFX10-NEXT: s_mov_b64 s[0:1], 0 1972; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start 1973; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 1974; GFX10-NEXT: s_waitcnt vmcnt(0) 1975; GFX10-NEXT: v_mov_b32_e32 v5, v0 1976; GFX10-NEXT: v_max_u32_e32 v4, v5, v1 1977; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc 1978; GFX10-NEXT: s_waitcnt vmcnt(0) 1979; GFX10-NEXT: buffer_gl1_inv 1980; GFX10-NEXT: buffer_gl0_inv 1981; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 1982; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 1983; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] 1984; GFX10-NEXT: s_cbranch_execnz .LBB17_1 1985; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 1986; GFX10-NEXT: s_or_b64 exec, exec, s[0:1] 1987; GFX10-NEXT: ; return to shader part epilog 1988; 1989; GFX11-LABEL: global_umax_saddr_i32_rtn_neg128: 1990; GFX11: ; %bb.0: 1991; GFX11-NEXT: v_mov_b32_e32 v2, v0 1992; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128 1993; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 1994; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1995; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] 1996; GFX11-NEXT: s_mov_b64 s[0:1], 0 1997; GFX11-NEXT: s_waitcnt_depctr 0xfffe 1998; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start 1999; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 2000; GFX11-NEXT: s_waitcnt vmcnt(0) 2001; GFX11-NEXT: 
v_mov_b32_e32 v5, v0 2002; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 2003; GFX11-NEXT: v_max_u32_e32 v4, v5, v1 2004; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 glc 2005; GFX11-NEXT: s_waitcnt vmcnt(0) 2006; GFX11-NEXT: buffer_gl1_inv 2007; GFX11-NEXT: buffer_gl0_inv 2008; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 2009; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 2010; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2011; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] 2012; GFX11-NEXT: s_cbranch_execnz .LBB17_1 2013; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 2014; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] 2015; GFX11-NEXT: ; return to shader part epilog 2016; 2017; GFX12-LABEL: global_umax_saddr_i32_rtn_neg128: 2018; GFX12: ; %bb.0: 2019; GFX12-NEXT: v_mov_b32_e32 v2, v0 2020; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128 2021; GFX12-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 2022; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 2023; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] 2024; GFX12-NEXT: s_mov_b64 s[0:1], 0 2025; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start 2026; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 2027; GFX12-NEXT: s_wait_loadcnt 0x0 2028; GFX12-NEXT: v_mov_b32_e32 v5, v0 2029; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 2030; GFX12-NEXT: v_max_u32_e32 v4, v5, v1 2031; GFX12-NEXT: global_wb scope:SCOPE_SYS 2032; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS 2033; GFX12-NEXT: s_wait_loadcnt 0x0 2034; GFX12-NEXT: global_inv scope:SCOPE_SYS 2035; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 2036; GFX12-NEXT: s_wait_alu 0xfffe 2037; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 2038; GFX12-NEXT: s_wait_alu 0xfffe 2039; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] 2040; GFX12-NEXT: s_cbranch_execnz .LBB17_1 2041; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 2042; GFX12-NEXT: s_or_b64 exec, exec, s[0:1] 2043; GFX12-NEXT: ; return to shader part epilog 2044 %zext.offset = 
zext i32 %voffset to i64 2045 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset 2046 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 2047 %rtn = atomicrmw umax ptr addrspace(1) %gep1, i32 %data seq_cst 2048 %cast.rtn = bitcast i32 %rtn to float 2049 ret float %cast.rtn 2050} 2051 2052define amdgpu_ps void @global_umax_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) { 2053; GFX9-LABEL: global_umax_saddr_i32_nortn: 2054; GFX9: ; %bb.0: 2055; GFX9-NEXT: global_load_dword v5, v0, s[2:3] 2056; GFX9-NEXT: v_mov_b32_e32 v3, s3 2057; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 2058; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 2059; GFX9-NEXT: s_mov_b64 s[0:1], 0 2060; GFX9-NEXT: .LBB18_1: ; %atomicrmw.start 2061; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 2062; GFX9-NEXT: s_waitcnt vmcnt(0) 2063; GFX9-NEXT: v_max_u32_e32 v4, v5, v1 2064; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc 2065; GFX9-NEXT: s_waitcnt vmcnt(0) 2066; GFX9-NEXT: buffer_wbinvl1 2067; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 2068; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 2069; GFX9-NEXT: v_mov_b32_e32 v5, v0 2070; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] 2071; GFX9-NEXT: s_cbranch_execnz .LBB18_1 2072; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 2073; GFX9-NEXT: s_endpgm 2074; 2075; GFX10-LABEL: global_umax_saddr_i32_nortn: 2076; GFX10: ; %bb.0: 2077; GFX10-NEXT: global_load_dword v5, v0, s[2:3] 2078; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 2079; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] 2080; GFX10-NEXT: s_mov_b64 s[0:1], 0 2081; GFX10-NEXT: .LBB18_1: ; %atomicrmw.start 2082; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 2083; GFX10-NEXT: s_waitcnt vmcnt(0) 2084; GFX10-NEXT: v_max_u32_e32 v4, v5, v1 2085; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc 2086; GFX10-NEXT: s_waitcnt vmcnt(0) 2087; GFX10-NEXT: buffer_gl1_inv 2088; GFX10-NEXT: buffer_gl0_inv 2089; GFX10-NEXT: 
v_cmp_eq_u32_e32 vcc, v0, v5 2090; GFX10-NEXT: v_mov_b32_e32 v5, v0 2091; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 2092; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] 2093; GFX10-NEXT: s_cbranch_execnz .LBB18_1 2094; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 2095; GFX10-NEXT: s_endpgm 2096; 2097; GFX11-LABEL: global_umax_saddr_i32_nortn: 2098; GFX11: ; %bb.0: 2099; GFX11-NEXT: global_load_b32 v5, v0, s[2:3] 2100; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 2101; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 2102; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] 2103; GFX11-NEXT: s_mov_b64 s[0:1], 0 2104; GFX11-NEXT: s_waitcnt_depctr 0xfffe 2105; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start 2106; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 2107; GFX11-NEXT: s_waitcnt vmcnt(0) 2108; GFX11-NEXT: v_max_u32_e32 v4, v5, v1 2109; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off glc 2110; GFX11-NEXT: s_waitcnt vmcnt(0) 2111; GFX11-NEXT: buffer_gl1_inv 2112; GFX11-NEXT: buffer_gl0_inv 2113; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 2114; GFX11-NEXT: v_mov_b32_e32 v5, v0 2115; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 2116; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2117; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] 2118; GFX11-NEXT: s_cbranch_execnz .LBB18_1 2119; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 2120; GFX11-NEXT: s_endpgm 2121; 2122; GFX12-LABEL: global_umax_saddr_i32_nortn: 2123; GFX12: ; %bb.0: 2124; GFX12-NEXT: global_load_b32 v5, v0, s[2:3] 2125; GFX12-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 2126; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 2127; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] 2128; GFX12-NEXT: s_mov_b64 s[0:1], 0 2129; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start 2130; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 2131; GFX12-NEXT: s_wait_loadcnt 0x0 2132; GFX12-NEXT: v_max_u32_e32 v4, v5, v1 2133; GFX12-NEXT: global_wb scope:SCOPE_SYS 2134; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off th:TH_ATOMIC_RETURN 
scope:SCOPE_SYS 2135; GFX12-NEXT: s_wait_loadcnt 0x0 2136; GFX12-NEXT: global_inv scope:SCOPE_SYS 2137; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 2138; GFX12-NEXT: v_mov_b32_e32 v5, v0 2139; GFX12-NEXT: s_wait_alu 0xfffe 2140; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 2141; GFX12-NEXT: s_wait_alu 0xfffe 2142; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] 2143; GFX12-NEXT: s_cbranch_execnz .LBB18_1 2144; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 2145; GFX12-NEXT: s_endpgm 2146 %zext.offset = zext i32 %voffset to i64 2147 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset 2148 %unused = atomicrmw umax ptr addrspace(1) %gep0, i32 %data seq_cst 2149 ret void 2150} 2151 2152define amdgpu_ps void @global_umax_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) { 2153; GFX9-LABEL: global_umax_saddr_i32_nortn_neg128: 2154; GFX9: ; %bb.0: 2155; GFX9-NEXT: global_load_dword v5, v0, s[2:3] offset:-128 2156; GFX9-NEXT: v_mov_b32_e32 v3, s3 2157; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 2158; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 2159; GFX9-NEXT: s_mov_b64 s[0:1], 0 2160; GFX9-NEXT: .LBB19_1: ; %atomicrmw.start 2161; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 2162; GFX9-NEXT: s_waitcnt vmcnt(0) 2163; GFX9-NEXT: v_max_u32_e32 v4, v5, v1 2164; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc 2165; GFX9-NEXT: s_waitcnt vmcnt(0) 2166; GFX9-NEXT: buffer_wbinvl1 2167; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 2168; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 2169; GFX9-NEXT: v_mov_b32_e32 v5, v0 2170; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] 2171; GFX9-NEXT: s_cbranch_execnz .LBB19_1 2172; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 2173; GFX9-NEXT: s_endpgm 2174; 2175; GFX10-LABEL: global_umax_saddr_i32_nortn_neg128: 2176; GFX10: ; %bb.0: 2177; GFX10-NEXT: global_load_dword v5, v0, s[2:3] offset:-128 2178; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 2179; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, 
s[0:1] 2180; GFX10-NEXT: s_mov_b64 s[0:1], 0 2181; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start 2182; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 2183; GFX10-NEXT: s_waitcnt vmcnt(0) 2184; GFX10-NEXT: v_max_u32_e32 v4, v5, v1 2185; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc 2186; GFX10-NEXT: s_waitcnt vmcnt(0) 2187; GFX10-NEXT: buffer_gl1_inv 2188; GFX10-NEXT: buffer_gl0_inv 2189; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 2190; GFX10-NEXT: v_mov_b32_e32 v5, v0 2191; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 2192; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] 2193; GFX10-NEXT: s_cbranch_execnz .LBB19_1 2194; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 2195; GFX10-NEXT: s_endpgm 2196; 2197; GFX11-LABEL: global_umax_saddr_i32_nortn_neg128: 2198; GFX11: ; %bb.0: 2199; GFX11-NEXT: global_load_b32 v5, v0, s[2:3] offset:-128 2200; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 2201; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 2202; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] 2203; GFX11-NEXT: s_mov_b64 s[0:1], 0 2204; GFX11-NEXT: s_waitcnt_depctr 0xfffe 2205; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start 2206; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 2207; GFX11-NEXT: s_waitcnt vmcnt(0) 2208; GFX11-NEXT: v_max_u32_e32 v4, v5, v1 2209; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 glc 2210; GFX11-NEXT: s_waitcnt vmcnt(0) 2211; GFX11-NEXT: buffer_gl1_inv 2212; GFX11-NEXT: buffer_gl0_inv 2213; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 2214; GFX11-NEXT: v_mov_b32_e32 v5, v0 2215; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 2216; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2217; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] 2218; GFX11-NEXT: s_cbranch_execnz .LBB19_1 2219; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 2220; GFX11-NEXT: s_endpgm 2221; 2222; GFX12-LABEL: global_umax_saddr_i32_nortn_neg128: 2223; GFX12: ; %bb.0: 2224; GFX12-NEXT: global_load_b32 v5, v0, s[2:3] offset:-128 2225; GFX12-NEXT: v_add_co_u32 v2, s[0:1], 
s2, v0 2226; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 2227; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] 2228; GFX12-NEXT: s_mov_b64 s[0:1], 0 2229; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start 2230; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 2231; GFX12-NEXT: s_wait_loadcnt 0x0 2232; GFX12-NEXT: v_max_u32_e32 v4, v5, v1 2233; GFX12-NEXT: global_wb scope:SCOPE_SYS 2234; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS 2235; GFX12-NEXT: s_wait_loadcnt 0x0 2236; GFX12-NEXT: global_inv scope:SCOPE_SYS 2237; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 2238; GFX12-NEXT: v_mov_b32_e32 v5, v0 2239; GFX12-NEXT: s_wait_alu 0xfffe 2240; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 2241; GFX12-NEXT: s_wait_alu 0xfffe 2242; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] 2243; GFX12-NEXT: s_cbranch_execnz .LBB19_1 2244; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 2245; GFX12-NEXT: s_endpgm 2246 %zext.offset = zext i32 %voffset to i64 2247 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset 2248 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 2249 %unused = atomicrmw umax ptr addrspace(1) %gep1, i32 %data seq_cst 2250 ret void 2251} 2252 2253define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) { 2254; GFX9-LABEL: global_umax_saddr_i64_rtn: 2255; GFX9: ; %bb.0: 2256; GFX9-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3] 2257; GFX9-NEXT: v_mov_b32_e32 v6, s3 2258; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 2259; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc 2260; GFX9-NEXT: s_mov_b64 s[0:1], 0 2261; GFX9-NEXT: .LBB20_1: ; %atomicrmw.start 2262; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 2263; GFX9-NEXT: s_waitcnt vmcnt(0) 2264; GFX9-NEXT: v_mov_b32_e32 v10, v4 2265; GFX9-NEXT: v_mov_b32_e32 v9, v3 2266; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2] 2267; GFX9-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc 2268; 
GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc 2269; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off glc 2270; GFX9-NEXT: s_waitcnt vmcnt(0) 2271; GFX9-NEXT: buffer_wbinvl1 2272; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] 2273; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 2274; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] 2275; GFX9-NEXT: s_cbranch_execnz .LBB20_1 2276; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 2277; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 2278; GFX9-NEXT: v_mov_b32_e32 v0, v3 2279; GFX9-NEXT: v_mov_b32_e32 v1, v4 2280; GFX9-NEXT: ; return to shader part epilog 2281; 2282; GFX10-LABEL: global_umax_saddr_i64_rtn: 2283; GFX10: ; %bb.0: 2284; GFX10-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3] 2285; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 2286; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1] 2287; GFX10-NEXT: s_mov_b64 s[0:1], 0 2288; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start 2289; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 2290; GFX10-NEXT: s_waitcnt vmcnt(0) 2291; GFX10-NEXT: v_mov_b32_e32 v10, v4 2292; GFX10-NEXT: v_mov_b32_e32 v9, v3 2293; GFX10-NEXT: v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2] 2294; GFX10-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc 2295; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc 2296; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off glc 2297; GFX10-NEXT: s_waitcnt vmcnt(0) 2298; GFX10-NEXT: buffer_gl1_inv 2299; GFX10-NEXT: buffer_gl0_inv 2300; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] 2301; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 2302; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] 2303; GFX10-NEXT: s_cbranch_execnz .LBB20_1 2304; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 2305; GFX10-NEXT: s_or_b64 exec, exec, s[0:1] 2306; GFX10-NEXT: v_mov_b32_e32 v0, v3 2307; GFX10-NEXT: v_mov_b32_e32 v1, v4 2308; GFX10-NEXT: ; return to shader part epilog 2309; 2310; GFX11-LABEL: global_umax_saddr_i64_rtn: 2311; GFX11: ; %bb.0: 2312; GFX11-NEXT: global_load_b64 v[3:4], v0, s[2:3] 2313; GFX11-NEXT: 
v_add_co_u32 v5, s[0:1], s2, v0 2314; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 2315; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] 2316; GFX11-NEXT: s_mov_b64 s[0:1], 0 2317; GFX11-NEXT: s_waitcnt_depctr 0xfffe 2318; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start 2319; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 2320; GFX11-NEXT: s_waitcnt vmcnt(0) 2321; GFX11-NEXT: v_mov_b32_e32 v10, v4 2322; GFX11-NEXT: v_mov_b32_e32 v9, v3 2323; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 2324; GFX11-NEXT: v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2] 2325; GFX11-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc 2326; GFX11-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc 2327; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off glc 2328; GFX11-NEXT: s_waitcnt vmcnt(0) 2329; GFX11-NEXT: buffer_gl1_inv 2330; GFX11-NEXT: buffer_gl0_inv 2331; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] 2332; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 2333; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2334; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] 2335; GFX11-NEXT: s_cbranch_execnz .LBB20_1 2336; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 2337; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] 2338; GFX11-NEXT: v_mov_b32_e32 v0, v3 2339; GFX11-NEXT: v_mov_b32_e32 v1, v4 2340; GFX11-NEXT: ; return to shader part epilog 2341; 2342; GFX12-LABEL: global_umax_saddr_i64_rtn: 2343; GFX12: ; %bb.0: 2344; GFX12-NEXT: global_load_b64 v[3:4], v0, s[2:3] 2345; GFX12-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 2346; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 2347; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] 2348; GFX12-NEXT: s_mov_b64 s[0:1], 0 2349; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start 2350; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 2351; GFX12-NEXT: s_wait_loadcnt 0x0 2352; GFX12-NEXT: v_mov_b32_e32 v10, v4 2353; GFX12-NEXT: v_mov_b32_e32 v9, v3 2354; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 2355; GFX12-NEXT: v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2] 2356; GFX12-NEXT: v_cndmask_b32_e32 v8, v2, v10, 
vcc 2357; GFX12-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc 2358; GFX12-NEXT: global_wb scope:SCOPE_SYS 2359; GFX12-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS 2360; GFX12-NEXT: s_wait_loadcnt 0x0 2361; GFX12-NEXT: global_inv scope:SCOPE_SYS 2362; GFX12-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] 2363; GFX12-NEXT: s_wait_alu 0xfffe 2364; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 2365; GFX12-NEXT: s_wait_alu 0xfffe 2366; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] 2367; GFX12-NEXT: s_cbranch_execnz .LBB20_1 2368; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 2369; GFX12-NEXT: s_or_b64 exec, exec, s[0:1] 2370; GFX12-NEXT: v_mov_b32_e32 v0, v3 2371; GFX12-NEXT: v_mov_b32_e32 v1, v4 2372; GFX12-NEXT: ; return to shader part epilog 2373 %zext.offset = zext i32 %voffset to i64 2374 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset 2375 %rtn = atomicrmw umax ptr addrspace(1) %gep0, i64 %data seq_cst 2376 %cast.rtn = bitcast i64 %rtn to <2 x float> 2377 ret <2 x float> %cast.rtn 2378} 2379 2380define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) { 2381; GFX9-LABEL: global_umax_saddr_i64_rtn_neg128: 2382; GFX9: ; %bb.0: 2383; GFX9-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3] offset:-128 2384; GFX9-NEXT: v_mov_b32_e32 v6, s3 2385; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 2386; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc 2387; GFX9-NEXT: s_mov_b64 s[0:1], 0 2388; GFX9-NEXT: .LBB21_1: ; %atomicrmw.start 2389; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 2390; GFX9-NEXT: s_waitcnt vmcnt(0) 2391; GFX9-NEXT: v_mov_b32_e32 v10, v4 2392; GFX9-NEXT: v_mov_b32_e32 v9, v3 2393; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2] 2394; GFX9-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc 2395; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc 2396; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off offset:-128 glc 2397; GFX9-NEXT: s_waitcnt 
vmcnt(0) 2398; GFX9-NEXT: buffer_wbinvl1 2399; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] 2400; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 2401; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] 2402; GFX9-NEXT: s_cbranch_execnz .LBB21_1 2403; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 2404; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 2405; GFX9-NEXT: v_mov_b32_e32 v0, v3 2406; GFX9-NEXT: v_mov_b32_e32 v1, v4 2407; GFX9-NEXT: ; return to shader part epilog 2408; 2409; GFX10-LABEL: global_umax_saddr_i64_rtn_neg128: 2410; GFX10: ; %bb.0: 2411; GFX10-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3] offset:-128 2412; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 2413; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1] 2414; GFX10-NEXT: s_mov_b64 s[0:1], 0 2415; GFX10-NEXT: .LBB21_1: ; %atomicrmw.start 2416; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 2417; GFX10-NEXT: s_waitcnt vmcnt(0) 2418; GFX10-NEXT: v_mov_b32_e32 v10, v4 2419; GFX10-NEXT: v_mov_b32_e32 v9, v3 2420; GFX10-NEXT: v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2] 2421; GFX10-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc 2422; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc 2423; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off offset:-128 glc 2424; GFX10-NEXT: s_waitcnt vmcnt(0) 2425; GFX10-NEXT: buffer_gl1_inv 2426; GFX10-NEXT: buffer_gl0_inv 2427; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] 2428; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 2429; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] 2430; GFX10-NEXT: s_cbranch_execnz .LBB21_1 2431; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 2432; GFX10-NEXT: s_or_b64 exec, exec, s[0:1] 2433; GFX10-NEXT: v_mov_b32_e32 v0, v3 2434; GFX10-NEXT: v_mov_b32_e32 v1, v4 2435; GFX10-NEXT: ; return to shader part epilog 2436; 2437; GFX11-LABEL: global_umax_saddr_i64_rtn_neg128: 2438; GFX11: ; %bb.0: 2439; GFX11-NEXT: global_load_b64 v[3:4], v0, s[2:3] offset:-128 2440; GFX11-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 2441; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 2442; GFX11-NEXT: 
v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] 2443; GFX11-NEXT: s_mov_b64 s[0:1], 0 2444; GFX11-NEXT: s_waitcnt_depctr 0xfffe 2445; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start 2446; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 2447; GFX11-NEXT: s_waitcnt vmcnt(0) 2448; GFX11-NEXT: v_mov_b32_e32 v10, v4 2449; GFX11-NEXT: v_mov_b32_e32 v9, v3 2450; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 2451; GFX11-NEXT: v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2] 2452; GFX11-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc 2453; GFX11-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc 2454; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off offset:-128 glc 2455; GFX11-NEXT: s_waitcnt vmcnt(0) 2456; GFX11-NEXT: buffer_gl1_inv 2457; GFX11-NEXT: buffer_gl0_inv 2458; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] 2459; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 2460; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2461; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] 2462; GFX11-NEXT: s_cbranch_execnz .LBB21_1 2463; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 2464; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] 2465; GFX11-NEXT: v_mov_b32_e32 v0, v3 2466; GFX11-NEXT: v_mov_b32_e32 v1, v4 2467; GFX11-NEXT: ; return to shader part epilog 2468; 2469; GFX12-LABEL: global_umax_saddr_i64_rtn_neg128: 2470; GFX12: ; %bb.0: 2471; GFX12-NEXT: global_load_b64 v[3:4], v0, s[2:3] offset:-128 2472; GFX12-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 2473; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 2474; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] 2475; GFX12-NEXT: s_mov_b64 s[0:1], 0 2476; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start 2477; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 2478; GFX12-NEXT: s_wait_loadcnt 0x0 2479; GFX12-NEXT: v_mov_b32_e32 v10, v4 2480; GFX12-NEXT: v_mov_b32_e32 v9, v3 2481; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 2482; GFX12-NEXT: v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2] 2483; GFX12-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc 2484; GFX12-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc 2485; 
GFX12-NEXT: global_wb scope:SCOPE_SYS 2486; GFX12-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS 2487; GFX12-NEXT: s_wait_loadcnt 0x0 2488; GFX12-NEXT: global_inv scope:SCOPE_SYS 2489; GFX12-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] 2490; GFX12-NEXT: s_wait_alu 0xfffe 2491; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 2492; GFX12-NEXT: s_wait_alu 0xfffe 2493; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] 2494; GFX12-NEXT: s_cbranch_execnz .LBB21_1 2495; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 2496; GFX12-NEXT: s_or_b64 exec, exec, s[0:1] 2497; GFX12-NEXT: v_mov_b32_e32 v0, v3 2498; GFX12-NEXT: v_mov_b32_e32 v1, v4 2499; GFX12-NEXT: ; return to shader part epilog 2500 %zext.offset = zext i32 %voffset to i64 2501 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset 2502 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 2503 %rtn = atomicrmw umax ptr addrspace(1) %gep1, i64 %data seq_cst 2504 %cast.rtn = bitcast i64 %rtn to <2 x float> 2505 ret <2 x float> %cast.rtn 2506} 2507 2508define amdgpu_ps void @global_umax_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) { 2509; GFX9-LABEL: global_umax_saddr_i64_nortn: 2510; GFX9: ; %bb.0: 2511; GFX9-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3] 2512; GFX9-NEXT: v_mov_b32_e32 v3, s3 2513; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0 2514; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc 2515; GFX9-NEXT: s_mov_b64 s[0:1], 0 2516; GFX9-NEXT: .LBB22_1: ; %atomicrmw.start 2517; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 2518; GFX9-NEXT: s_waitcnt vmcnt(0) 2519; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2] 2520; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 2521; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc 2522; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off glc 2523; GFX9-NEXT: s_waitcnt vmcnt(0) 2524; GFX9-NEXT: buffer_wbinvl1 2525; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] 2526; 
GFX9-NEXT: v_mov_b32_e32 v6, v4 2527; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 2528; GFX9-NEXT: v_mov_b32_e32 v5, v3 2529; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] 2530; GFX9-NEXT: s_cbranch_execnz .LBB22_1 2531; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 2532; GFX9-NEXT: s_endpgm 2533; 2534; GFX10-LABEL: global_umax_saddr_i64_nortn: 2535; GFX10: ; %bb.0: 2536; GFX10-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3] 2537; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 2538; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1] 2539; GFX10-NEXT: s_mov_b64 s[0:1], 0 2540; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start 2541; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 2542; GFX10-NEXT: s_waitcnt vmcnt(0) 2543; GFX10-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2] 2544; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 2545; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc 2546; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off glc 2547; GFX10-NEXT: s_waitcnt vmcnt(0) 2548; GFX10-NEXT: buffer_gl1_inv 2549; GFX10-NEXT: buffer_gl0_inv 2550; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] 2551; GFX10-NEXT: v_mov_b32_e32 v6, v4 2552; GFX10-NEXT: v_mov_b32_e32 v5, v3 2553; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 2554; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] 2555; GFX10-NEXT: s_cbranch_execnz .LBB22_1 2556; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 2557; GFX10-NEXT: s_endpgm 2558; 2559; GFX11-LABEL: global_umax_saddr_i64_nortn: 2560; GFX11: ; %bb.0: 2561; GFX11-NEXT: global_load_b64 v[5:6], v0, s[2:3] 2562; GFX11-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 2563; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 2564; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] 2565; GFX11-NEXT: s_mov_b64 s[0:1], 0 2566; GFX11-NEXT: s_waitcnt_depctr 0xfffe 2567; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start 2568; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 2569; GFX11-NEXT: s_waitcnt vmcnt(0) 2570; GFX11-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2] 2571; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 
2572; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc 2573; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off glc 2574; GFX11-NEXT: s_waitcnt vmcnt(0) 2575; GFX11-NEXT: buffer_gl1_inv 2576; GFX11-NEXT: buffer_gl0_inv 2577; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] 2578; GFX11-NEXT: v_mov_b32_e32 v6, v4 2579; GFX11-NEXT: v_mov_b32_e32 v5, v3 2580; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 2581; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2582; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] 2583; GFX11-NEXT: s_cbranch_execnz .LBB22_1 2584; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 2585; GFX11-NEXT: s_endpgm 2586; 2587; GFX12-LABEL: global_umax_saddr_i64_nortn: 2588; GFX12: ; %bb.0: 2589; GFX12-NEXT: global_load_b64 v[5:6], v0, s[2:3] 2590; GFX12-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 2591; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 2592; GFX12-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] 2593; GFX12-NEXT: s_mov_b64 s[0:1], 0 2594; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start 2595; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 2596; GFX12-NEXT: s_wait_loadcnt 0x0 2597; GFX12-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2] 2598; GFX12-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 2599; GFX12-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc 2600; GFX12-NEXT: global_wb scope:SCOPE_SYS 2601; GFX12-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS 2602; GFX12-NEXT: s_wait_loadcnt 0x0 2603; GFX12-NEXT: global_inv scope:SCOPE_SYS 2604; GFX12-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] 2605; GFX12-NEXT: v_mov_b32_e32 v6, v4 2606; GFX12-NEXT: v_mov_b32_e32 v5, v3 2607; GFX12-NEXT: s_wait_alu 0xfffe 2608; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 2609; GFX12-NEXT: s_wait_alu 0xfffe 2610; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] 2611; GFX12-NEXT: s_cbranch_execnz .LBB22_1 2612; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 2613; GFX12-NEXT: s_endpgm 2614 %zext.offset = zext i32 %voffset to i64 2615 %gep0 = getelementptr inbounds i8, ptr 
addrspace(1) %sbase, i64 %zext.offset 2616 %unused = atomicrmw umax ptr addrspace(1) %gep0, i64 %data seq_cst 2617 ret void 2618} 2619 2620define amdgpu_ps void @global_umax_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) { 2621; GFX9-LABEL: global_umax_saddr_i64_nortn_neg128: 2622; GFX9: ; %bb.0: 2623; GFX9-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3] offset:-128 2624; GFX9-NEXT: v_mov_b32_e32 v3, s3 2625; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0 2626; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc 2627; GFX9-NEXT: s_mov_b64 s[0:1], 0 2628; GFX9-NEXT: .LBB23_1: ; %atomicrmw.start 2629; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 2630; GFX9-NEXT: s_waitcnt vmcnt(0) 2631; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2] 2632; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 2633; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc 2634; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off offset:-128 glc 2635; GFX9-NEXT: s_waitcnt vmcnt(0) 2636; GFX9-NEXT: buffer_wbinvl1 2637; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] 2638; GFX9-NEXT: v_mov_b32_e32 v6, v4 2639; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 2640; GFX9-NEXT: v_mov_b32_e32 v5, v3 2641; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] 2642; GFX9-NEXT: s_cbranch_execnz .LBB23_1 2643; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 2644; GFX9-NEXT: s_endpgm 2645; 2646; GFX10-LABEL: global_umax_saddr_i64_nortn_neg128: 2647; GFX10: ; %bb.0: 2648; GFX10-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3] offset:-128 2649; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 2650; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1] 2651; GFX10-NEXT: s_mov_b64 s[0:1], 0 2652; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start 2653; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 2654; GFX10-NEXT: s_waitcnt vmcnt(0) 2655; GFX10-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2] 2656; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 2657; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc 2658; GFX10-NEXT: 
global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off offset:-128 glc 2659; GFX10-NEXT: s_waitcnt vmcnt(0) 2660; GFX10-NEXT: buffer_gl1_inv 2661; GFX10-NEXT: buffer_gl0_inv 2662; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] 2663; GFX10-NEXT: v_mov_b32_e32 v6, v4 2664; GFX10-NEXT: v_mov_b32_e32 v5, v3 2665; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 2666; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] 2667; GFX10-NEXT: s_cbranch_execnz .LBB23_1 2668; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 2669; GFX10-NEXT: s_endpgm 2670; 2671; GFX11-LABEL: global_umax_saddr_i64_nortn_neg128: 2672; GFX11: ; %bb.0: 2673; GFX11-NEXT: global_load_b64 v[5:6], v0, s[2:3] offset:-128 2674; GFX11-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 2675; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 2676; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] 2677; GFX11-NEXT: s_mov_b64 s[0:1], 0 2678; GFX11-NEXT: s_waitcnt_depctr 0xfffe 2679; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start 2680; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 2681; GFX11-NEXT: s_waitcnt vmcnt(0) 2682; GFX11-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2] 2683; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 2684; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc 2685; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off offset:-128 glc 2686; GFX11-NEXT: s_waitcnt vmcnt(0) 2687; GFX11-NEXT: buffer_gl1_inv 2688; GFX11-NEXT: buffer_gl0_inv 2689; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] 2690; GFX11-NEXT: v_mov_b32_e32 v6, v4 2691; GFX11-NEXT: v_mov_b32_e32 v5, v3 2692; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 2693; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2694; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] 2695; GFX11-NEXT: s_cbranch_execnz .LBB23_1 2696; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 2697; GFX11-NEXT: s_endpgm 2698; 2699; GFX12-LABEL: global_umax_saddr_i64_nortn_neg128: 2700; GFX12: ; %bb.0: 2701; GFX12-NEXT: global_load_b64 v[5:6], v0, s[2:3] offset:-128 2702; GFX12-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 2703; 
GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 2704; GFX12-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] 2705; GFX12-NEXT: s_mov_b64 s[0:1], 0 2706; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start 2707; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 2708; GFX12-NEXT: s_wait_loadcnt 0x0 2709; GFX12-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2] 2710; GFX12-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 2711; GFX12-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc 2712; GFX12-NEXT: global_wb scope:SCOPE_SYS 2713; GFX12-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS 2714; GFX12-NEXT: s_wait_loadcnt 0x0 2715; GFX12-NEXT: global_inv scope:SCOPE_SYS 2716; GFX12-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] 2717; GFX12-NEXT: v_mov_b32_e32 v6, v4 2718; GFX12-NEXT: v_mov_b32_e32 v5, v3 2719; GFX12-NEXT: s_wait_alu 0xfffe 2720; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 2721; GFX12-NEXT: s_wait_alu 0xfffe 2722; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] 2723; GFX12-NEXT: s_cbranch_execnz .LBB23_1 2724; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 2725; GFX12-NEXT: s_endpgm 2726 %zext.offset = zext i32 %voffset to i64 2727 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset 2728 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 2729 %unused = atomicrmw umax ptr addrspace(1) %gep1, i64 %data seq_cst 2730 ret void 2731} 2732 2733; -------------------------------------------------------------------------------- 2734; atomicrmw umin 2735; -------------------------------------------------------------------------------- 2736 2737define amdgpu_ps float @global_umin_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) { 2738; GFX9-LABEL: global_umin_saddr_i32_rtn: 2739; GFX9: ; %bb.0: 2740; GFX9-NEXT: v_mov_b32_e32 v2, v0 2741; GFX9-NEXT: global_load_dword v0, v0, s[2:3] 2742; GFX9-NEXT: v_mov_b32_e32 v3, s3 2743; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2 2744; GFX9-NEXT: v_addc_co_u32_e32 v3, 
vcc, 0, v3, vcc 2745; GFX9-NEXT: s_mov_b64 s[0:1], 0 2746; GFX9-NEXT: .LBB24_1: ; %atomicrmw.start 2747; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 2748; GFX9-NEXT: s_waitcnt vmcnt(0) 2749; GFX9-NEXT: v_mov_b32_e32 v5, v0 2750; GFX9-NEXT: v_min_u32_e32 v4, v5, v1 2751; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc 2752; GFX9-NEXT: s_waitcnt vmcnt(0) 2753; GFX9-NEXT: buffer_wbinvl1 2754; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 2755; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 2756; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] 2757; GFX9-NEXT: s_cbranch_execnz .LBB24_1 2758; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 2759; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 2760; GFX9-NEXT: ; return to shader part epilog 2761; 2762; GFX10-LABEL: global_umin_saddr_i32_rtn: 2763; GFX10: ; %bb.0: 2764; GFX10-NEXT: v_mov_b32_e32 v2, v0 2765; GFX10-NEXT: global_load_dword v0, v0, s[2:3] 2766; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 2767; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] 2768; GFX10-NEXT: s_mov_b64 s[0:1], 0 2769; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start 2770; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 2771; GFX10-NEXT: s_waitcnt vmcnt(0) 2772; GFX10-NEXT: v_mov_b32_e32 v5, v0 2773; GFX10-NEXT: v_min_u32_e32 v4, v5, v1 2774; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc 2775; GFX10-NEXT: s_waitcnt vmcnt(0) 2776; GFX10-NEXT: buffer_gl1_inv 2777; GFX10-NEXT: buffer_gl0_inv 2778; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 2779; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 2780; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] 2781; GFX10-NEXT: s_cbranch_execnz .LBB24_1 2782; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 2783; GFX10-NEXT: s_or_b64 exec, exec, s[0:1] 2784; GFX10-NEXT: ; return to shader part epilog 2785; 2786; GFX11-LABEL: global_umin_saddr_i32_rtn: 2787; GFX11: ; %bb.0: 2788; GFX11-NEXT: v_mov_b32_e32 v2, v0 2789; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] 2790; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 2791; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) 2792; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] 2793; GFX11-NEXT: s_mov_b64 s[0:1], 0 2794; GFX11-NEXT: s_waitcnt_depctr 0xfffe 2795; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start 2796; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 2797; GFX11-NEXT: s_waitcnt vmcnt(0) 2798; GFX11-NEXT: v_mov_b32_e32 v5, v0 2799; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 2800; GFX11-NEXT: v_min_u32_e32 v4, v5, v1 2801; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off glc 2802; GFX11-NEXT: s_waitcnt vmcnt(0) 2803; GFX11-NEXT: buffer_gl1_inv 2804; GFX11-NEXT: buffer_gl0_inv 2805; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 2806; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 2807; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2808; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] 2809; GFX11-NEXT: s_cbranch_execnz .LBB24_1 2810; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 2811; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] 2812; GFX11-NEXT: ; return to shader part epilog 2813; 2814; GFX12-LABEL: global_umin_saddr_i32_rtn: 2815; GFX12: ; %bb.0: 2816; GFX12-NEXT: v_mov_b32_e32 v2, v0 2817; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] 2818; GFX12-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 2819; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 2820; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] 2821; GFX12-NEXT: s_mov_b64 s[0:1], 0 2822; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start 2823; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 2824; GFX12-NEXT: s_wait_loadcnt 0x0 2825; GFX12-NEXT: v_mov_b32_e32 v5, v0 2826; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 2827; GFX12-NEXT: v_min_u32_e32 v4, v5, v1 2828; GFX12-NEXT: global_wb scope:SCOPE_SYS 2829; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS 2830; GFX12-NEXT: s_wait_loadcnt 0x0 2831; GFX12-NEXT: global_inv scope:SCOPE_SYS 2832; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 2833; GFX12-NEXT: s_wait_alu 0xfffe 2834; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 2835; GFX12-NEXT: 
s_wait_alu 0xfffe 2836; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] 2837; GFX12-NEXT: s_cbranch_execnz .LBB24_1 2838; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 2839; GFX12-NEXT: s_or_b64 exec, exec, s[0:1] 2840; GFX12-NEXT: ; return to shader part epilog 2841 %zext.offset = zext i32 %voffset to i64 2842 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset 2843 %rtn = atomicrmw umin ptr addrspace(1) %gep0, i32 %data seq_cst 2844 %cast.rtn = bitcast i32 %rtn to float 2845 ret float %cast.rtn 2846} 2847 2848define amdgpu_ps float @global_umin_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) { 2849; GFX9-LABEL: global_umin_saddr_i32_rtn_neg128: 2850; GFX9: ; %bb.0: 2851; GFX9-NEXT: v_mov_b32_e32 v2, v0 2852; GFX9-NEXT: global_load_dword v0, v0, s[2:3] offset:-128 2853; GFX9-NEXT: v_mov_b32_e32 v3, s3 2854; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2 2855; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 2856; GFX9-NEXT: s_mov_b64 s[0:1], 0 2857; GFX9-NEXT: .LBB25_1: ; %atomicrmw.start 2858; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 2859; GFX9-NEXT: s_waitcnt vmcnt(0) 2860; GFX9-NEXT: v_mov_b32_e32 v5, v0 2861; GFX9-NEXT: v_min_u32_e32 v4, v5, v1 2862; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc 2863; GFX9-NEXT: s_waitcnt vmcnt(0) 2864; GFX9-NEXT: buffer_wbinvl1 2865; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 2866; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 2867; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] 2868; GFX9-NEXT: s_cbranch_execnz .LBB25_1 2869; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 2870; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 2871; GFX9-NEXT: ; return to shader part epilog 2872; 2873; GFX10-LABEL: global_umin_saddr_i32_rtn_neg128: 2874; GFX10: ; %bb.0: 2875; GFX10-NEXT: v_mov_b32_e32 v2, v0 2876; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:-128 2877; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 2878; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] 2879; GFX10-NEXT: 
s_mov_b64 s[0:1], 0 2880; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start 2881; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 2882; GFX10-NEXT: s_waitcnt vmcnt(0) 2883; GFX10-NEXT: v_mov_b32_e32 v5, v0 2884; GFX10-NEXT: v_min_u32_e32 v4, v5, v1 2885; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc 2886; GFX10-NEXT: s_waitcnt vmcnt(0) 2887; GFX10-NEXT: buffer_gl1_inv 2888; GFX10-NEXT: buffer_gl0_inv 2889; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 2890; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 2891; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] 2892; GFX10-NEXT: s_cbranch_execnz .LBB25_1 2893; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 2894; GFX10-NEXT: s_or_b64 exec, exec, s[0:1] 2895; GFX10-NEXT: ; return to shader part epilog 2896; 2897; GFX11-LABEL: global_umin_saddr_i32_rtn_neg128: 2898; GFX11: ; %bb.0: 2899; GFX11-NEXT: v_mov_b32_e32 v2, v0 2900; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128 2901; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 2902; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 2903; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] 2904; GFX11-NEXT: s_mov_b64 s[0:1], 0 2905; GFX11-NEXT: s_waitcnt_depctr 0xfffe 2906; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start 2907; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 2908; GFX11-NEXT: s_waitcnt vmcnt(0) 2909; GFX11-NEXT: v_mov_b32_e32 v5, v0 2910; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 2911; GFX11-NEXT: v_min_u32_e32 v4, v5, v1 2912; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 glc 2913; GFX11-NEXT: s_waitcnt vmcnt(0) 2914; GFX11-NEXT: buffer_gl1_inv 2915; GFX11-NEXT: buffer_gl0_inv 2916; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 2917; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 2918; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2919; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] 2920; GFX11-NEXT: s_cbranch_execnz .LBB25_1 2921; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 2922; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] 2923; GFX11-NEXT: ; return to shader 
part epilog 2924; 2925; GFX12-LABEL: global_umin_saddr_i32_rtn_neg128: 2926; GFX12: ; %bb.0: 2927; GFX12-NEXT: v_mov_b32_e32 v2, v0 2928; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128 2929; GFX12-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 2930; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 2931; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] 2932; GFX12-NEXT: s_mov_b64 s[0:1], 0 2933; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start 2934; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 2935; GFX12-NEXT: s_wait_loadcnt 0x0 2936; GFX12-NEXT: v_mov_b32_e32 v5, v0 2937; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 2938; GFX12-NEXT: v_min_u32_e32 v4, v5, v1 2939; GFX12-NEXT: global_wb scope:SCOPE_SYS 2940; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS 2941; GFX12-NEXT: s_wait_loadcnt 0x0 2942; GFX12-NEXT: global_inv scope:SCOPE_SYS 2943; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 2944; GFX12-NEXT: s_wait_alu 0xfffe 2945; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 2946; GFX12-NEXT: s_wait_alu 0xfffe 2947; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] 2948; GFX12-NEXT: s_cbranch_execnz .LBB25_1 2949; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 2950; GFX12-NEXT: s_or_b64 exec, exec, s[0:1] 2951; GFX12-NEXT: ; return to shader part epilog 2952 %zext.offset = zext i32 %voffset to i64 2953 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset 2954 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 2955 %rtn = atomicrmw umin ptr addrspace(1) %gep1, i32 %data seq_cst 2956 %cast.rtn = bitcast i32 %rtn to float 2957 ret float %cast.rtn 2958} 2959 2960define amdgpu_ps void @global_umin_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) { 2961; GFX9-LABEL: global_umin_saddr_i32_nortn: 2962; GFX9: ; %bb.0: 2963; GFX9-NEXT: global_load_dword v5, v0, s[2:3] 2964; GFX9-NEXT: v_mov_b32_e32 v3, s3 2965; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 2966; GFX9-NEXT: 
v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 2967; GFX9-NEXT: s_mov_b64 s[0:1], 0 2968; GFX9-NEXT: .LBB26_1: ; %atomicrmw.start 2969; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 2970; GFX9-NEXT: s_waitcnt vmcnt(0) 2971; GFX9-NEXT: v_min_u32_e32 v4, v5, v1 2972; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc 2973; GFX9-NEXT: s_waitcnt vmcnt(0) 2974; GFX9-NEXT: buffer_wbinvl1 2975; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 2976; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 2977; GFX9-NEXT: v_mov_b32_e32 v5, v0 2978; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] 2979; GFX9-NEXT: s_cbranch_execnz .LBB26_1 2980; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 2981; GFX9-NEXT: s_endpgm 2982; 2983; GFX10-LABEL: global_umin_saddr_i32_nortn: 2984; GFX10: ; %bb.0: 2985; GFX10-NEXT: global_load_dword v5, v0, s[2:3] 2986; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 2987; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] 2988; GFX10-NEXT: s_mov_b64 s[0:1], 0 2989; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start 2990; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 2991; GFX10-NEXT: s_waitcnt vmcnt(0) 2992; GFX10-NEXT: v_min_u32_e32 v4, v5, v1 2993; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc 2994; GFX10-NEXT: s_waitcnt vmcnt(0) 2995; GFX10-NEXT: buffer_gl1_inv 2996; GFX10-NEXT: buffer_gl0_inv 2997; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 2998; GFX10-NEXT: v_mov_b32_e32 v5, v0 2999; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 3000; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] 3001; GFX10-NEXT: s_cbranch_execnz .LBB26_1 3002; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 3003; GFX10-NEXT: s_endpgm 3004; 3005; GFX11-LABEL: global_umin_saddr_i32_nortn: 3006; GFX11: ; %bb.0: 3007; GFX11-NEXT: global_load_b32 v5, v0, s[2:3] 3008; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 3009; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 3010; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] 3011; GFX11-NEXT: s_mov_b64 s[0:1], 0 3012; GFX11-NEXT: s_waitcnt_depctr 0xfffe 3013; GFX11-NEXT: .LBB26_1: 
; %atomicrmw.start 3014; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 3015; GFX11-NEXT: s_waitcnt vmcnt(0) 3016; GFX11-NEXT: v_min_u32_e32 v4, v5, v1 3017; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off glc 3018; GFX11-NEXT: s_waitcnt vmcnt(0) 3019; GFX11-NEXT: buffer_gl1_inv 3020; GFX11-NEXT: buffer_gl0_inv 3021; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 3022; GFX11-NEXT: v_mov_b32_e32 v5, v0 3023; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 3024; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 3025; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] 3026; GFX11-NEXT: s_cbranch_execnz .LBB26_1 3027; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 3028; GFX11-NEXT: s_endpgm 3029; 3030; GFX12-LABEL: global_umin_saddr_i32_nortn: 3031; GFX12: ; %bb.0: 3032; GFX12-NEXT: global_load_b32 v5, v0, s[2:3] 3033; GFX12-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 3034; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 3035; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] 3036; GFX12-NEXT: s_mov_b64 s[0:1], 0 3037; GFX12-NEXT: .LBB26_1: ; %atomicrmw.start 3038; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 3039; GFX12-NEXT: s_wait_loadcnt 0x0 3040; GFX12-NEXT: v_min_u32_e32 v4, v5, v1 3041; GFX12-NEXT: global_wb scope:SCOPE_SYS 3042; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS 3043; GFX12-NEXT: s_wait_loadcnt 0x0 3044; GFX12-NEXT: global_inv scope:SCOPE_SYS 3045; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 3046; GFX12-NEXT: v_mov_b32_e32 v5, v0 3047; GFX12-NEXT: s_wait_alu 0xfffe 3048; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 3049; GFX12-NEXT: s_wait_alu 0xfffe 3050; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] 3051; GFX12-NEXT: s_cbranch_execnz .LBB26_1 3052; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 3053; GFX12-NEXT: s_endpgm 3054 %zext.offset = zext i32 %voffset to i64 3055 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset 3056 %unused = atomicrmw umin ptr addrspace(1) %gep0, i32 %data seq_cst 3057 ret 
void
}

; i32 umin whose result is unused, addressed as %sbase + zext(%voffset) - 128.
; Every target is expected to emit an initial saddr-form load followed by a
; global_atomic_cmpswap retry loop on a VGPR address; the -128 stays in the
; instruction offset field of both the load and the cmpswap.
define amdgpu_ps void @global_umin_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_umin_saddr_i32_nortn_neg128:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    global_load_dword v5, v0, s[2:3] offset:-128
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    s_mov_b64 s[0:1], 0
; GFX9-NEXT:  .LBB27_1: ; %atomicrmw.start
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_min_u32_e32 v4, v5, v1
; GFX9-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v5, v0
; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT:    s_cbranch_execnz .LBB27_1
; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_umin_saddr_i32_nortn_neg128:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    global_load_dword v5, v0, s[2:3] offset:-128
; GFX10-NEXT:    v_add_co_u32 v2, s[0:1], s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT:    s_mov_b64 s[0:1], 0
; GFX10-NEXT:  .LBB27_1: ; %atomicrmw.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_min_u32_e32 v4, v5, v1
; GFX10-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX10-NEXT:    v_mov_b32_e32 v5, v0
; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT:    s_cbranch_execnz .LBB27_1
; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: global_umin_saddr_i32_nortn_neg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b32 v5, v0, s[2:3] offset:-128
; GFX11-NEXT:    v_add_co_u32 v2, s[0:1], s2, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
; GFX11-NEXT:    s_mov_b64 s[0:1], 0
; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
; GFX11-NEXT:  .LBB27_1: ; %atomicrmw.start
; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_min_u32_e32 v4, v5, v1
; GFX11-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 glc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_gl1_inv
; GFX11-NEXT:    buffer_gl0_inv
; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX11-NEXT:    v_mov_b32_e32 v5, v0
; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT:    s_cbranch_execnz .LBB27_1
; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: global_umin_saddr_i32_nortn_neg128:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    global_load_b32 v5, v0, s[2:3] offset:-128
; GFX12-NEXT:    v_add_co_u32 v2, s[0:1], s2, v0
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
; GFX12-NEXT:    s_mov_b64 s[0:1], 0
; GFX12-NEXT:  .LBB27_1: ; %atomicrmw.start
; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    v_min_u32_e32 v4, v5, v1
; GFX12-NEXT:    global_wb scope:SCOPE_SYS
; GFX12-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    global_inv scope:SCOPE_SYS
; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX12-NEXT:    v_mov_b32_e32 v5, v0
; GFX12-NEXT:    s_wait_alu 0xfffe
; GFX12-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX12-NEXT:    s_wait_alu 0xfffe
; GFX12-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
; GFX12-NEXT:    s_cbranch_execnz .LBB27_1
; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT:    s_endpgm
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
  %unused = atomicrmw umin ptr addrspace(1) %gep1, i32 %data seq_cst
  ret void
}

; i64 umin whose old value is returned (bitcast to <2 x float>), addressed as
; %sbase + zext(%voffset). The checks expect a 64-bit compare plus two
; v_cndmask selects to form the new value inside a cmpswap retry loop, and the
; loaded pair v[3:4] copied to v0/v1 after the loop.
define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_umin_saddr_i64_rtn:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    global_load_dwordx2 v[3:4], v0, s[2:3]
; GFX9-NEXT:    v_mov_b32_e32 v6, s3
; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
; GFX9-NEXT:    s_mov_b64 s[0:1], 0
; GFX9-NEXT:  .LBB28_1: ; %atomicrmw.start
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v10, v4
; GFX9-NEXT:    v_mov_b32_e32 v9, v3
; GFX9-NEXT:    v_cmp_le_u64_e32 vcc, v[9:10], v[1:2]
; GFX9-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX9-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX9-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT:    s_cbranch_execnz .LBB28_1
; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v0, v3
; GFX9-NEXT:    v_mov_b32_e32 v1, v4
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_umin_saddr_i64_rtn:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    global_load_dwordx2 v[3:4], v0, s[2:3]
; GFX10-NEXT:    v_add_co_u32 v5, s[0:1], s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT:    s_mov_b64 s[0:1], 0
; GFX10-NEXT:  .LBB28_1: ; %atomicrmw.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v10, v4
; GFX10-NEXT:    v_mov_b32_e32 v9, v3
; GFX10-NEXT:    v_cmp_le_u64_e32 vcc, v[9:10], v[1:2]
; GFX10-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX10-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX10-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT:    s_cbranch_execnz .LBB28_1
; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX10-NEXT:    v_mov_b32_e32 v0, v3
; GFX10-NEXT:    v_mov_b32_e32 v1, v4
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_umin_saddr_i64_rtn:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b64 v[3:4], v0, s[2:3]
; GFX11-NEXT:    v_add_co_u32 v5, s[0:1], s2, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1]
; GFX11-NEXT:    s_mov_b64 s[0:1], 0
; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
; GFX11-NEXT:  .LBB28_1: ; %atomicrmw.start
; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v10, v4
; GFX11-NEXT:    v_mov_b32_e32 v9, v3
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_cmp_le_u64_e32 vcc, v[9:10], v[1:2]
; GFX11-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX11-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX11-NEXT:    global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off glc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_gl1_inv
; GFX11-NEXT:    buffer_gl0_inv
; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT:    s_cbranch_execnz .LBB28_1
; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX11-NEXT:    v_mov_b32_e32 v0, v3
; GFX11-NEXT:    v_mov_b32_e32 v1, v4
; GFX11-NEXT:    ; return to shader part epilog
;
; GFX12-LABEL: global_umin_saddr_i64_rtn:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    global_load_b64 v[3:4], v0, s[2:3]
; GFX12-NEXT:    v_add_co_u32 v5, s[0:1], s2, v0
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT:    v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1]
; GFX12-NEXT:    s_mov_b64 s[0:1], 0
; GFX12-NEXT:  .LBB28_1: ; %atomicrmw.start
; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    v_mov_b32_e32 v10, v4
; GFX12-NEXT:    v_mov_b32_e32 v9, v3
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT:    v_cmp_le_u64_e32 vcc, v[9:10], v[1:2]
; GFX12-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX12-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX12-NEXT:    global_wb scope:SCOPE_SYS
; GFX12-NEXT:    global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    global_inv scope:SCOPE_SYS
; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX12-NEXT:    s_wait_alu 0xfffe
; GFX12-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX12-NEXT:    s_wait_alu 0xfffe
; GFX12-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
; GFX12-NEXT:    s_cbranch_execnz .LBB28_1
; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX12-NEXT:    v_mov_b32_e32 v0, v3
; GFX12-NEXT:    v_mov_b32_e32 v1, v4
; GFX12-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  %rtn = atomicrmw umin ptr addrspace(1) %gep0, i64 %data seq_cst
  %cast.rtn = bitcast i64 %rtn to <2 x float>
  ret <2 x float> %cast.rtn
}

; Same as the i64 returning case, with an extra constant -128 byte offset; the
; checks expect it in the offset field of both the initial load and the
; cmpswap.
define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_umin_saddr_i64_rtn_neg128:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    global_load_dwordx2 v[3:4], v0, s[2:3] offset:-128
; GFX9-NEXT:    v_mov_b32_e32 v6, s3
; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
; GFX9-NEXT:    s_mov_b64 s[0:1], 0
; GFX9-NEXT:  .LBB29_1: ; %atomicrmw.start
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v10, v4
; GFX9-NEXT:    v_mov_b32_e32 v9, v3
; GFX9-NEXT:    v_cmp_le_u64_e32 vcc, v[9:10], v[1:2]
; GFX9-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX9-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX9-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off offset:-128 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT:    s_cbranch_execnz .LBB29_1
; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v0, v3
; GFX9-NEXT:    v_mov_b32_e32 v1, v4
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_umin_saddr_i64_rtn_neg128:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    global_load_dwordx2 v[3:4], v0, s[2:3] offset:-128
; GFX10-NEXT:    v_add_co_u32 v5, s[0:1], s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT:    s_mov_b64 s[0:1], 0
; GFX10-NEXT:  .LBB29_1: ; %atomicrmw.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v10, v4
; GFX10-NEXT:    v_mov_b32_e32 v9, v3
; GFX10-NEXT:    v_cmp_le_u64_e32 vcc, v[9:10], v[1:2]
; GFX10-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX10-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX10-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off offset:-128 glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT:    s_cbranch_execnz .LBB29_1
; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX10-NEXT:    v_mov_b32_e32 v0, v3
; GFX10-NEXT:    v_mov_b32_e32 v1, v4
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_umin_saddr_i64_rtn_neg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b64 v[3:4], v0, s[2:3] offset:-128
; GFX11-NEXT:    v_add_co_u32 v5, s[0:1], s2, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1]
; GFX11-NEXT:    s_mov_b64 s[0:1], 0
; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
; GFX11-NEXT:  .LBB29_1: ; %atomicrmw.start
; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v10, v4
; GFX11-NEXT:    v_mov_b32_e32 v9, v3
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_cmp_le_u64_e32 vcc, v[9:10], v[1:2]
; GFX11-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX11-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX11-NEXT:    global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off offset:-128 glc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_gl1_inv
; GFX11-NEXT:    buffer_gl0_inv
; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT:    s_cbranch_execnz .LBB29_1
; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX11-NEXT:    v_mov_b32_e32 v0, v3
; GFX11-NEXT:    v_mov_b32_e32 v1, v4
; GFX11-NEXT:    ; return to shader part epilog
;
; GFX12-LABEL: global_umin_saddr_i64_rtn_neg128:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    global_load_b64 v[3:4], v0, s[2:3] offset:-128
; GFX12-NEXT:    v_add_co_u32 v5, s[0:1], s2, v0
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT:    v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1]
; GFX12-NEXT:    s_mov_b64 s[0:1], 0
; GFX12-NEXT:  .LBB29_1: ; %atomicrmw.start
; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    v_mov_b32_e32 v10, v4
; GFX12-NEXT:    v_mov_b32_e32 v9, v3
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT:    v_cmp_le_u64_e32 vcc, v[9:10], v[1:2]
; GFX12-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX12-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX12-NEXT:    global_wb scope:SCOPE_SYS
; GFX12-NEXT:    global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    global_inv scope:SCOPE_SYS
; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX12-NEXT:    s_wait_alu 0xfffe
; GFX12-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX12-NEXT:    s_wait_alu 0xfffe
; GFX12-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
; GFX12-NEXT:    s_cbranch_execnz .LBB29_1
; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX12-NEXT:    v_mov_b32_e32 v0, v3
; GFX12-NEXT:    v_mov_b32_e32 v1, v4
; GFX12-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
  %rtn = atomicrmw umin ptr addrspace(1) %gep1, i64 %data seq_cst
  %cast.rtn = bitcast i64 %rtn to <2 x float>
  ret <2 x float> %cast.rtn
}

; i64 umin whose result is unused, addressed as %sbase + zext(%voffset). With
; no return value the checks expect the cmpswap result to be copied back into
; the compare registers v[5:6] inside the loop, and the program to end with
; s_endpgm.
define amdgpu_ps void @global_umin_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_umin_saddr_i64_nortn:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    global_load_dwordx2 v[5:6], v0, s[2:3]
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v3, vcc
; GFX9-NEXT:    s_mov_b64 s[0:1], 0
; GFX9-NEXT:  .LBB30_1: ; %atomicrmw.start
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_cmp_le_u64_e32 vcc, v[5:6], v[1:2]
; GFX9-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX9-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX9-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX9-NEXT:    v_mov_b32_e32 v6, v4
; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v5, v3
; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT:    s_cbranch_execnz .LBB30_1
; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_umin_saddr_i64_nortn:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    global_load_dwordx2 v[5:6], v0, s[2:3]
; GFX10-NEXT:    v_add_co_u32 v7, s[0:1], s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT:    s_mov_b64 s[0:1], 0
; GFX10-NEXT:  .LBB30_1: ; %atomicrmw.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cmp_le_u64_e32 vcc, v[5:6], v[1:2]
; GFX10-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX10-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX10-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX10-NEXT:    v_mov_b32_e32 v6, v4
; GFX10-NEXT:    v_mov_b32_e32 v5, v3
; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT:    s_cbranch_execnz .LBB30_1
; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: global_umin_saddr_i64_nortn:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b64 v[5:6], v0, s[2:3]
; GFX11-NEXT:    v_add_co_u32 v7, s[0:1], s2, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1]
; GFX11-NEXT:    s_mov_b64 s[0:1], 0
; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
; GFX11-NEXT:  .LBB30_1: ; %atomicrmw.start
; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_cmp_le_u64_e32 vcc, v[5:6], v[1:2]
; GFX11-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX11-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX11-NEXT:    global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off glc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_gl1_inv
; GFX11-NEXT:    buffer_gl0_inv
; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX11-NEXT:    v_mov_b32_e32 v6, v4
; GFX11-NEXT:    v_mov_b32_e32 v5, v3
; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT:    s_cbranch_execnz .LBB30_1
; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: global_umin_saddr_i64_nortn:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    global_load_b64 v[5:6], v0, s[2:3]
; GFX12-NEXT:    v_add_co_u32 v7, s[0:1], s2, v0
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT:    v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1]
; GFX12-NEXT:    s_mov_b64 s[0:1], 0
; GFX12-NEXT:  .LBB30_1: ; %atomicrmw.start
; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    v_cmp_le_u64_e32 vcc, v[5:6], v[1:2]
; GFX12-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX12-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX12-NEXT:    global_wb scope:SCOPE_SYS
; GFX12-NEXT:    global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    global_inv scope:SCOPE_SYS
; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX12-NEXT:    v_mov_b32_e32 v6, v4
; GFX12-NEXT:    v_mov_b32_e32 v5, v3
; GFX12-NEXT:    s_wait_alu 0xfffe
; GFX12-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX12-NEXT:    s_wait_alu 0xfffe
; GFX12-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
; GFX12-NEXT:    s_cbranch_execnz .LBB30_1
; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT:    s_endpgm
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  %unused = atomicrmw umin ptr addrspace(1) %gep0, i64 %data seq_cst
  ret void
}

; Same as the i64 no-return case, with an extra constant -128 byte offset; the
; checks expect it in the offset field of both the initial load and the
; cmpswap.
define amdgpu_ps void @global_umin_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_umin_saddr_i64_nortn_neg128:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    global_load_dwordx2 v[5:6], v0, s[2:3] offset:-128
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v3, vcc
; GFX9-NEXT:    s_mov_b64 s[0:1], 0
; GFX9-NEXT:  .LBB31_1: ; %atomicrmw.start
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_cmp_le_u64_e32 vcc, v[5:6], v[1:2]
; GFX9-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX9-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX9-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off offset:-128 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX9-NEXT:    v_mov_b32_e32 v6, v4
; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v5, v3
; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT:    s_cbranch_execnz .LBB31_1
; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_umin_saddr_i64_nortn_neg128:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    global_load_dwordx2 v[5:6], v0, s[2:3] offset:-128
; GFX10-NEXT:    v_add_co_u32 v7, s[0:1], s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT:    s_mov_b64 s[0:1], 0
; GFX10-NEXT:  .LBB31_1: ; %atomicrmw.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cmp_le_u64_e32 vcc, v[5:6], v[1:2]
; GFX10-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX10-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX10-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off offset:-128 glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX10-NEXT:    v_mov_b32_e32 v6, v4
; GFX10-NEXT:    v_mov_b32_e32 v5, v3
; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT:    s_cbranch_execnz .LBB31_1
; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: global_umin_saddr_i64_nortn_neg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b64 v[5:6], v0, s[2:3] offset:-128
; GFX11-NEXT:    v_add_co_u32 v7, s[0:1], s2, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1]
; GFX11-NEXT:    s_mov_b64 s[0:1], 0
; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
; GFX11-NEXT:  .LBB31_1: ; %atomicrmw.start
; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_cmp_le_u64_e32 vcc, v[5:6], v[1:2]
; GFX11-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX11-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX11-NEXT:    global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off offset:-128 glc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_gl1_inv
; GFX11-NEXT:    buffer_gl0_inv
; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX11-NEXT:    v_mov_b32_e32 v6, v4
; GFX11-NEXT:    v_mov_b32_e32 v5, v3
; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT:    s_cbranch_execnz .LBB31_1
; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: global_umin_saddr_i64_nortn_neg128:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    global_load_b64 v[5:6], v0, s[2:3] offset:-128
; GFX12-NEXT:    v_add_co_u32 v7, s[0:1], s2, v0
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT:    v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1]
; GFX12-NEXT:    s_mov_b64 s[0:1], 0
; GFX12-NEXT:  .LBB31_1: ; %atomicrmw.start
; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    v_cmp_le_u64_e32 vcc, v[5:6], v[1:2]
; GFX12-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX12-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX12-NEXT:    global_wb scope:SCOPE_SYS
; GFX12-NEXT:    global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    global_inv scope:SCOPE_SYS
; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX12-NEXT:    v_mov_b32_e32 v6, v4
; GFX12-NEXT:    v_mov_b32_e32 v5, v3
; GFX12-NEXT:    s_wait_alu 0xfffe
; GFX12-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX12-NEXT:    s_wait_alu 0xfffe
; GFX12-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
; GFX12-NEXT:    s_cbranch_execnz .LBB31_1
; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT:    s_endpgm
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
  %unused = atomicrmw umin ptr addrspace(1) %gep1, i64 %data seq_cst
  ret void
}

; NOTE(review): none of the umin functions in this part of the file reference
; attribute group 0 - confirm whether anything earlier in the file still uses it.
attributes #0 = { argmemonly nounwind willreturn }