; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s

; Test naming used below:
;   noret / ret   - the atomicrmw result is unused / returned.
;   offset        - the address is a GEP 4 elements (16 bytes) past the argument.
;   scalar        - amdgpu_gfx function whose arguments are marked inreg.
;   __amdgpu_no_remote_memory - the atomicrmw carries !amdgpu.no.remote.memory.

; ---------------------------------------------------------------------
; atomicrmw xchg
; ---------------------------------------------------------------------

; i32 xchg, result unused, no address offset.
define void @global_atomic_xchg_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
; SI-LABEL: global_atomic_xchg_i32_noret:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s6
; SI-NEXT:    buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_wbinvl1
; SI-NEXT:    s_waitcnt expcnt(0)
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_xchg_i32_noret:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    flat_atomic_swap v[0:1], v2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_wbinvl1_vol
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_xchg_i32_noret:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_swap v[0:1], v2, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1_vol
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %tmp0 = atomicrmw xchg ptr addrspace(1) %ptr, i32 %in seq_cst
  ret void
}

; i32 xchg, result unused, address offset by 16 bytes.
define void @global_atomic_xchg_i32_noret_offset(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: global_atomic_xchg_i32_noret_offset:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s6
; SI-NEXT:    buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64 offset:16
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_wbinvl1
; SI-NEXT:    s_waitcnt expcnt(0)
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_xchg_i32_noret_offset:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_atomic_swap v[0:1], v2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_wbinvl1_vol
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_xchg_i32_noret_offset:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_swap v[0:1], v2, off offset:16
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1_vol
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
  %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, i32 %in seq_cst
  ret void
}

; i32 xchg, old value returned, no address offset.
define i32 @global_atomic_xchg_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
; SI-LABEL: global_atomic_xchg_i32_ret:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s6
; SI-NEXT:    buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_wbinvl1
; SI-NEXT:    v_mov_b32_e32 v0, v2
; SI-NEXT:    s_waitcnt expcnt(0)
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_xchg_i32_ret:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_wbinvl1_vol
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_xchg_i32_ret:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_swap v0, v[0:1], v2, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1_vol
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %result = atomicrmw xchg ptr addrspace(1) %ptr, i32 %in seq_cst
  ret i32 %result
}

; i32 xchg, old value returned, address offset by 16 bytes.
define i32 @global_atomic_xchg_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: global_atomic_xchg_i32_ret_offset:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s6
; SI-NEXT:    buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_wbinvl1
; SI-NEXT:    v_mov_b32_e32 v0, v2
; SI-NEXT:    s_waitcnt expcnt(0)
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_xchg_i32_ret_offset:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_wbinvl1_vol
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_xchg_i32_ret_offset:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_swap v0, v[0:1], v2, off offset:16 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1_vol
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
  %result = atomicrmw xchg ptr addrspace(1) %gep, i32 %in seq_cst
  ret i32 %result
}

; i32 xchg with inreg arguments, result unused, no address offset.
define amdgpu_gfx void @global_atomic_xchg_i32_noret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
; SI-LABEL: global_atomic_xchg_i32_noret_scalar:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT:    s_mov_b64 exec, s[34:35]
; SI-NEXT:    s_waitcnt expcnt(0)
; SI-NEXT:    v_writelane_b32 v1, s6, 0
; SI-NEXT:    v_writelane_b32 v1, s7, 1
; SI-NEXT:    s_mov_b32 s34, s6
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    v_mov_b32_e32 v0, s34
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_wbinvl1
; SI-NEXT:    v_readlane_b32 s7, v1, 1
; SI-NEXT:    v_readlane_b32 s6, v1, 0
; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT:    s_mov_b64 exec, s[34:35]
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_xchg_i32_noret_scalar:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_mov_b32_e32 v2, s6
; VI-NEXT:    flat_atomic_swap v[0:1], v2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_wbinvl1_vol
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_xchg_i32_noret_scalar:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, s6
; GFX9-NEXT:    global_atomic_swap v0, v1, s[4:5]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1_vol
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %tmp0 = atomicrmw xchg ptr addrspace(1) %ptr, i32 %in seq_cst
  ret void
}

; i32 xchg with inreg arguments, result unused, address offset by 16 bytes.
define amdgpu_gfx void @global_atomic_xchg_i32_noret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
; SI-LABEL: global_atomic_xchg_i32_noret_offset_scalar:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT:    s_mov_b64 exec, s[34:35]
; SI-NEXT:    s_waitcnt expcnt(0)
; SI-NEXT:    v_writelane_b32 v1, s6, 0
; SI-NEXT:    v_writelane_b32 v1, s7, 1
; SI-NEXT:    s_mov_b32 s34, s6
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    v_mov_b32_e32 v0, s34
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 offset:16
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_wbinvl1
; SI-NEXT:    v_readlane_b32 s7, v1, 1
; SI-NEXT:    v_readlane_b32 s6, v1, 0
; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT:    s_mov_b64 exec, s[34:35]
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_xchg_i32_noret_offset_scalar:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    s_add_u32 s34, s4, 16
; VI-NEXT:    s_addc_u32 s35, s5, 0
; VI-NEXT:    v_mov_b32_e32 v0, s34
; VI-NEXT:    v_mov_b32_e32 v1, s35
; VI-NEXT:    v_mov_b32_e32 v2, s6
; VI-NEXT:    flat_atomic_swap v[0:1], v2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_wbinvl1_vol
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_xchg_i32_noret_offset_scalar:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, s6
; GFX9-NEXT:    global_atomic_swap v0, v1, s[4:5] offset:16
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1_vol
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
  %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, i32 %in seq_cst
  ret void
}

; i32 xchg with inreg arguments, old value returned, no address offset.
define amdgpu_gfx i32 @global_atomic_xchg_i32_ret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
; SI-LABEL: global_atomic_xchg_i32_ret_scalar:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT:    s_mov_b64 exec, s[34:35]
; SI-NEXT:    s_waitcnt expcnt(0)
; SI-NEXT:    v_writelane_b32 v1, s6, 0
; SI-NEXT:    v_writelane_b32 v1, s7, 1
; SI-NEXT:    s_mov_b32 s34, s6
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    v_mov_b32_e32 v0, s34
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_wbinvl1
; SI-NEXT:    v_readlane_b32 s7, v1, 1
; SI-NEXT:    v_readlane_b32 s6, v1, 0
; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT:    s_mov_b64 exec, s[34:35]
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_xchg_i32_ret_scalar:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_mov_b32_e32 v2, s6
; VI-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_wbinvl1_vol
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_xchg_i32_ret_scalar:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, s6
; GFX9-NEXT:    global_atomic_swap v0, v0, v1, s[4:5] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1_vol
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %result = atomicrmw xchg ptr addrspace(1) %ptr, i32 %in seq_cst
  ret i32 %result
}

; i32 xchg with inreg arguments, old value returned, address offset by 16 bytes.
define amdgpu_gfx i32 @global_atomic_xchg_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
; SI-LABEL: global_atomic_xchg_i32_ret_offset_scalar:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT:    s_mov_b64 exec, s[34:35]
; SI-NEXT:    s_waitcnt expcnt(0)
; SI-NEXT:    v_writelane_b32 v1, s6, 0
; SI-NEXT:    v_writelane_b32 v1, s7, 1
; SI-NEXT:    s_mov_b32 s34, s6
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    v_mov_b32_e32 v0, s34
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 offset:16 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_wbinvl1
; SI-NEXT:    v_readlane_b32 s7, v1, 1
; SI-NEXT:    v_readlane_b32 s6, v1, 0
; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT:    s_mov_b64 exec, s[34:35]
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_xchg_i32_ret_offset_scalar:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    s_add_u32 s34, s4, 16
; VI-NEXT:    s_addc_u32 s35, s5, 0
; VI-NEXT:    v_mov_b32_e32 v0, s34
; VI-NEXT:    v_mov_b32_e32 v1, s35
; VI-NEXT:    v_mov_b32_e32 v2, s6
; VI-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_wbinvl1_vol
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_xchg_i32_ret_offset_scalar:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, s6
; GFX9-NEXT:    global_atomic_swap v0, v0, v1, s[4:5] offset:16 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1_vol
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
  %result = atomicrmw xchg ptr addrspace(1) %gep, i32 %in seq_cst
  ret i32 %result
}

; i32 xchg tagged !amdgpu.no.remote.memory, result unused, 16-byte offset.
define void @global_atomic_xchg_i32_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: global_atomic_xchg_i32_noret_offset__amdgpu_no_remote_memory:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s6
; SI-NEXT:    buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64 offset:16
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_wbinvl1
; SI-NEXT:    s_waitcnt expcnt(0)
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_xchg_i32_noret_offset__amdgpu_no_remote_memory:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_atomic_swap v[0:1], v2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_wbinvl1_vol
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_xchg_i32_noret_offset__amdgpu_no_remote_memory:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_swap v[0:1], v2, off offset:16
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1_vol
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
  %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
  ret void
}

; i32 xchg tagged !amdgpu.no.remote.memory, old value returned, 16-byte offset.
define i32 @global_atomic_xchg_i32_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: global_atomic_xchg_i32_ret_offset__amdgpu_no_remote_memory:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s6
; SI-NEXT:    buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_wbinvl1
; SI-NEXT:    v_mov_b32_e32 v0, v2
; SI-NEXT:    s_waitcnt expcnt(0)
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_xchg_i32_ret_offset__amdgpu_no_remote_memory:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_wbinvl1_vol
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_xchg_i32_ret_offset__amdgpu_no_remote_memory:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_swap v0, v[0:1], v2, off offset:16 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1_vol
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
  %result = atomicrmw xchg ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
  ret i32 %result
}

; ---------------------------------------------------------------------
; atomicrmw xchg f32
; ---------------------------------------------------------------------

; float xchg, result unused, no address offset.
define void @global_atomic_xchg_f32_noret(ptr addrspace(1) %ptr, float %in) {
; SI-LABEL: global_atomic_xchg_f32_noret:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s6
; SI-NEXT:    buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_wbinvl1
; SI-NEXT:    s_waitcnt expcnt(0)
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_xchg_f32_noret:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    flat_atomic_swap v[0:1], v2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_wbinvl1_vol
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_xchg_f32_noret:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_swap v[0:1], v2, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1_vol
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %tmp0 = atomicrmw xchg ptr addrspace(1) %ptr, float %in seq_cst
  ret void
}

; float xchg, result unused, address offset by 16 bytes.
define void @global_atomic_xchg_f32_noret_offset(ptr addrspace(1) %out, float %in) {
; SI-LABEL: global_atomic_xchg_f32_noret_offset:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s6
; SI-NEXT:    buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64 offset:16
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_wbinvl1
; SI-NEXT:    s_waitcnt expcnt(0)
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_xchg_f32_noret_offset:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_atomic_swap v[0:1], v2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_wbinvl1_vol
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_xchg_f32_noret_offset:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_swap v[0:1], v2, off offset:16
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1_vol
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr float, ptr addrspace(1) %out, i32 4
  %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, float %in seq_cst
  ret void
}

; float xchg, old value returned, no address offset.
define float @global_atomic_xchg_f32_ret(ptr addrspace(1) %ptr, float %in) {
; SI-LABEL: global_atomic_xchg_f32_ret:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s6
; SI-NEXT:    buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_wbinvl1
; SI-NEXT:    v_mov_b32_e32 v0, v2
; SI-NEXT:    s_waitcnt expcnt(0)
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_xchg_f32_ret:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_wbinvl1_vol
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_xchg_f32_ret:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_swap v0, v[0:1], v2, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1_vol
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %result = atomicrmw xchg ptr addrspace(1) %ptr, float %in seq_cst
  ret float %result
}

; float xchg, old value returned, address offset by 16 bytes.
define float @global_atomic_xchg_f32_ret_offset(ptr addrspace(1) %out, float %in) {
; SI-LABEL: global_atomic_xchg_f32_ret_offset:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s6
; SI-NEXT:    buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_wbinvl1
; SI-NEXT:    v_mov_b32_e32 v0, v2
; SI-NEXT:    s_waitcnt expcnt(0)
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_xchg_f32_ret_offset:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_wbinvl1_vol
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_xchg_f32_ret_offset:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_swap v0, v[0:1], v2, off offset:16 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1_vol
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr float, ptr addrspace(1) %out, i32 4
  %result = atomicrmw xchg ptr addrspace(1) %gep, float %in seq_cst
  ret float %result
}

; float xchg with inreg arguments, result unused, no address offset.
define amdgpu_gfx void @global_atomic_xchg_f32_noret_scalar(ptr addrspace(1) inreg %ptr, float inreg %in) {
; SI-LABEL: global_atomic_xchg_f32_noret_scalar:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT:    s_mov_b64 exec, s[34:35]
; SI-NEXT:    s_waitcnt expcnt(0)
; SI-NEXT:    v_writelane_b32 v1, s6, 0
; SI-NEXT:    v_writelane_b32 v1, s7, 1
; SI-NEXT:    s_mov_b32 s34, s6
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    v_mov_b32_e32 v0, s34
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_wbinvl1
; SI-NEXT:    v_readlane_b32 s7, v1, 1
; SI-NEXT:    v_readlane_b32 s6, v1, 0
; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT:    s_mov_b64 exec, s[34:35]
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_xchg_f32_noret_scalar:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_mov_b32_e32 v2, s6
; VI-NEXT:    flat_atomic_swap v[0:1], v2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_wbinvl1_vol
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_xchg_f32_noret_scalar:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, s6
; GFX9-NEXT:    global_atomic_swap v0, v1, s[4:5]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1_vol
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %tmp0 = atomicrmw xchg ptr addrspace(1) %ptr, float %in seq_cst
  ret void
}

; float xchg with inreg arguments, result unused, address offset by 16 bytes.
define amdgpu_gfx void @global_atomic_xchg_f32_noret_offset_scalar(ptr addrspace(1) inreg %out, float inreg %in) {
; SI-LABEL: global_atomic_xchg_f32_noret_offset_scalar:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT:    s_mov_b64 exec, s[34:35]
; SI-NEXT:    s_waitcnt expcnt(0)
; SI-NEXT:    v_writelane_b32 v1, s6, 0
; SI-NEXT:    v_writelane_b32 v1, s7, 1
; SI-NEXT:    s_mov_b32 s34, s6
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    v_mov_b32_e32 v0, s34
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 offset:16
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_wbinvl1
; SI-NEXT:    v_readlane_b32 s7, v1, 1
; SI-NEXT:    v_readlane_b32 s6, v1, 0
; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT:    s_mov_b64 exec, s[34:35]
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_xchg_f32_noret_offset_scalar:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    s_add_u32 s34, s4, 16
; VI-NEXT:    s_addc_u32 s35, s5, 0
; VI-NEXT:    v_mov_b32_e32 v0, s34
; VI-NEXT:    v_mov_b32_e32 v1, s35
; VI-NEXT:    v_mov_b32_e32 v2, s6
; VI-NEXT:    flat_atomic_swap v[0:1], v2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_wbinvl1_vol
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_xchg_f32_noret_offset_scalar:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, s6
; GFX9-NEXT:    global_atomic_swap v0, v1, s[4:5] offset:16
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1_vol
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr float, ptr addrspace(1) %out, i32 4
  %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, float %in seq_cst
  ret void
}

; float xchg with inreg arguments, old value returned, no address offset.
define amdgpu_gfx float @global_atomic_xchg_f32_ret_scalar(ptr addrspace(1) inreg %ptr, float inreg %in) {
; SI-LABEL: global_atomic_xchg_f32_ret_scalar:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT:    s_mov_b64 exec, s[34:35]
; SI-NEXT:    s_waitcnt expcnt(0)
; SI-NEXT:    v_writelane_b32 v1, s6, 0
; SI-NEXT:    v_writelane_b32 v1, s7, 1
; SI-NEXT:    s_mov_b32 s34, s6
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    v_mov_b32_e32 v0, s34
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_wbinvl1
; SI-NEXT:    v_readlane_b32 s7, v1, 1
; SI-NEXT:    v_readlane_b32 s6, v1, 0
; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT:    s_mov_b64 exec, s[34:35]
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_xchg_f32_ret_scalar:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_mov_b32_e32 v2, s6
; VI-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_wbinvl1_vol
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_xchg_f32_ret_scalar:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, s6
; GFX9-NEXT:    global_atomic_swap v0, v0, v1, s[4:5] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1_vol
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %result = atomicrmw xchg ptr addrspace(1) %ptr, float %in seq_cst
  ret float %result
}

; float xchg with inreg arguments, old value returned, address offset by 16 bytes.
define amdgpu_gfx float @global_atomic_xchg_f32_ret_offset_scalar(ptr addrspace(1) inreg %out, float inreg %in) {
; SI-LABEL: global_atomic_xchg_f32_ret_offset_scalar:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT:    s_mov_b64 exec, s[34:35]
; SI-NEXT:    s_waitcnt expcnt(0)
; SI-NEXT:    v_writelane_b32 v1, s6, 0
; SI-NEXT:    v_writelane_b32 v1, s7, 1
; SI-NEXT:    s_mov_b32 s34, s6
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    v_mov_b32_e32 v0, s34
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 offset:16 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_wbinvl1
; SI-NEXT:    v_readlane_b32 s7, v1, 1
; SI-NEXT:    v_readlane_b32 s6, v1, 0
; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT:    s_mov_b64 exec, s[34:35]
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_xchg_f32_ret_offset_scalar:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    s_add_u32 s34, s4, 16
; VI-NEXT:    s_addc_u32 s35, s5, 0
; VI-NEXT:    v_mov_b32_e32 v0, s34
; VI-NEXT:    v_mov_b32_e32 v1, s35
; VI-NEXT:    v_mov_b32_e32 v2, s6
; VI-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_wbinvl1_vol
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_xchg_f32_ret_offset_scalar:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, s6
; GFX9-NEXT:    global_atomic_swap v0, v0, v1, s[4:5] offset:16 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1_vol
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr float, ptr addrspace(1) %out, i32 4
  %result = atomicrmw xchg ptr addrspace(1) %gep, float %in seq_cst
  ret float %result
}

; float xchg tagged !amdgpu.no.remote.memory, result unused, 16-byte offset.
; The GEP indexes float elements, like the other f32 tests in this section.
define void @global_atomic_xchg_f32_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, float %in) {
; SI-LABEL: global_atomic_xchg_f32_noret_offset__amdgpu_no_remote_memory:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s6
; SI-NEXT:    buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64 offset:16
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_wbinvl1
; SI-NEXT:    s_waitcnt expcnt(0)
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_xchg_f32_noret_offset__amdgpu_no_remote_memory:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_atomic_swap v[0:1], v2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_wbinvl1_vol
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_xchg_f32_noret_offset__amdgpu_no_remote_memory:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_swap v[0:1], v2, off offset:16
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1_vol
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr float, ptr addrspace(1) %out, i64 4
  %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, float %in seq_cst, !amdgpu.no.remote.memory !0
  ret void
}

; float xchg tagged !amdgpu.no.remote.memory, old value returned, 16-byte offset.
; The GEP indexes float elements, like the other f32 tests in this section.
define float @global_atomic_xchg_f32_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, float %in) {
; SI-LABEL: global_atomic_xchg_f32_ret_offset__amdgpu_no_remote_memory:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s6
; SI-NEXT:    buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_wbinvl1
; SI-NEXT:    v_mov_b32_e32 v0, v2
; SI-NEXT:    s_waitcnt expcnt(0)
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_xchg_f32_ret_offset__amdgpu_no_remote_memory:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_wbinvl1_vol
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_xchg_f32_ret_offset__amdgpu_no_remote_memory:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_swap v0, v[0:1], v2, off offset:16 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1_vol
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr float, ptr addrspace(1) %out, i64 4
  %result = atomicrmw xchg ptr addrspace(1) %gep, float %in seq_cst, !amdgpu.no.remote.memory !0
  ret float %result
}

; ---------------------------------------------------------------------
; atomicrmw add
; ---------------------------------------------------------------------

; i32 add, result unused, no address offset.
define void @global_atomic_add_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
; SI-LABEL: global_atomic_add_i32_noret:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s6
; SI-NEXT:    buffer_atomic_add v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_wbinvl1
; SI-NEXT:    s_waitcnt expcnt(0)
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_add_i32_noret:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    flat_atomic_add v[0:1], v2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_wbinvl1_vol
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_add_i32_noret:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_add v[0:1], v2, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1_vol
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %tmp0 = atomicrmw add ptr addrspace(1) %ptr, i32 %in seq_cst
  ret void
}

; i32 add, result unused, address offset by 16 bytes.
define void @global_atomic_add_i32_noret_offset(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: global_atomic_add_i32_noret_offset:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s6
; SI-NEXT:    buffer_atomic_add v2, v[0:1], s[4:7], 0 addr64 offset:16
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_wbinvl1
; SI-NEXT:    s_waitcnt expcnt(0)
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_add_i32_noret_offset:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_atomic_add v[0:1], v2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_wbinvl1_vol
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_add_i32_noret_offset:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_add v[0:1], v2, off offset:16
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1_vol
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
  %tmp0 = atomicrmw add ptr addrspace(1) %gep, i32 %in seq_cst
  ret void
}

; i32 add, old value returned, no address offset.
define i32 @global_atomic_add_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
; SI-LABEL: global_atomic_add_i32_ret:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s6
; SI-NEXT:    buffer_atomic_add v2, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_wbinvl1
; SI-NEXT:    v_mov_b32_e32 v0, v2
; SI-NEXT:    s_waitcnt expcnt(0)
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_add_i32_ret:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    flat_atomic_add v0, v[0:1], v2 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_wbinvl1_vol
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_add_i32_ret:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_add v0, v[0:1], v2, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1_vol
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %result = atomicrmw add ptr addrspace(1) %ptr, i32 %in seq_cst
  ret i32 %result
}

define i32 @global_atomic_add_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: global_atomic_add_i32_ret_offset:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s6
; SI-NEXT:    buffer_atomic_add v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_wbinvl1
; SI-NEXT:    v_mov_b32_e32 v0, v2
; SI-NEXT:    s_waitcnt
expcnt(0) 972; SI-NEXT: s_setpc_b64 s[30:31] 973; 974; VI-LABEL: global_atomic_add_i32_ret_offset: 975; VI: ; %bb.0: 976; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 977; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 978; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 979; VI-NEXT: flat_atomic_add v0, v[0:1], v2 glc 980; VI-NEXT: s_waitcnt vmcnt(0) 981; VI-NEXT: buffer_wbinvl1_vol 982; VI-NEXT: s_setpc_b64 s[30:31] 983; 984; GFX9-LABEL: global_atomic_add_i32_ret_offset: 985; GFX9: ; %bb.0: 986; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 987; GFX9-NEXT: global_atomic_add v0, v[0:1], v2, off offset:16 glc 988; GFX9-NEXT: s_waitcnt vmcnt(0) 989; GFX9-NEXT: buffer_wbinvl1_vol 990; GFX9-NEXT: s_setpc_b64 s[30:31] 991 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 992 %result = atomicrmw add ptr addrspace(1) %gep, i32 %in seq_cst 993 ret i32 %result 994} 995 996define amdgpu_gfx void @global_atomic_add_i32_noret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) { 997; SI-LABEL: global_atomic_add_i32_noret_scalar: 998; SI: ; %bb.0: 999; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1000; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 1001; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill 1002; SI-NEXT: s_mov_b64 exec, s[34:35] 1003; SI-NEXT: s_waitcnt expcnt(0) 1004; SI-NEXT: v_writelane_b32 v1, s6, 0 1005; SI-NEXT: v_writelane_b32 v1, s7, 1 1006; SI-NEXT: s_mov_b32 s34, s6 1007; SI-NEXT: s_mov_b32 s7, 0xf000 1008; SI-NEXT: s_mov_b32 s6, -1 1009; SI-NEXT: v_mov_b32_e32 v0, s34 1010; SI-NEXT: s_waitcnt vmcnt(0) 1011; SI-NEXT: buffer_atomic_add v0, off, s[4:7], 0 1012; SI-NEXT: s_waitcnt vmcnt(0) 1013; SI-NEXT: buffer_wbinvl1 1014; SI-NEXT: v_readlane_b32 s7, v1, 1 1015; SI-NEXT: v_readlane_b32 s6, v1, 0 1016; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 1017; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload 1018; SI-NEXT: s_mov_b64 exec, s[34:35] 1019; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 1020; SI-NEXT: s_setpc_b64 s[30:31] 
1021; 1022; VI-LABEL: global_atomic_add_i32_noret_scalar: 1023; VI: ; %bb.0: 1024; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1025; VI-NEXT: v_mov_b32_e32 v0, s4 1026; VI-NEXT: v_mov_b32_e32 v1, s5 1027; VI-NEXT: v_mov_b32_e32 v2, s6 1028; VI-NEXT: flat_atomic_add v[0:1], v2 1029; VI-NEXT: s_waitcnt vmcnt(0) 1030; VI-NEXT: buffer_wbinvl1_vol 1031; VI-NEXT: s_setpc_b64 s[30:31] 1032; 1033; GFX9-LABEL: global_atomic_add_i32_noret_scalar: 1034; GFX9: ; %bb.0: 1035; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1036; GFX9-NEXT: v_mov_b32_e32 v0, 0 1037; GFX9-NEXT: v_mov_b32_e32 v1, s6 1038; GFX9-NEXT: global_atomic_add v0, v1, s[4:5] 1039; GFX9-NEXT: s_waitcnt vmcnt(0) 1040; GFX9-NEXT: buffer_wbinvl1_vol 1041; GFX9-NEXT: s_setpc_b64 s[30:31] 1042 %tmp0 = atomicrmw add ptr addrspace(1) %ptr, i32 %in seq_cst 1043 ret void 1044} 1045 1046define amdgpu_gfx void @global_atomic_add_i32_noret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) { 1047; SI-LABEL: global_atomic_add_i32_noret_offset_scalar: 1048; SI: ; %bb.0: 1049; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1050; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 1051; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill 1052; SI-NEXT: s_mov_b64 exec, s[34:35] 1053; SI-NEXT: s_waitcnt expcnt(0) 1054; SI-NEXT: v_writelane_b32 v1, s6, 0 1055; SI-NEXT: v_writelane_b32 v1, s7, 1 1056; SI-NEXT: s_mov_b32 s34, s6 1057; SI-NEXT: s_mov_b32 s7, 0xf000 1058; SI-NEXT: s_mov_b32 s6, -1 1059; SI-NEXT: v_mov_b32_e32 v0, s34 1060; SI-NEXT: s_waitcnt vmcnt(0) 1061; SI-NEXT: buffer_atomic_add v0, off, s[4:7], 0 offset:16 1062; SI-NEXT: s_waitcnt vmcnt(0) 1063; SI-NEXT: buffer_wbinvl1 1064; SI-NEXT: v_readlane_b32 s7, v1, 1 1065; SI-NEXT: v_readlane_b32 s6, v1, 0 1066; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 1067; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload 1068; SI-NEXT: s_mov_b64 exec, s[34:35] 1069; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 1070; SI-NEXT: s_setpc_b64 
s[30:31] 1071; 1072; VI-LABEL: global_atomic_add_i32_noret_offset_scalar: 1073; VI: ; %bb.0: 1074; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1075; VI-NEXT: s_add_u32 s34, s4, 16 1076; VI-NEXT: s_addc_u32 s35, s5, 0 1077; VI-NEXT: v_mov_b32_e32 v0, s34 1078; VI-NEXT: v_mov_b32_e32 v1, s35 1079; VI-NEXT: v_mov_b32_e32 v2, s6 1080; VI-NEXT: flat_atomic_add v[0:1], v2 1081; VI-NEXT: s_waitcnt vmcnt(0) 1082; VI-NEXT: buffer_wbinvl1_vol 1083; VI-NEXT: s_setpc_b64 s[30:31] 1084; 1085; GFX9-LABEL: global_atomic_add_i32_noret_offset_scalar: 1086; GFX9: ; %bb.0: 1087; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1088; GFX9-NEXT: v_mov_b32_e32 v0, 0 1089; GFX9-NEXT: v_mov_b32_e32 v1, s6 1090; GFX9-NEXT: global_atomic_add v0, v1, s[4:5] offset:16 1091; GFX9-NEXT: s_waitcnt vmcnt(0) 1092; GFX9-NEXT: buffer_wbinvl1_vol 1093; GFX9-NEXT: s_setpc_b64 s[30:31] 1094 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 1095 %tmp0 = atomicrmw add ptr addrspace(1) %gep, i32 %in seq_cst 1096 ret void 1097} 1098 1099define amdgpu_gfx i32 @global_atomic_add_i32_ret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) { 1100; SI-LABEL: global_atomic_add_i32_ret_scalar: 1101; SI: ; %bb.0: 1102; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1103; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 1104; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill 1105; SI-NEXT: s_mov_b64 exec, s[34:35] 1106; SI-NEXT: s_waitcnt expcnt(0) 1107; SI-NEXT: v_writelane_b32 v1, s6, 0 1108; SI-NEXT: v_writelane_b32 v1, s7, 1 1109; SI-NEXT: s_mov_b32 s34, s6 1110; SI-NEXT: s_mov_b32 s7, 0xf000 1111; SI-NEXT: s_mov_b32 s6, -1 1112; SI-NEXT: v_mov_b32_e32 v0, s34 1113; SI-NEXT: s_waitcnt vmcnt(0) 1114; SI-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc 1115; SI-NEXT: s_waitcnt vmcnt(0) 1116; SI-NEXT: buffer_wbinvl1 1117; SI-NEXT: v_readlane_b32 s7, v1, 1 1118; SI-NEXT: v_readlane_b32 s6, v1, 0 1119; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 1120; SI-NEXT: buffer_load_dword v1, off, s[0:3], 
s32 ; 4-byte Folded Reload 1121; SI-NEXT: s_mov_b64 exec, s[34:35] 1122; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 1123; SI-NEXT: s_setpc_b64 s[30:31] 1124; 1125; VI-LABEL: global_atomic_add_i32_ret_scalar: 1126; VI: ; %bb.0: 1127; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1128; VI-NEXT: v_mov_b32_e32 v0, s4 1129; VI-NEXT: v_mov_b32_e32 v1, s5 1130; VI-NEXT: v_mov_b32_e32 v2, s6 1131; VI-NEXT: flat_atomic_add v0, v[0:1], v2 glc 1132; VI-NEXT: s_waitcnt vmcnt(0) 1133; VI-NEXT: buffer_wbinvl1_vol 1134; VI-NEXT: s_setpc_b64 s[30:31] 1135; 1136; GFX9-LABEL: global_atomic_add_i32_ret_scalar: 1137; GFX9: ; %bb.0: 1138; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1139; GFX9-NEXT: v_mov_b32_e32 v0, 0 1140; GFX9-NEXT: v_mov_b32_e32 v1, s6 1141; GFX9-NEXT: global_atomic_add v0, v0, v1, s[4:5] glc 1142; GFX9-NEXT: s_waitcnt vmcnt(0) 1143; GFX9-NEXT: buffer_wbinvl1_vol 1144; GFX9-NEXT: s_setpc_b64 s[30:31] 1145 %result = atomicrmw add ptr addrspace(1) %ptr, i32 %in seq_cst 1146 ret i32 %result 1147} 1148 1149define amdgpu_gfx i32 @global_atomic_add_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) { 1150; SI-LABEL: global_atomic_add_i32_ret_offset_scalar: 1151; SI: ; %bb.0: 1152; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1153; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 1154; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill 1155; SI-NEXT: s_mov_b64 exec, s[34:35] 1156; SI-NEXT: s_waitcnt expcnt(0) 1157; SI-NEXT: v_writelane_b32 v1, s6, 0 1158; SI-NEXT: v_writelane_b32 v1, s7, 1 1159; SI-NEXT: s_mov_b32 s34, s6 1160; SI-NEXT: s_mov_b32 s7, 0xf000 1161; SI-NEXT: s_mov_b32 s6, -1 1162; SI-NEXT: v_mov_b32_e32 v0, s34 1163; SI-NEXT: s_waitcnt vmcnt(0) 1164; SI-NEXT: buffer_atomic_add v0, off, s[4:7], 0 offset:16 glc 1165; SI-NEXT: s_waitcnt vmcnt(0) 1166; SI-NEXT: buffer_wbinvl1 1167; SI-NEXT: v_readlane_b32 s7, v1, 1 1168; SI-NEXT: v_readlane_b32 s6, v1, 0 1169; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 1170; SI-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload 1171; SI-NEXT: s_mov_b64 exec, s[34:35] 1172; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 1173; SI-NEXT: s_setpc_b64 s[30:31] 1174; 1175; VI-LABEL: global_atomic_add_i32_ret_offset_scalar: 1176; VI: ; %bb.0: 1177; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1178; VI-NEXT: s_add_u32 s34, s4, 16 1179; VI-NEXT: s_addc_u32 s35, s5, 0 1180; VI-NEXT: v_mov_b32_e32 v0, s34 1181; VI-NEXT: v_mov_b32_e32 v1, s35 1182; VI-NEXT: v_mov_b32_e32 v2, s6 1183; VI-NEXT: flat_atomic_add v0, v[0:1], v2 glc 1184; VI-NEXT: s_waitcnt vmcnt(0) 1185; VI-NEXT: buffer_wbinvl1_vol 1186; VI-NEXT: s_setpc_b64 s[30:31] 1187; 1188; GFX9-LABEL: global_atomic_add_i32_ret_offset_scalar: 1189; GFX9: ; %bb.0: 1190; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1191; GFX9-NEXT: v_mov_b32_e32 v0, 0 1192; GFX9-NEXT: v_mov_b32_e32 v1, s6 1193; GFX9-NEXT: global_atomic_add v0, v0, v1, s[4:5] offset:16 glc 1194; GFX9-NEXT: s_waitcnt vmcnt(0) 1195; GFX9-NEXT: buffer_wbinvl1_vol 1196; GFX9-NEXT: s_setpc_b64 s[30:31] 1197 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 1198 %result = atomicrmw add ptr addrspace(1) %gep, i32 %in seq_cst 1199 ret i32 %result 1200} 1201 1202define void @global_atomic_add_i32_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) { 1203; SI-LABEL: global_atomic_add_i32_noret_offset__amdgpu_no_remote_memory: 1204; SI: ; %bb.0: 1205; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1206; SI-NEXT: s_mov_b32 s6, 0 1207; SI-NEXT: s_mov_b32 s7, 0xf000 1208; SI-NEXT: s_mov_b32 s4, s6 1209; SI-NEXT: s_mov_b32 s5, s6 1210; SI-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 addr64 offset:16 1211; SI-NEXT: s_waitcnt vmcnt(0) 1212; SI-NEXT: buffer_wbinvl1 1213; SI-NEXT: s_waitcnt expcnt(0) 1214; SI-NEXT: s_setpc_b64 s[30:31] 1215; 1216; VI-LABEL: global_atomic_add_i32_noret_offset__amdgpu_no_remote_memory: 1217; VI: ; %bb.0: 1218; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1219; VI-NEXT: v_add_u32_e32 
v0, vcc, 16, v0 1220; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1221; VI-NEXT: flat_atomic_add v[0:1], v2 1222; VI-NEXT: s_waitcnt vmcnt(0) 1223; VI-NEXT: buffer_wbinvl1_vol 1224; VI-NEXT: s_setpc_b64 s[30:31] 1225; 1226; GFX9-LABEL: global_atomic_add_i32_noret_offset__amdgpu_no_remote_memory: 1227; GFX9: ; %bb.0: 1228; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1229; GFX9-NEXT: global_atomic_add v[0:1], v2, off offset:16 1230; GFX9-NEXT: s_waitcnt vmcnt(0) 1231; GFX9-NEXT: buffer_wbinvl1_vol 1232; GFX9-NEXT: s_setpc_b64 s[30:31] 1233 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 1234 %tmp0 = atomicrmw add ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 1235 ret void 1236} 1237 1238define i32 @global_atomic_add_i32_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) { 1239; SI-LABEL: global_atomic_add_i32_ret_offset__amdgpu_no_remote_memory: 1240; SI: ; %bb.0: 1241; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1242; SI-NEXT: s_mov_b32 s6, 0 1243; SI-NEXT: s_mov_b32 s7, 0xf000 1244; SI-NEXT: s_mov_b32 s4, s6 1245; SI-NEXT: s_mov_b32 s5, s6 1246; SI-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 addr64 offset:16 glc 1247; SI-NEXT: s_waitcnt vmcnt(0) 1248; SI-NEXT: buffer_wbinvl1 1249; SI-NEXT: v_mov_b32_e32 v0, v2 1250; SI-NEXT: s_waitcnt expcnt(0) 1251; SI-NEXT: s_setpc_b64 s[30:31] 1252; 1253; VI-LABEL: global_atomic_add_i32_ret_offset__amdgpu_no_remote_memory: 1254; VI: ; %bb.0: 1255; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1256; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 1257; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1258; VI-NEXT: flat_atomic_add v0, v[0:1], v2 glc 1259; VI-NEXT: s_waitcnt vmcnt(0) 1260; VI-NEXT: buffer_wbinvl1_vol 1261; VI-NEXT: s_setpc_b64 s[30:31] 1262; 1263; GFX9-LABEL: global_atomic_add_i32_ret_offset__amdgpu_no_remote_memory: 1264; GFX9: ; %bb.0: 1265; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1266; GFX9-NEXT: global_atomic_add v0, v[0:1], v2, off offset:16 glc 
1267; GFX9-NEXT: s_waitcnt vmcnt(0) 1268; GFX9-NEXT: buffer_wbinvl1_vol 1269; GFX9-NEXT: s_setpc_b64 s[30:31] 1270 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 1271 %result = atomicrmw add ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 1272 ret i32 %result 1273} 1274 1275; --------------------------------------------------------------------- 1276; atomicrmw sub 1277; --------------------------------------------------------------------- 1278 1279define void @global_atomic_sub_i32_noret(ptr addrspace(1) %ptr, i32 %in) { 1280; SI-LABEL: global_atomic_sub_i32_noret: 1281; SI: ; %bb.0: 1282; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1283; SI-NEXT: s_mov_b32 s6, 0 1284; SI-NEXT: s_mov_b32 s7, 0xf000 1285; SI-NEXT: s_mov_b32 s4, s6 1286; SI-NEXT: s_mov_b32 s5, s6 1287; SI-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 addr64 1288; SI-NEXT: s_waitcnt vmcnt(0) 1289; SI-NEXT: buffer_wbinvl1 1290; SI-NEXT: s_waitcnt expcnt(0) 1291; SI-NEXT: s_setpc_b64 s[30:31] 1292; 1293; VI-LABEL: global_atomic_sub_i32_noret: 1294; VI: ; %bb.0: 1295; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1296; VI-NEXT: flat_atomic_sub v[0:1], v2 1297; VI-NEXT: s_waitcnt vmcnt(0) 1298; VI-NEXT: buffer_wbinvl1_vol 1299; VI-NEXT: s_setpc_b64 s[30:31] 1300; 1301; GFX9-LABEL: global_atomic_sub_i32_noret: 1302; GFX9: ; %bb.0: 1303; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1304; GFX9-NEXT: global_atomic_sub v[0:1], v2, off 1305; GFX9-NEXT: s_waitcnt vmcnt(0) 1306; GFX9-NEXT: buffer_wbinvl1_vol 1307; GFX9-NEXT: s_setpc_b64 s[30:31] 1308 %tmp0 = atomicrmw sub ptr addrspace(1) %ptr, i32 %in seq_cst 1309 ret void 1310} 1311 1312define void @global_atomic_sub_i32_noret_offset(ptr addrspace(1) %out, i32 %in) { 1313; SI-LABEL: global_atomic_sub_i32_noret_offset: 1314; SI: ; %bb.0: 1315; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1316; SI-NEXT: s_mov_b32 s6, 0 1317; SI-NEXT: s_mov_b32 s7, 0xf000 1318; SI-NEXT: s_mov_b32 s4, s6 1319; SI-NEXT: s_mov_b32 s5, s6 
1320; SI-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 addr64 offset:16 1321; SI-NEXT: s_waitcnt vmcnt(0) 1322; SI-NEXT: buffer_wbinvl1 1323; SI-NEXT: s_waitcnt expcnt(0) 1324; SI-NEXT: s_setpc_b64 s[30:31] 1325; 1326; VI-LABEL: global_atomic_sub_i32_noret_offset: 1327; VI: ; %bb.0: 1328; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1329; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 1330; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1331; VI-NEXT: flat_atomic_sub v[0:1], v2 1332; VI-NEXT: s_waitcnt vmcnt(0) 1333; VI-NEXT: buffer_wbinvl1_vol 1334; VI-NEXT: s_setpc_b64 s[30:31] 1335; 1336; GFX9-LABEL: global_atomic_sub_i32_noret_offset: 1337; GFX9: ; %bb.0: 1338; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1339; GFX9-NEXT: global_atomic_sub v[0:1], v2, off offset:16 1340; GFX9-NEXT: s_waitcnt vmcnt(0) 1341; GFX9-NEXT: buffer_wbinvl1_vol 1342; GFX9-NEXT: s_setpc_b64 s[30:31] 1343 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 1344 %tmp0 = atomicrmw sub ptr addrspace(1) %gep, i32 %in seq_cst 1345 ret void 1346} 1347 1348define i32 @global_atomic_sub_i32_ret(ptr addrspace(1) %ptr, i32 %in) { 1349; SI-LABEL: global_atomic_sub_i32_ret: 1350; SI: ; %bb.0: 1351; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1352; SI-NEXT: s_mov_b32 s6, 0 1353; SI-NEXT: s_mov_b32 s7, 0xf000 1354; SI-NEXT: s_mov_b32 s4, s6 1355; SI-NEXT: s_mov_b32 s5, s6 1356; SI-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 addr64 glc 1357; SI-NEXT: s_waitcnt vmcnt(0) 1358; SI-NEXT: buffer_wbinvl1 1359; SI-NEXT: v_mov_b32_e32 v0, v2 1360; SI-NEXT: s_waitcnt expcnt(0) 1361; SI-NEXT: s_setpc_b64 s[30:31] 1362; 1363; VI-LABEL: global_atomic_sub_i32_ret: 1364; VI: ; %bb.0: 1365; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1366; VI-NEXT: flat_atomic_sub v0, v[0:1], v2 glc 1367; VI-NEXT: s_waitcnt vmcnt(0) 1368; VI-NEXT: buffer_wbinvl1_vol 1369; VI-NEXT: s_setpc_b64 s[30:31] 1370; 1371; GFX9-LABEL: global_atomic_sub_i32_ret: 1372; GFX9: ; %bb.0: 1373; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) 1374; GFX9-NEXT: global_atomic_sub v0, v[0:1], v2, off glc 1375; GFX9-NEXT: s_waitcnt vmcnt(0) 1376; GFX9-NEXT: buffer_wbinvl1_vol 1377; GFX9-NEXT: s_setpc_b64 s[30:31] 1378 %result = atomicrmw sub ptr addrspace(1) %ptr, i32 %in seq_cst 1379 ret i32 %result 1380} 1381 1382define i32 @global_atomic_sub_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { 1383; SI-LABEL: global_atomic_sub_i32_ret_offset: 1384; SI: ; %bb.0: 1385; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1386; SI-NEXT: s_mov_b32 s6, 0 1387; SI-NEXT: s_mov_b32 s7, 0xf000 1388; SI-NEXT: s_mov_b32 s4, s6 1389; SI-NEXT: s_mov_b32 s5, s6 1390; SI-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 addr64 offset:16 glc 1391; SI-NEXT: s_waitcnt vmcnt(0) 1392; SI-NEXT: buffer_wbinvl1 1393; SI-NEXT: v_mov_b32_e32 v0, v2 1394; SI-NEXT: s_waitcnt expcnt(0) 1395; SI-NEXT: s_setpc_b64 s[30:31] 1396; 1397; VI-LABEL: global_atomic_sub_i32_ret_offset: 1398; VI: ; %bb.0: 1399; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1400; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 1401; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1402; VI-NEXT: flat_atomic_sub v0, v[0:1], v2 glc 1403; VI-NEXT: s_waitcnt vmcnt(0) 1404; VI-NEXT: buffer_wbinvl1_vol 1405; VI-NEXT: s_setpc_b64 s[30:31] 1406; 1407; GFX9-LABEL: global_atomic_sub_i32_ret_offset: 1408; GFX9: ; %bb.0: 1409; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1410; GFX9-NEXT: global_atomic_sub v0, v[0:1], v2, off offset:16 glc 1411; GFX9-NEXT: s_waitcnt vmcnt(0) 1412; GFX9-NEXT: buffer_wbinvl1_vol 1413; GFX9-NEXT: s_setpc_b64 s[30:31] 1414 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 1415 %result = atomicrmw sub ptr addrspace(1) %gep, i32 %in seq_cst 1416 ret i32 %result 1417} 1418 1419define amdgpu_gfx void @global_atomic_sub_i32_noret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) { 1420; SI-LABEL: global_atomic_sub_i32_noret_scalar: 1421; SI: ; %bb.0: 1422; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1423; SI-NEXT: s_xor_saveexec_b64 s[34:35], 
-1 1424; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill 1425; SI-NEXT: s_mov_b64 exec, s[34:35] 1426; SI-NEXT: s_waitcnt expcnt(0) 1427; SI-NEXT: v_writelane_b32 v1, s6, 0 1428; SI-NEXT: v_writelane_b32 v1, s7, 1 1429; SI-NEXT: s_mov_b32 s34, s6 1430; SI-NEXT: s_mov_b32 s7, 0xf000 1431; SI-NEXT: s_mov_b32 s6, -1 1432; SI-NEXT: v_mov_b32_e32 v0, s34 1433; SI-NEXT: s_waitcnt vmcnt(0) 1434; SI-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 1435; SI-NEXT: s_waitcnt vmcnt(0) 1436; SI-NEXT: buffer_wbinvl1 1437; SI-NEXT: v_readlane_b32 s7, v1, 1 1438; SI-NEXT: v_readlane_b32 s6, v1, 0 1439; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 1440; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload 1441; SI-NEXT: s_mov_b64 exec, s[34:35] 1442; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 1443; SI-NEXT: s_setpc_b64 s[30:31] 1444; 1445; VI-LABEL: global_atomic_sub_i32_noret_scalar: 1446; VI: ; %bb.0: 1447; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1448; VI-NEXT: v_mov_b32_e32 v0, s4 1449; VI-NEXT: v_mov_b32_e32 v1, s5 1450; VI-NEXT: v_mov_b32_e32 v2, s6 1451; VI-NEXT: flat_atomic_sub v[0:1], v2 1452; VI-NEXT: s_waitcnt vmcnt(0) 1453; VI-NEXT: buffer_wbinvl1_vol 1454; VI-NEXT: s_setpc_b64 s[30:31] 1455; 1456; GFX9-LABEL: global_atomic_sub_i32_noret_scalar: 1457; GFX9: ; %bb.0: 1458; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1459; GFX9-NEXT: v_mov_b32_e32 v0, 0 1460; GFX9-NEXT: v_mov_b32_e32 v1, s6 1461; GFX9-NEXT: global_atomic_sub v0, v1, s[4:5] 1462; GFX9-NEXT: s_waitcnt vmcnt(0) 1463; GFX9-NEXT: buffer_wbinvl1_vol 1464; GFX9-NEXT: s_setpc_b64 s[30:31] 1465 %tmp0 = atomicrmw sub ptr addrspace(1) %ptr, i32 %in seq_cst 1466 ret void 1467} 1468 1469define amdgpu_gfx void @global_atomic_sub_i32_noret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) { 1470; SI-LABEL: global_atomic_sub_i32_noret_offset_scalar: 1471; SI: ; %bb.0: 1472; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1473; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 
1474; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill 1475; SI-NEXT: s_mov_b64 exec, s[34:35] 1476; SI-NEXT: s_waitcnt expcnt(0) 1477; SI-NEXT: v_writelane_b32 v1, s6, 0 1478; SI-NEXT: v_writelane_b32 v1, s7, 1 1479; SI-NEXT: s_mov_b32 s34, s6 1480; SI-NEXT: s_mov_b32 s7, 0xf000 1481; SI-NEXT: s_mov_b32 s6, -1 1482; SI-NEXT: v_mov_b32_e32 v0, s34 1483; SI-NEXT: s_waitcnt vmcnt(0) 1484; SI-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 offset:16 1485; SI-NEXT: s_waitcnt vmcnt(0) 1486; SI-NEXT: buffer_wbinvl1 1487; SI-NEXT: v_readlane_b32 s7, v1, 1 1488; SI-NEXT: v_readlane_b32 s6, v1, 0 1489; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 1490; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload 1491; SI-NEXT: s_mov_b64 exec, s[34:35] 1492; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 1493; SI-NEXT: s_setpc_b64 s[30:31] 1494; 1495; VI-LABEL: global_atomic_sub_i32_noret_offset_scalar: 1496; VI: ; %bb.0: 1497; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1498; VI-NEXT: s_add_u32 s34, s4, 16 1499; VI-NEXT: s_addc_u32 s35, s5, 0 1500; VI-NEXT: v_mov_b32_e32 v0, s34 1501; VI-NEXT: v_mov_b32_e32 v1, s35 1502; VI-NEXT: v_mov_b32_e32 v2, s6 1503; VI-NEXT: flat_atomic_sub v[0:1], v2 1504; VI-NEXT: s_waitcnt vmcnt(0) 1505; VI-NEXT: buffer_wbinvl1_vol 1506; VI-NEXT: s_setpc_b64 s[30:31] 1507; 1508; GFX9-LABEL: global_atomic_sub_i32_noret_offset_scalar: 1509; GFX9: ; %bb.0: 1510; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1511; GFX9-NEXT: v_mov_b32_e32 v0, 0 1512; GFX9-NEXT: v_mov_b32_e32 v1, s6 1513; GFX9-NEXT: global_atomic_sub v0, v1, s[4:5] offset:16 1514; GFX9-NEXT: s_waitcnt vmcnt(0) 1515; GFX9-NEXT: buffer_wbinvl1_vol 1516; GFX9-NEXT: s_setpc_b64 s[30:31] 1517 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 1518 %tmp0 = atomicrmw sub ptr addrspace(1) %gep, i32 %in seq_cst 1519 ret void 1520} 1521 1522define amdgpu_gfx i32 @global_atomic_sub_i32_ret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) { 1523; SI-LABEL: 
global_atomic_sub_i32_ret_scalar: 1524; SI: ; %bb.0: 1525; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1526; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 1527; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill 1528; SI-NEXT: s_mov_b64 exec, s[34:35] 1529; SI-NEXT: s_waitcnt expcnt(0) 1530; SI-NEXT: v_writelane_b32 v1, s6, 0 1531; SI-NEXT: v_writelane_b32 v1, s7, 1 1532; SI-NEXT: s_mov_b32 s34, s6 1533; SI-NEXT: s_mov_b32 s7, 0xf000 1534; SI-NEXT: s_mov_b32 s6, -1 1535; SI-NEXT: v_mov_b32_e32 v0, s34 1536; SI-NEXT: s_waitcnt vmcnt(0) 1537; SI-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc 1538; SI-NEXT: s_waitcnt vmcnt(0) 1539; SI-NEXT: buffer_wbinvl1 1540; SI-NEXT: v_readlane_b32 s7, v1, 1 1541; SI-NEXT: v_readlane_b32 s6, v1, 0 1542; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 1543; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload 1544; SI-NEXT: s_mov_b64 exec, s[34:35] 1545; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 1546; SI-NEXT: s_setpc_b64 s[30:31] 1547; 1548; VI-LABEL: global_atomic_sub_i32_ret_scalar: 1549; VI: ; %bb.0: 1550; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1551; VI-NEXT: v_mov_b32_e32 v0, s4 1552; VI-NEXT: v_mov_b32_e32 v1, s5 1553; VI-NEXT: v_mov_b32_e32 v2, s6 1554; VI-NEXT: flat_atomic_sub v0, v[0:1], v2 glc 1555; VI-NEXT: s_waitcnt vmcnt(0) 1556; VI-NEXT: buffer_wbinvl1_vol 1557; VI-NEXT: s_setpc_b64 s[30:31] 1558; 1559; GFX9-LABEL: global_atomic_sub_i32_ret_scalar: 1560; GFX9: ; %bb.0: 1561; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1562; GFX9-NEXT: v_mov_b32_e32 v0, 0 1563; GFX9-NEXT: v_mov_b32_e32 v1, s6 1564; GFX9-NEXT: global_atomic_sub v0, v0, v1, s[4:5] glc 1565; GFX9-NEXT: s_waitcnt vmcnt(0) 1566; GFX9-NEXT: buffer_wbinvl1_vol 1567; GFX9-NEXT: s_setpc_b64 s[30:31] 1568 %result = atomicrmw sub ptr addrspace(1) %ptr, i32 %in seq_cst 1569 ret i32 %result 1570} 1571 1572define amdgpu_gfx i32 @global_atomic_sub_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) { 1573; 
SI-LABEL: global_atomic_sub_i32_ret_offset_scalar: 1574; SI: ; %bb.0: 1575; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1576; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 1577; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill 1578; SI-NEXT: s_mov_b64 exec, s[34:35] 1579; SI-NEXT: s_waitcnt expcnt(0) 1580; SI-NEXT: v_writelane_b32 v1, s6, 0 1581; SI-NEXT: v_writelane_b32 v1, s7, 1 1582; SI-NEXT: s_mov_b32 s34, s6 1583; SI-NEXT: s_mov_b32 s7, 0xf000 1584; SI-NEXT: s_mov_b32 s6, -1 1585; SI-NEXT: v_mov_b32_e32 v0, s34 1586; SI-NEXT: s_waitcnt vmcnt(0) 1587; SI-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 offset:16 glc 1588; SI-NEXT: s_waitcnt vmcnt(0) 1589; SI-NEXT: buffer_wbinvl1 1590; SI-NEXT: v_readlane_b32 s7, v1, 1 1591; SI-NEXT: v_readlane_b32 s6, v1, 0 1592; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 1593; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload 1594; SI-NEXT: s_mov_b64 exec, s[34:35] 1595; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 1596; SI-NEXT: s_setpc_b64 s[30:31] 1597; 1598; VI-LABEL: global_atomic_sub_i32_ret_offset_scalar: 1599; VI: ; %bb.0: 1600; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1601; VI-NEXT: s_add_u32 s34, s4, 16 1602; VI-NEXT: s_addc_u32 s35, s5, 0 1603; VI-NEXT: v_mov_b32_e32 v0, s34 1604; VI-NEXT: v_mov_b32_e32 v1, s35 1605; VI-NEXT: v_mov_b32_e32 v2, s6 1606; VI-NEXT: flat_atomic_sub v0, v[0:1], v2 glc 1607; VI-NEXT: s_waitcnt vmcnt(0) 1608; VI-NEXT: buffer_wbinvl1_vol 1609; VI-NEXT: s_setpc_b64 s[30:31] 1610; 1611; GFX9-LABEL: global_atomic_sub_i32_ret_offset_scalar: 1612; GFX9: ; %bb.0: 1613; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1614; GFX9-NEXT: v_mov_b32_e32 v0, 0 1615; GFX9-NEXT: v_mov_b32_e32 v1, s6 1616; GFX9-NEXT: global_atomic_sub v0, v0, v1, s[4:5] offset:16 glc 1617; GFX9-NEXT: s_waitcnt vmcnt(0) 1618; GFX9-NEXT: buffer_wbinvl1_vol 1619; GFX9-NEXT: s_setpc_b64 s[30:31] 1620 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 1621 %result = atomicrmw sub ptr 
addrspace(1) %gep, i32 %in seq_cst 1622 ret i32 %result 1623} 1624 1625define i32 @global_atomic_sub_0_i32_ret(ptr addrspace(1) %ptr) { 1626; SI-LABEL: global_atomic_sub_0_i32_ret: 1627; SI: ; %bb.0: 1628; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1629; SI-NEXT: s_mov_b32 s7, 0xf000 1630; SI-NEXT: s_mov_b32 s6, 0 1631; SI-NEXT: v_mov_b32_e32 v2, 0 1632; SI-NEXT: s_mov_b32 s4, s6 1633; SI-NEXT: s_mov_b32 s5, s6 1634; SI-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 addr64 glc 1635; SI-NEXT: s_waitcnt vmcnt(0) 1636; SI-NEXT: buffer_wbinvl1 1637; SI-NEXT: v_mov_b32_e32 v0, v2 1638; SI-NEXT: s_waitcnt expcnt(0) 1639; SI-NEXT: s_setpc_b64 s[30:31] 1640; 1641; VI-LABEL: global_atomic_sub_0_i32_ret: 1642; VI: ; %bb.0: 1643; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1644; VI-NEXT: v_mov_b32_e32 v2, 0 1645; VI-NEXT: flat_atomic_add v0, v[0:1], v2 glc 1646; VI-NEXT: s_waitcnt vmcnt(0) 1647; VI-NEXT: buffer_wbinvl1_vol 1648; VI-NEXT: s_setpc_b64 s[30:31] 1649; 1650; GFX9-LABEL: global_atomic_sub_0_i32_ret: 1651; GFX9: ; %bb.0: 1652; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1653; GFX9-NEXT: v_mov_b32_e32 v2, 0 1654; GFX9-NEXT: global_atomic_add v0, v[0:1], v2, off glc 1655; GFX9-NEXT: s_waitcnt vmcnt(0) 1656; GFX9-NEXT: buffer_wbinvl1_vol 1657; GFX9-NEXT: s_setpc_b64 s[30:31] 1658 %result = atomicrmw sub ptr addrspace(1) %ptr, i32 0 seq_cst 1659 ret i32 %result 1660} 1661 1662define void @global_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) { 1663; SI-LABEL: global_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory: 1664; SI: ; %bb.0: 1665; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1666; SI-NEXT: s_mov_b32 s6, 0 1667; SI-NEXT: s_mov_b32 s7, 0xf000 1668; SI-NEXT: s_mov_b32 s4, s6 1669; SI-NEXT: s_mov_b32 s5, s6 1670; SI-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 addr64 offset:16 1671; SI-NEXT: s_waitcnt vmcnt(0) 1672; SI-NEXT: buffer_wbinvl1 1673; SI-NEXT: s_waitcnt expcnt(0) 1674; SI-NEXT: s_setpc_b64 
s[30:31] 1675; 1676; VI-LABEL: global_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory: 1677; VI: ; %bb.0: 1678; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1679; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 1680; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1681; VI-NEXT: flat_atomic_sub v[0:1], v2 1682; VI-NEXT: s_waitcnt vmcnt(0) 1683; VI-NEXT: buffer_wbinvl1_vol 1684; VI-NEXT: s_setpc_b64 s[30:31] 1685; 1686; GFX9-LABEL: global_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory: 1687; GFX9: ; %bb.0: 1688; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1689; GFX9-NEXT: global_atomic_sub v[0:1], v2, off offset:16 1690; GFX9-NEXT: s_waitcnt vmcnt(0) 1691; GFX9-NEXT: buffer_wbinvl1_vol 1692; GFX9-NEXT: s_setpc_b64 s[30:31] 1693 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 1694 %tmp0 = atomicrmw sub ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 1695 ret void 1696} 1697 1698define i32 @global_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) { 1699; SI-LABEL: global_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory: 1700; SI: ; %bb.0: 1701; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1702; SI-NEXT: s_mov_b32 s6, 0 1703; SI-NEXT: s_mov_b32 s7, 0xf000 1704; SI-NEXT: s_mov_b32 s4, s6 1705; SI-NEXT: s_mov_b32 s5, s6 1706; SI-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 addr64 offset:16 glc 1707; SI-NEXT: s_waitcnt vmcnt(0) 1708; SI-NEXT: buffer_wbinvl1 1709; SI-NEXT: v_mov_b32_e32 v0, v2 1710; SI-NEXT: s_waitcnt expcnt(0) 1711; SI-NEXT: s_setpc_b64 s[30:31] 1712; 1713; VI-LABEL: global_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory: 1714; VI: ; %bb.0: 1715; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1716; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 1717; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1718; VI-NEXT: flat_atomic_sub v0, v[0:1], v2 glc 1719; VI-NEXT: s_waitcnt vmcnt(0) 1720; VI-NEXT: buffer_wbinvl1_vol 1721; VI-NEXT: s_setpc_b64 s[30:31] 1722; 1723; GFX9-LABEL: 
global_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory: 1724; GFX9: ; %bb.0: 1725; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1726; GFX9-NEXT: global_atomic_sub v0, v[0:1], v2, off offset:16 glc 1727; GFX9-NEXT: s_waitcnt vmcnt(0) 1728; GFX9-NEXT: buffer_wbinvl1_vol 1729; GFX9-NEXT: s_setpc_b64 s[30:31] 1730 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 1731 %result = atomicrmw sub ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 1732 ret i32 %result 1733} 1734 1735; --------------------------------------------------------------------- 1736; atomicrmw and 1737; --------------------------------------------------------------------- 1738 1739define void @global_atomic_and_i32_noret(ptr addrspace(1) %ptr, i32 %in) { 1740; SI-LABEL: global_atomic_and_i32_noret: 1741; SI: ; %bb.0: 1742; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1743; SI-NEXT: s_mov_b32 s6, 0 1744; SI-NEXT: s_mov_b32 s7, 0xf000 1745; SI-NEXT: s_mov_b32 s4, s6 1746; SI-NEXT: s_mov_b32 s5, s6 1747; SI-NEXT: buffer_atomic_and v2, v[0:1], s[4:7], 0 addr64 1748; SI-NEXT: s_waitcnt vmcnt(0) 1749; SI-NEXT: buffer_wbinvl1 1750; SI-NEXT: s_waitcnt expcnt(0) 1751; SI-NEXT: s_setpc_b64 s[30:31] 1752; 1753; VI-LABEL: global_atomic_and_i32_noret: 1754; VI: ; %bb.0: 1755; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1756; VI-NEXT: flat_atomic_and v[0:1], v2 1757; VI-NEXT: s_waitcnt vmcnt(0) 1758; VI-NEXT: buffer_wbinvl1_vol 1759; VI-NEXT: s_setpc_b64 s[30:31] 1760; 1761; GFX9-LABEL: global_atomic_and_i32_noret: 1762; GFX9: ; %bb.0: 1763; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1764; GFX9-NEXT: global_atomic_and v[0:1], v2, off 1765; GFX9-NEXT: s_waitcnt vmcnt(0) 1766; GFX9-NEXT: buffer_wbinvl1_vol 1767; GFX9-NEXT: s_setpc_b64 s[30:31] 1768 %tmp0 = atomicrmw and ptr addrspace(1) %ptr, i32 %in seq_cst 1769 ret void 1770} 1771 1772define void @global_atomic_and_i32_noret_offset(ptr addrspace(1) %out, i32 %in) { 1773; SI-LABEL: global_atomic_and_i32_noret_offset: 
1774; SI: ; %bb.0: 1775; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1776; SI-NEXT: s_mov_b32 s6, 0 1777; SI-NEXT: s_mov_b32 s7, 0xf000 1778; SI-NEXT: s_mov_b32 s4, s6 1779; SI-NEXT: s_mov_b32 s5, s6 1780; SI-NEXT: buffer_atomic_and v2, v[0:1], s[4:7], 0 addr64 offset:16 1781; SI-NEXT: s_waitcnt vmcnt(0) 1782; SI-NEXT: buffer_wbinvl1 1783; SI-NEXT: s_waitcnt expcnt(0) 1784; SI-NEXT: s_setpc_b64 s[30:31] 1785; 1786; VI-LABEL: global_atomic_and_i32_noret_offset: 1787; VI: ; %bb.0: 1788; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1789; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 1790; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1791; VI-NEXT: flat_atomic_and v[0:1], v2 1792; VI-NEXT: s_waitcnt vmcnt(0) 1793; VI-NEXT: buffer_wbinvl1_vol 1794; VI-NEXT: s_setpc_b64 s[30:31] 1795; 1796; GFX9-LABEL: global_atomic_and_i32_noret_offset: 1797; GFX9: ; %bb.0: 1798; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1799; GFX9-NEXT: global_atomic_and v[0:1], v2, off offset:16 1800; GFX9-NEXT: s_waitcnt vmcnt(0) 1801; GFX9-NEXT: buffer_wbinvl1_vol 1802; GFX9-NEXT: s_setpc_b64 s[30:31] 1803 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 1804 %tmp0 = atomicrmw and ptr addrspace(1) %gep, i32 %in seq_cst 1805 ret void 1806} 1807 1808define i32 @global_atomic_and_i32_ret(ptr addrspace(1) %ptr, i32 %in) { 1809; SI-LABEL: global_atomic_and_i32_ret: 1810; SI: ; %bb.0: 1811; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1812; SI-NEXT: s_mov_b32 s6, 0 1813; SI-NEXT: s_mov_b32 s7, 0xf000 1814; SI-NEXT: s_mov_b32 s4, s6 1815; SI-NEXT: s_mov_b32 s5, s6 1816; SI-NEXT: buffer_atomic_and v2, v[0:1], s[4:7], 0 addr64 glc 1817; SI-NEXT: s_waitcnt vmcnt(0) 1818; SI-NEXT: buffer_wbinvl1 1819; SI-NEXT: v_mov_b32_e32 v0, v2 1820; SI-NEXT: s_waitcnt expcnt(0) 1821; SI-NEXT: s_setpc_b64 s[30:31] 1822; 1823; VI-LABEL: global_atomic_and_i32_ret: 1824; VI: ; %bb.0: 1825; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1826; VI-NEXT: flat_atomic_and v0, v[0:1], v2 glc 1827; VI-NEXT: 
s_waitcnt vmcnt(0) 1828; VI-NEXT: buffer_wbinvl1_vol 1829; VI-NEXT: s_setpc_b64 s[30:31] 1830; 1831; GFX9-LABEL: global_atomic_and_i32_ret: 1832; GFX9: ; %bb.0: 1833; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1834; GFX9-NEXT: global_atomic_and v0, v[0:1], v2, off glc 1835; GFX9-NEXT: s_waitcnt vmcnt(0) 1836; GFX9-NEXT: buffer_wbinvl1_vol 1837; GFX9-NEXT: s_setpc_b64 s[30:31] 1838 %result = atomicrmw and ptr addrspace(1) %ptr, i32 %in seq_cst 1839 ret i32 %result 1840} 1841 1842define i32 @global_atomic_and_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { 1843; SI-LABEL: global_atomic_and_i32_ret_offset: 1844; SI: ; %bb.0: 1845; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1846; SI-NEXT: s_mov_b32 s6, 0 1847; SI-NEXT: s_mov_b32 s7, 0xf000 1848; SI-NEXT: s_mov_b32 s4, s6 1849; SI-NEXT: s_mov_b32 s5, s6 1850; SI-NEXT: buffer_atomic_and v2, v[0:1], s[4:7], 0 addr64 offset:16 glc 1851; SI-NEXT: s_waitcnt vmcnt(0) 1852; SI-NEXT: buffer_wbinvl1 1853; SI-NEXT: v_mov_b32_e32 v0, v2 1854; SI-NEXT: s_waitcnt expcnt(0) 1855; SI-NEXT: s_setpc_b64 s[30:31] 1856; 1857; VI-LABEL: global_atomic_and_i32_ret_offset: 1858; VI: ; %bb.0: 1859; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1860; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 1861; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1862; VI-NEXT: flat_atomic_and v0, v[0:1], v2 glc 1863; VI-NEXT: s_waitcnt vmcnt(0) 1864; VI-NEXT: buffer_wbinvl1_vol 1865; VI-NEXT: s_setpc_b64 s[30:31] 1866; 1867; GFX9-LABEL: global_atomic_and_i32_ret_offset: 1868; GFX9: ; %bb.0: 1869; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1870; GFX9-NEXT: global_atomic_and v0, v[0:1], v2, off offset:16 glc 1871; GFX9-NEXT: s_waitcnt vmcnt(0) 1872; GFX9-NEXT: buffer_wbinvl1_vol 1873; GFX9-NEXT: s_setpc_b64 s[30:31] 1874 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 1875 %result = atomicrmw and ptr addrspace(1) %gep, i32 %in seq_cst 1876 ret i32 %result 1877} 1878 1879define amdgpu_gfx void @global_atomic_and_i32_noret_scalar(ptr 
addrspace(1) inreg %ptr, i32 inreg %in) { 1880; SI-LABEL: global_atomic_and_i32_noret_scalar: 1881; SI: ; %bb.0: 1882; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1883; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 1884; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill 1885; SI-NEXT: s_mov_b64 exec, s[34:35] 1886; SI-NEXT: s_waitcnt expcnt(0) 1887; SI-NEXT: v_writelane_b32 v1, s6, 0 1888; SI-NEXT: v_writelane_b32 v1, s7, 1 1889; SI-NEXT: s_mov_b32 s34, s6 1890; SI-NEXT: s_mov_b32 s7, 0xf000 1891; SI-NEXT: s_mov_b32 s6, -1 1892; SI-NEXT: v_mov_b32_e32 v0, s34 1893; SI-NEXT: s_waitcnt vmcnt(0) 1894; SI-NEXT: buffer_atomic_and v0, off, s[4:7], 0 1895; SI-NEXT: s_waitcnt vmcnt(0) 1896; SI-NEXT: buffer_wbinvl1 1897; SI-NEXT: v_readlane_b32 s7, v1, 1 1898; SI-NEXT: v_readlane_b32 s6, v1, 0 1899; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 1900; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload 1901; SI-NEXT: s_mov_b64 exec, s[34:35] 1902; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 1903; SI-NEXT: s_setpc_b64 s[30:31] 1904; 1905; VI-LABEL: global_atomic_and_i32_noret_scalar: 1906; VI: ; %bb.0: 1907; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1908; VI-NEXT: v_mov_b32_e32 v0, s4 1909; VI-NEXT: v_mov_b32_e32 v1, s5 1910; VI-NEXT: v_mov_b32_e32 v2, s6 1911; VI-NEXT: flat_atomic_and v[0:1], v2 1912; VI-NEXT: s_waitcnt vmcnt(0) 1913; VI-NEXT: buffer_wbinvl1_vol 1914; VI-NEXT: s_setpc_b64 s[30:31] 1915; 1916; GFX9-LABEL: global_atomic_and_i32_noret_scalar: 1917; GFX9: ; %bb.0: 1918; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1919; GFX9-NEXT: v_mov_b32_e32 v0, 0 1920; GFX9-NEXT: v_mov_b32_e32 v1, s6 1921; GFX9-NEXT: global_atomic_and v0, v1, s[4:5] 1922; GFX9-NEXT: s_waitcnt vmcnt(0) 1923; GFX9-NEXT: buffer_wbinvl1_vol 1924; GFX9-NEXT: s_setpc_b64 s[30:31] 1925 %tmp0 = atomicrmw and ptr addrspace(1) %ptr, i32 %in seq_cst 1926 ret void 1927} 1928 1929define amdgpu_gfx void @global_atomic_and_i32_noret_offset_scalar(ptr addrspace(1) 
inreg %out, i32 inreg %in) { 1930; SI-LABEL: global_atomic_and_i32_noret_offset_scalar: 1931; SI: ; %bb.0: 1932; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1933; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 1934; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill 1935; SI-NEXT: s_mov_b64 exec, s[34:35] 1936; SI-NEXT: s_waitcnt expcnt(0) 1937; SI-NEXT: v_writelane_b32 v1, s6, 0 1938; SI-NEXT: v_writelane_b32 v1, s7, 1 1939; SI-NEXT: s_mov_b32 s34, s6 1940; SI-NEXT: s_mov_b32 s7, 0xf000 1941; SI-NEXT: s_mov_b32 s6, -1 1942; SI-NEXT: v_mov_b32_e32 v0, s34 1943; SI-NEXT: s_waitcnt vmcnt(0) 1944; SI-NEXT: buffer_atomic_and v0, off, s[4:7], 0 offset:16 1945; SI-NEXT: s_waitcnt vmcnt(0) 1946; SI-NEXT: buffer_wbinvl1 1947; SI-NEXT: v_readlane_b32 s7, v1, 1 1948; SI-NEXT: v_readlane_b32 s6, v1, 0 1949; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 1950; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload 1951; SI-NEXT: s_mov_b64 exec, s[34:35] 1952; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 1953; SI-NEXT: s_setpc_b64 s[30:31] 1954; 1955; VI-LABEL: global_atomic_and_i32_noret_offset_scalar: 1956; VI: ; %bb.0: 1957; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1958; VI-NEXT: s_add_u32 s34, s4, 16 1959; VI-NEXT: s_addc_u32 s35, s5, 0 1960; VI-NEXT: v_mov_b32_e32 v0, s34 1961; VI-NEXT: v_mov_b32_e32 v1, s35 1962; VI-NEXT: v_mov_b32_e32 v2, s6 1963; VI-NEXT: flat_atomic_and v[0:1], v2 1964; VI-NEXT: s_waitcnt vmcnt(0) 1965; VI-NEXT: buffer_wbinvl1_vol 1966; VI-NEXT: s_setpc_b64 s[30:31] 1967; 1968; GFX9-LABEL: global_atomic_and_i32_noret_offset_scalar: 1969; GFX9: ; %bb.0: 1970; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1971; GFX9-NEXT: v_mov_b32_e32 v0, 0 1972; GFX9-NEXT: v_mov_b32_e32 v1, s6 1973; GFX9-NEXT: global_atomic_and v0, v1, s[4:5] offset:16 1974; GFX9-NEXT: s_waitcnt vmcnt(0) 1975; GFX9-NEXT: buffer_wbinvl1_vol 1976; GFX9-NEXT: s_setpc_b64 s[30:31] 1977 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 1978 %tmp0 = 
atomicrmw and ptr addrspace(1) %gep, i32 %in seq_cst 1979 ret void 1980} 1981 1982define amdgpu_gfx i32 @global_atomic_and_i32_ret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) { 1983; SI-LABEL: global_atomic_and_i32_ret_scalar: 1984; SI: ; %bb.0: 1985; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1986; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 1987; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill 1988; SI-NEXT: s_mov_b64 exec, s[34:35] 1989; SI-NEXT: s_waitcnt expcnt(0) 1990; SI-NEXT: v_writelane_b32 v1, s6, 0 1991; SI-NEXT: v_writelane_b32 v1, s7, 1 1992; SI-NEXT: s_mov_b32 s34, s6 1993; SI-NEXT: s_mov_b32 s7, 0xf000 1994; SI-NEXT: s_mov_b32 s6, -1 1995; SI-NEXT: v_mov_b32_e32 v0, s34 1996; SI-NEXT: s_waitcnt vmcnt(0) 1997; SI-NEXT: buffer_atomic_and v0, off, s[4:7], 0 glc 1998; SI-NEXT: s_waitcnt vmcnt(0) 1999; SI-NEXT: buffer_wbinvl1 2000; SI-NEXT: v_readlane_b32 s7, v1, 1 2001; SI-NEXT: v_readlane_b32 s6, v1, 0 2002; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 2003; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload 2004; SI-NEXT: s_mov_b64 exec, s[34:35] 2005; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 2006; SI-NEXT: s_setpc_b64 s[30:31] 2007; 2008; VI-LABEL: global_atomic_and_i32_ret_scalar: 2009; VI: ; %bb.0: 2010; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2011; VI-NEXT: v_mov_b32_e32 v0, s4 2012; VI-NEXT: v_mov_b32_e32 v1, s5 2013; VI-NEXT: v_mov_b32_e32 v2, s6 2014; VI-NEXT: flat_atomic_and v0, v[0:1], v2 glc 2015; VI-NEXT: s_waitcnt vmcnt(0) 2016; VI-NEXT: buffer_wbinvl1_vol 2017; VI-NEXT: s_setpc_b64 s[30:31] 2018; 2019; GFX9-LABEL: global_atomic_and_i32_ret_scalar: 2020; GFX9: ; %bb.0: 2021; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2022; GFX9-NEXT: v_mov_b32_e32 v0, 0 2023; GFX9-NEXT: v_mov_b32_e32 v1, s6 2024; GFX9-NEXT: global_atomic_and v0, v0, v1, s[4:5] glc 2025; GFX9-NEXT: s_waitcnt vmcnt(0) 2026; GFX9-NEXT: buffer_wbinvl1_vol 2027; GFX9-NEXT: s_setpc_b64 s[30:31] 2028 %result = 
atomicrmw and ptr addrspace(1) %ptr, i32 %in seq_cst 2029 ret i32 %result 2030} 2031 2032define amdgpu_gfx i32 @global_atomic_and_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) { 2033; SI-LABEL: global_atomic_and_i32_ret_offset_scalar: 2034; SI: ; %bb.0: 2035; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2036; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 2037; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill 2038; SI-NEXT: s_mov_b64 exec, s[34:35] 2039; SI-NEXT: s_waitcnt expcnt(0) 2040; SI-NEXT: v_writelane_b32 v1, s6, 0 2041; SI-NEXT: v_writelane_b32 v1, s7, 1 2042; SI-NEXT: s_mov_b32 s34, s6 2043; SI-NEXT: s_mov_b32 s7, 0xf000 2044; SI-NEXT: s_mov_b32 s6, -1 2045; SI-NEXT: v_mov_b32_e32 v0, s34 2046; SI-NEXT: s_waitcnt vmcnt(0) 2047; SI-NEXT: buffer_atomic_and v0, off, s[4:7], 0 offset:16 glc 2048; SI-NEXT: s_waitcnt vmcnt(0) 2049; SI-NEXT: buffer_wbinvl1 2050; SI-NEXT: v_readlane_b32 s7, v1, 1 2051; SI-NEXT: v_readlane_b32 s6, v1, 0 2052; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 2053; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload 2054; SI-NEXT: s_mov_b64 exec, s[34:35] 2055; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 2056; SI-NEXT: s_setpc_b64 s[30:31] 2057; 2058; VI-LABEL: global_atomic_and_i32_ret_offset_scalar: 2059; VI: ; %bb.0: 2060; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2061; VI-NEXT: s_add_u32 s34, s4, 16 2062; VI-NEXT: s_addc_u32 s35, s5, 0 2063; VI-NEXT: v_mov_b32_e32 v0, s34 2064; VI-NEXT: v_mov_b32_e32 v1, s35 2065; VI-NEXT: v_mov_b32_e32 v2, s6 2066; VI-NEXT: flat_atomic_and v0, v[0:1], v2 glc 2067; VI-NEXT: s_waitcnt vmcnt(0) 2068; VI-NEXT: buffer_wbinvl1_vol 2069; VI-NEXT: s_setpc_b64 s[30:31] 2070; 2071; GFX9-LABEL: global_atomic_and_i32_ret_offset_scalar: 2072; GFX9: ; %bb.0: 2073; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2074; GFX9-NEXT: v_mov_b32_e32 v0, 0 2075; GFX9-NEXT: v_mov_b32_e32 v1, s6 2076; GFX9-NEXT: global_atomic_and v0, v0, v1, s[4:5] offset:16 glc 
2077; GFX9-NEXT: s_waitcnt vmcnt(0) 2078; GFX9-NEXT: buffer_wbinvl1_vol 2079; GFX9-NEXT: s_setpc_b64 s[30:31] 2080 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 2081 %result = atomicrmw and ptr addrspace(1) %gep, i32 %in seq_cst 2082 ret i32 %result 2083} 2084 2085define void @global_atomic_and_i32_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) { 2086; SI-LABEL: global_atomic_and_i32_noret_offset__amdgpu_no_remote_memory: 2087; SI: ; %bb.0: 2088; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2089; SI-NEXT: s_mov_b32 s6, 0 2090; SI-NEXT: s_mov_b32 s7, 0xf000 2091; SI-NEXT: s_mov_b32 s4, s6 2092; SI-NEXT: s_mov_b32 s5, s6 2093; SI-NEXT: buffer_atomic_and v2, v[0:1], s[4:7], 0 addr64 offset:16 2094; SI-NEXT: s_waitcnt vmcnt(0) 2095; SI-NEXT: buffer_wbinvl1 2096; SI-NEXT: s_waitcnt expcnt(0) 2097; SI-NEXT: s_setpc_b64 s[30:31] 2098; 2099; VI-LABEL: global_atomic_and_i32_noret_offset__amdgpu_no_remote_memory: 2100; VI: ; %bb.0: 2101; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2102; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 2103; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2104; VI-NEXT: flat_atomic_and v[0:1], v2 2105; VI-NEXT: s_waitcnt vmcnt(0) 2106; VI-NEXT: buffer_wbinvl1_vol 2107; VI-NEXT: s_setpc_b64 s[30:31] 2108; 2109; GFX9-LABEL: global_atomic_and_i32_noret_offset__amdgpu_no_remote_memory: 2110; GFX9: ; %bb.0: 2111; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2112; GFX9-NEXT: global_atomic_and v[0:1], v2, off offset:16 2113; GFX9-NEXT: s_waitcnt vmcnt(0) 2114; GFX9-NEXT: buffer_wbinvl1_vol 2115; GFX9-NEXT: s_setpc_b64 s[30:31] 2116 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 2117 %tmp0 = atomicrmw and ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 2118 ret void 2119} 2120 2121define i32 @global_atomic_and_i32_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) { 2122; SI-LABEL: global_atomic_and_i32_ret_offset__amdgpu_no_remote_memory: 2123; SI: ; %bb.0: 2124; SI-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2125; SI-NEXT: s_mov_b32 s6, 0 2126; SI-NEXT: s_mov_b32 s7, 0xf000 2127; SI-NEXT: s_mov_b32 s4, s6 2128; SI-NEXT: s_mov_b32 s5, s6 2129; SI-NEXT: buffer_atomic_and v2, v[0:1], s[4:7], 0 addr64 offset:16 glc 2130; SI-NEXT: s_waitcnt vmcnt(0) 2131; SI-NEXT: buffer_wbinvl1 2132; SI-NEXT: v_mov_b32_e32 v0, v2 2133; SI-NEXT: s_waitcnt expcnt(0) 2134; SI-NEXT: s_setpc_b64 s[30:31] 2135; 2136; VI-LABEL: global_atomic_and_i32_ret_offset__amdgpu_no_remote_memory: 2137; VI: ; %bb.0: 2138; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2139; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 2140; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2141; VI-NEXT: flat_atomic_and v0, v[0:1], v2 glc 2142; VI-NEXT: s_waitcnt vmcnt(0) 2143; VI-NEXT: buffer_wbinvl1_vol 2144; VI-NEXT: s_setpc_b64 s[30:31] 2145; 2146; GFX9-LABEL: global_atomic_and_i32_ret_offset__amdgpu_no_remote_memory: 2147; GFX9: ; %bb.0: 2148; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2149; GFX9-NEXT: global_atomic_and v0, v[0:1], v2, off offset:16 glc 2150; GFX9-NEXT: s_waitcnt vmcnt(0) 2151; GFX9-NEXT: buffer_wbinvl1_vol 2152; GFX9-NEXT: s_setpc_b64 s[30:31] 2153 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 2154 %result = atomicrmw and ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 2155 ret i32 %result 2156} 2157 2158; --------------------------------------------------------------------- 2159; atomicrmw nand 2160; --------------------------------------------------------------------- 2161 2162define void @global_atomic_nand_i32_noret(ptr addrspace(1) %ptr, i32 %in) { 2163; SI-LABEL: global_atomic_nand_i32_noret: 2164; SI: ; %bb.0: 2165; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2166; SI-NEXT: s_mov_b32 s6, 0 2167; SI-NEXT: s_mov_b32 s7, 0xf000 2168; SI-NEXT: s_mov_b32 s4, s6 2169; SI-NEXT: s_mov_b32 s5, s6 2170; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 2171; SI-NEXT: s_mov_b64 s[8:9], 0 2172; SI-NEXT: .LBB51_1: ; 
%atomicrmw.start 2173; SI-NEXT: ; =>This Inner Loop Header: Depth=1 2174; SI-NEXT: s_waitcnt vmcnt(0) 2175; SI-NEXT: v_and_b32_e32 v3, v4, v2 2176; SI-NEXT: v_not_b32_e32 v3, v3 2177; SI-NEXT: s_waitcnt expcnt(0) 2178; SI-NEXT: v_mov_b32_e32 v6, v4 2179; SI-NEXT: v_mov_b32_e32 v5, v3 2180; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc 2181; SI-NEXT: s_waitcnt vmcnt(0) 2182; SI-NEXT: buffer_wbinvl1 2183; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 2184; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 2185; SI-NEXT: v_mov_b32_e32 v4, v5 2186; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] 2187; SI-NEXT: s_cbranch_execnz .LBB51_1 2188; SI-NEXT: ; %bb.2: ; %atomicrmw.end 2189; SI-NEXT: s_or_b64 exec, exec, s[8:9] 2190; SI-NEXT: s_waitcnt expcnt(0) 2191; SI-NEXT: s_setpc_b64 s[30:31] 2192; 2193; VI-LABEL: global_atomic_nand_i32_noret: 2194; VI: ; %bb.0: 2195; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2196; VI-NEXT: flat_load_dword v4, v[0:1] 2197; VI-NEXT: s_mov_b64 s[4:5], 0 2198; VI-NEXT: .LBB51_1: ; %atomicrmw.start 2199; VI-NEXT: ; =>This Inner Loop Header: Depth=1 2200; VI-NEXT: s_waitcnt vmcnt(0) 2201; VI-NEXT: v_and_b32_e32 v3, v4, v2 2202; VI-NEXT: v_not_b32_e32 v3, v3 2203; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 2204; VI-NEXT: s_waitcnt vmcnt(0) 2205; VI-NEXT: buffer_wbinvl1_vol 2206; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 2207; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2208; VI-NEXT: v_mov_b32_e32 v4, v3 2209; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] 2210; VI-NEXT: s_cbranch_execnz .LBB51_1 2211; VI-NEXT: ; %bb.2: ; %atomicrmw.end 2212; VI-NEXT: s_or_b64 exec, exec, s[4:5] 2213; VI-NEXT: s_setpc_b64 s[30:31] 2214; 2215; GFX9-LABEL: global_atomic_nand_i32_noret: 2216; GFX9: ; %bb.0: 2217; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2218; GFX9-NEXT: global_load_dword v4, v[0:1], off 2219; GFX9-NEXT: s_mov_b64 s[4:5], 0 2220; GFX9-NEXT: .LBB51_1: ; %atomicrmw.start 2221; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 2222; GFX9-NEXT: s_waitcnt 
vmcnt(0) 2223; GFX9-NEXT: v_and_b32_e32 v3, v4, v2 2224; GFX9-NEXT: v_not_b32_e32 v3, v3 2225; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc 2226; GFX9-NEXT: s_waitcnt vmcnt(0) 2227; GFX9-NEXT: buffer_wbinvl1_vol 2228; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 2229; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2230; GFX9-NEXT: v_mov_b32_e32 v4, v3 2231; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 2232; GFX9-NEXT: s_cbranch_execnz .LBB51_1 2233; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 2234; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 2235; GFX9-NEXT: s_setpc_b64 s[30:31] 2236 %tmp0 = atomicrmw nand ptr addrspace(1) %ptr, i32 %in seq_cst 2237 ret void 2238} 2239 2240define void @global_atomic_nand_i32_noret_offset(ptr addrspace(1) %out, i32 %in) { 2241; SI-LABEL: global_atomic_nand_i32_noret_offset: 2242; SI: ; %bb.0: 2243; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2244; SI-NEXT: s_mov_b32 s6, 0 2245; SI-NEXT: s_mov_b32 s7, 0xf000 2246; SI-NEXT: s_mov_b32 s4, s6 2247; SI-NEXT: s_mov_b32 s5, s6 2248; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 2249; SI-NEXT: s_mov_b64 s[8:9], 0 2250; SI-NEXT: .LBB52_1: ; %atomicrmw.start 2251; SI-NEXT: ; =>This Inner Loop Header: Depth=1 2252; SI-NEXT: s_waitcnt vmcnt(0) 2253; SI-NEXT: v_and_b32_e32 v3, v4, v2 2254; SI-NEXT: v_not_b32_e32 v3, v3 2255; SI-NEXT: s_waitcnt expcnt(0) 2256; SI-NEXT: v_mov_b32_e32 v6, v4 2257; SI-NEXT: v_mov_b32_e32 v5, v3 2258; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc 2259; SI-NEXT: s_waitcnt vmcnt(0) 2260; SI-NEXT: buffer_wbinvl1 2261; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 2262; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 2263; SI-NEXT: v_mov_b32_e32 v4, v5 2264; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] 2265; SI-NEXT: s_cbranch_execnz .LBB52_1 2266; SI-NEXT: ; %bb.2: ; %atomicrmw.end 2267; SI-NEXT: s_or_b64 exec, exec, s[8:9] 2268; SI-NEXT: s_waitcnt expcnt(0) 2269; SI-NEXT: s_setpc_b64 s[30:31] 2270; 2271; VI-LABEL: 
global_atomic_nand_i32_noret_offset: 2272; VI: ; %bb.0: 2273; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2274; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 2275; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2276; VI-NEXT: flat_load_dword v4, v[0:1] 2277; VI-NEXT: s_mov_b64 s[4:5], 0 2278; VI-NEXT: .LBB52_1: ; %atomicrmw.start 2279; VI-NEXT: ; =>This Inner Loop Header: Depth=1 2280; VI-NEXT: s_waitcnt vmcnt(0) 2281; VI-NEXT: v_and_b32_e32 v3, v4, v2 2282; VI-NEXT: v_not_b32_e32 v3, v3 2283; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 2284; VI-NEXT: s_waitcnt vmcnt(0) 2285; VI-NEXT: buffer_wbinvl1_vol 2286; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 2287; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2288; VI-NEXT: v_mov_b32_e32 v4, v3 2289; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] 2290; VI-NEXT: s_cbranch_execnz .LBB52_1 2291; VI-NEXT: ; %bb.2: ; %atomicrmw.end 2292; VI-NEXT: s_or_b64 exec, exec, s[4:5] 2293; VI-NEXT: s_setpc_b64 s[30:31] 2294; 2295; GFX9-LABEL: global_atomic_nand_i32_noret_offset: 2296; GFX9: ; %bb.0: 2297; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2298; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 2299; GFX9-NEXT: s_mov_b64 s[4:5], 0 2300; GFX9-NEXT: .LBB52_1: ; %atomicrmw.start 2301; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 2302; GFX9-NEXT: s_waitcnt vmcnt(0) 2303; GFX9-NEXT: v_and_b32_e32 v3, v4, v2 2304; GFX9-NEXT: v_not_b32_e32 v3, v3 2305; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc 2306; GFX9-NEXT: s_waitcnt vmcnt(0) 2307; GFX9-NEXT: buffer_wbinvl1_vol 2308; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 2309; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2310; GFX9-NEXT: v_mov_b32_e32 v4, v3 2311; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 2312; GFX9-NEXT: s_cbranch_execnz .LBB52_1 2313; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 2314; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 2315; GFX9-NEXT: s_setpc_b64 s[30:31] 2316 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 2317 %tmp0 = atomicrmw nand ptr addrspace(1) 
%gep, i32 %in seq_cst 2318 ret void 2319} 2320 2321define i32 @global_atomic_nand_i32_ret(ptr addrspace(1) %ptr, i32 %in) { 2322; SI-LABEL: global_atomic_nand_i32_ret: 2323; SI: ; %bb.0: 2324; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2325; SI-NEXT: s_mov_b32 s6, 0 2326; SI-NEXT: s_mov_b32 s7, 0xf000 2327; SI-NEXT: s_mov_b32 s4, s6 2328; SI-NEXT: s_mov_b32 s5, s6 2329; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 2330; SI-NEXT: s_mov_b64 s[8:9], 0 2331; SI-NEXT: .LBB53_1: ; %atomicrmw.start 2332; SI-NEXT: ; =>This Inner Loop Header: Depth=1 2333; SI-NEXT: s_waitcnt vmcnt(0) 2334; SI-NEXT: v_mov_b32_e32 v5, v3 2335; SI-NEXT: s_waitcnt expcnt(0) 2336; SI-NEXT: v_and_b32_e32 v3, v5, v2 2337; SI-NEXT: v_not_b32_e32 v4, v3 2338; SI-NEXT: v_mov_b32_e32 v3, v4 2339; SI-NEXT: v_mov_b32_e32 v4, v5 2340; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc 2341; SI-NEXT: s_waitcnt vmcnt(0) 2342; SI-NEXT: buffer_wbinvl1 2343; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 2344; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 2345; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] 2346; SI-NEXT: s_cbranch_execnz .LBB53_1 2347; SI-NEXT: ; %bb.2: ; %atomicrmw.end 2348; SI-NEXT: s_or_b64 exec, exec, s[8:9] 2349; SI-NEXT: v_mov_b32_e32 v0, v3 2350; SI-NEXT: s_waitcnt expcnt(0) 2351; SI-NEXT: s_setpc_b64 s[30:31] 2352; 2353; VI-LABEL: global_atomic_nand_i32_ret: 2354; VI: ; %bb.0: 2355; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2356; VI-NEXT: flat_load_dword v3, v[0:1] 2357; VI-NEXT: s_mov_b64 s[4:5], 0 2358; VI-NEXT: .LBB53_1: ; %atomicrmw.start 2359; VI-NEXT: ; =>This Inner Loop Header: Depth=1 2360; VI-NEXT: s_waitcnt vmcnt(0) 2361; VI-NEXT: v_mov_b32_e32 v4, v3 2362; VI-NEXT: v_and_b32_e32 v3, v4, v2 2363; VI-NEXT: v_not_b32_e32 v3, v3 2364; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 2365; VI-NEXT: s_waitcnt vmcnt(0) 2366; VI-NEXT: buffer_wbinvl1_vol 2367; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 2368; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2369; VI-NEXT: 
s_andn2_b64 exec, exec, s[4:5] 2370; VI-NEXT: s_cbranch_execnz .LBB53_1 2371; VI-NEXT: ; %bb.2: ; %atomicrmw.end 2372; VI-NEXT: s_or_b64 exec, exec, s[4:5] 2373; VI-NEXT: v_mov_b32_e32 v0, v3 2374; VI-NEXT: s_setpc_b64 s[30:31] 2375; 2376; GFX9-LABEL: global_atomic_nand_i32_ret: 2377; GFX9: ; %bb.0: 2378; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2379; GFX9-NEXT: global_load_dword v3, v[0:1], off 2380; GFX9-NEXT: s_mov_b64 s[4:5], 0 2381; GFX9-NEXT: .LBB53_1: ; %atomicrmw.start 2382; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 2383; GFX9-NEXT: s_waitcnt vmcnt(0) 2384; GFX9-NEXT: v_mov_b32_e32 v4, v3 2385; GFX9-NEXT: v_and_b32_e32 v3, v4, v2 2386; GFX9-NEXT: v_not_b32_e32 v3, v3 2387; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc 2388; GFX9-NEXT: s_waitcnt vmcnt(0) 2389; GFX9-NEXT: buffer_wbinvl1_vol 2390; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 2391; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2392; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 2393; GFX9-NEXT: s_cbranch_execnz .LBB53_1 2394; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 2395; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 2396; GFX9-NEXT: v_mov_b32_e32 v0, v3 2397; GFX9-NEXT: s_setpc_b64 s[30:31] 2398 %result = atomicrmw nand ptr addrspace(1) %ptr, i32 %in seq_cst 2399 ret i32 %result 2400} 2401 2402define i32 @global_atomic_nand_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { 2403; SI-LABEL: global_atomic_nand_i32_ret_offset: 2404; SI: ; %bb.0: 2405; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2406; SI-NEXT: s_mov_b32 s6, 0 2407; SI-NEXT: s_mov_b32 s7, 0xf000 2408; SI-NEXT: s_mov_b32 s4, s6 2409; SI-NEXT: s_mov_b32 s5, s6 2410; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 2411; SI-NEXT: s_mov_b64 s[8:9], 0 2412; SI-NEXT: .LBB54_1: ; %atomicrmw.start 2413; SI-NEXT: ; =>This Inner Loop Header: Depth=1 2414; SI-NEXT: s_waitcnt vmcnt(0) 2415; SI-NEXT: v_mov_b32_e32 v5, v3 2416; SI-NEXT: s_waitcnt expcnt(0) 2417; SI-NEXT: v_and_b32_e32 v3, v5, v2 2418; SI-NEXT: 
v_not_b32_e32 v4, v3 2419; SI-NEXT: v_mov_b32_e32 v3, v4 2420; SI-NEXT: v_mov_b32_e32 v4, v5 2421; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc 2422; SI-NEXT: s_waitcnt vmcnt(0) 2423; SI-NEXT: buffer_wbinvl1 2424; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 2425; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 2426; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] 2427; SI-NEXT: s_cbranch_execnz .LBB54_1 2428; SI-NEXT: ; %bb.2: ; %atomicrmw.end 2429; SI-NEXT: s_or_b64 exec, exec, s[8:9] 2430; SI-NEXT: v_mov_b32_e32 v0, v3 2431; SI-NEXT: s_waitcnt expcnt(0) 2432; SI-NEXT: s_setpc_b64 s[30:31] 2433; 2434; VI-LABEL: global_atomic_nand_i32_ret_offset: 2435; VI: ; %bb.0: 2436; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2437; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0 2438; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 2439; VI-NEXT: flat_load_dword v0, v[3:4] 2440; VI-NEXT: s_mov_b64 s[4:5], 0 2441; VI-NEXT: .LBB54_1: ; %atomicrmw.start 2442; VI-NEXT: ; =>This Inner Loop Header: Depth=1 2443; VI-NEXT: s_waitcnt vmcnt(0) 2444; VI-NEXT: v_mov_b32_e32 v1, v0 2445; VI-NEXT: v_and_b32_e32 v0, v1, v2 2446; VI-NEXT: v_not_b32_e32 v0, v0 2447; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 2448; VI-NEXT: s_waitcnt vmcnt(0) 2449; VI-NEXT: buffer_wbinvl1_vol 2450; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 2451; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2452; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] 2453; VI-NEXT: s_cbranch_execnz .LBB54_1 2454; VI-NEXT: ; %bb.2: ; %atomicrmw.end 2455; VI-NEXT: s_or_b64 exec, exec, s[4:5] 2456; VI-NEXT: s_setpc_b64 s[30:31] 2457; 2458; GFX9-LABEL: global_atomic_nand_i32_ret_offset: 2459; GFX9: ; %bb.0: 2460; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2461; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 2462; GFX9-NEXT: s_mov_b64 s[4:5], 0 2463; GFX9-NEXT: .LBB54_1: ; %atomicrmw.start 2464; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 2465; GFX9-NEXT: s_waitcnt vmcnt(0) 2466; GFX9-NEXT: v_mov_b32_e32 v4, v3 2467; GFX9-NEXT: 
v_and_b32_e32 v3, v4, v2 2468; GFX9-NEXT: v_not_b32_e32 v3, v3 2469; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc 2470; GFX9-NEXT: s_waitcnt vmcnt(0) 2471; GFX9-NEXT: buffer_wbinvl1_vol 2472; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 2473; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2474; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 2475; GFX9-NEXT: s_cbranch_execnz .LBB54_1 2476; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 2477; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 2478; GFX9-NEXT: v_mov_b32_e32 v0, v3 2479; GFX9-NEXT: s_setpc_b64 s[30:31] 2480 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 2481 %result = atomicrmw nand ptr addrspace(1) %gep, i32 %in seq_cst 2482 ret i32 %result 2483} 2484 2485define amdgpu_gfx void @global_atomic_nand_i32_noret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) { 2486; SI-LABEL: global_atomic_nand_i32_noret_scalar: 2487; SI: ; %bb.0: 2488; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2489; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 2490; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill 2491; SI-NEXT: s_mov_b64 exec, s[34:35] 2492; SI-NEXT: s_waitcnt expcnt(0) 2493; SI-NEXT: v_writelane_b32 v4, s6, 0 2494; SI-NEXT: v_writelane_b32 v4, s7, 1 2495; SI-NEXT: s_mov_b32 s34, s6 2496; SI-NEXT: s_mov_b32 s7, 0xf000 2497; SI-NEXT: s_mov_b32 s6, -1 2498; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 2499; SI-NEXT: s_mov_b64 s[36:37], 0 2500; SI-NEXT: .LBB55_1: ; %atomicrmw.start 2501; SI-NEXT: ; =>This Inner Loop Header: Depth=1 2502; SI-NEXT: s_waitcnt vmcnt(0) 2503; SI-NEXT: v_and_b32_e32 v0, s34, v1 2504; SI-NEXT: v_not_b32_e32 v0, v0 2505; SI-NEXT: s_waitcnt expcnt(0) 2506; SI-NEXT: v_mov_b32_e32 v3, v1 2507; SI-NEXT: v_mov_b32_e32 v2, v0 2508; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc 2509; SI-NEXT: s_waitcnt vmcnt(0) 2510; SI-NEXT: buffer_wbinvl1 2511; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 2512; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] 2513; SI-NEXT: v_mov_b32_e32 v1, v2 
2514; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] 2515; SI-NEXT: s_cbranch_execnz .LBB55_1 2516; SI-NEXT: ; %bb.2: ; %atomicrmw.end 2517; SI-NEXT: s_or_b64 exec, exec, s[36:37] 2518; SI-NEXT: v_readlane_b32 s7, v4, 1 2519; SI-NEXT: v_readlane_b32 s6, v4, 0 2520; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 2521; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload 2522; SI-NEXT: s_mov_b64 exec, s[34:35] 2523; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 2524; SI-NEXT: s_setpc_b64 s[30:31] 2525; 2526; VI-LABEL: global_atomic_nand_i32_noret_scalar: 2527; VI: ; %bb.0: 2528; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2529; VI-NEXT: v_mov_b32_e32 v0, s4 2530; VI-NEXT: v_mov_b32_e32 v1, s5 2531; VI-NEXT: flat_load_dword v3, v[0:1] 2532; VI-NEXT: s_mov_b64 s[34:35], 0 2533; VI-NEXT: .LBB55_1: ; %atomicrmw.start 2534; VI-NEXT: ; =>This Inner Loop Header: Depth=1 2535; VI-NEXT: s_waitcnt vmcnt(0) 2536; VI-NEXT: v_and_b32_e32 v2, s6, v3 2537; VI-NEXT: v_not_b32_e32 v2, v2 2538; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2539; VI-NEXT: s_waitcnt vmcnt(0) 2540; VI-NEXT: buffer_wbinvl1_vol 2541; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 2542; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 2543; VI-NEXT: v_mov_b32_e32 v3, v2 2544; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] 2545; VI-NEXT: s_cbranch_execnz .LBB55_1 2546; VI-NEXT: ; %bb.2: ; %atomicrmw.end 2547; VI-NEXT: s_or_b64 exec, exec, s[34:35] 2548; VI-NEXT: s_setpc_b64 s[30:31] 2549; 2550; GFX9-LABEL: global_atomic_nand_i32_noret_scalar: 2551; GFX9: ; %bb.0: 2552; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2553; GFX9-NEXT: v_mov_b32_e32 v2, 0 2554; GFX9-NEXT: global_load_dword v1, v2, s[4:5] 2555; GFX9-NEXT: s_mov_b64 s[34:35], 0 2556; GFX9-NEXT: .LBB55_1: ; %atomicrmw.start 2557; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 2558; GFX9-NEXT: s_waitcnt vmcnt(0) 2559; GFX9-NEXT: v_and_b32_e32 v0, s6, v1 2560; GFX9-NEXT: v_not_b32_e32 v0, v0 2561; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] glc 
2562; GFX9-NEXT: s_waitcnt vmcnt(0) 2563; GFX9-NEXT: buffer_wbinvl1_vol 2564; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 2565; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 2566; GFX9-NEXT: v_mov_b32_e32 v1, v0 2567; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] 2568; GFX9-NEXT: s_cbranch_execnz .LBB55_1 2569; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 2570; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] 2571; GFX9-NEXT: s_setpc_b64 s[30:31] 2572 %tmp0 = atomicrmw nand ptr addrspace(1) %ptr, i32 %in seq_cst 2573 ret void 2574} 2575 2576define amdgpu_gfx void @global_atomic_nand_i32_noret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) { 2577; SI-LABEL: global_atomic_nand_i32_noret_offset_scalar: 2578; SI: ; %bb.0: 2579; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2580; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 2581; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill 2582; SI-NEXT: s_mov_b64 exec, s[34:35] 2583; SI-NEXT: s_waitcnt expcnt(0) 2584; SI-NEXT: v_writelane_b32 v4, s6, 0 2585; SI-NEXT: v_writelane_b32 v4, s7, 1 2586; SI-NEXT: s_mov_b32 s34, s6 2587; SI-NEXT: s_mov_b32 s7, 0xf000 2588; SI-NEXT: s_mov_b32 s6, -1 2589; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16 2590; SI-NEXT: s_mov_b64 s[36:37], 0 2591; SI-NEXT: .LBB56_1: ; %atomicrmw.start 2592; SI-NEXT: ; =>This Inner Loop Header: Depth=1 2593; SI-NEXT: s_waitcnt vmcnt(0) 2594; SI-NEXT: v_and_b32_e32 v0, s34, v1 2595; SI-NEXT: v_not_b32_e32 v0, v0 2596; SI-NEXT: s_waitcnt expcnt(0) 2597; SI-NEXT: v_mov_b32_e32 v3, v1 2598; SI-NEXT: v_mov_b32_e32 v2, v0 2599; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc 2600; SI-NEXT: s_waitcnt vmcnt(0) 2601; SI-NEXT: buffer_wbinvl1 2602; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 2603; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] 2604; SI-NEXT: v_mov_b32_e32 v1, v2 2605; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] 2606; SI-NEXT: s_cbranch_execnz .LBB56_1 2607; SI-NEXT: ; %bb.2: ; %atomicrmw.end 2608; SI-NEXT: s_or_b64 exec, exec, 
s[36:37] 2609; SI-NEXT: v_readlane_b32 s7, v4, 1 2610; SI-NEXT: v_readlane_b32 s6, v4, 0 2611; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 2612; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload 2613; SI-NEXT: s_mov_b64 exec, s[34:35] 2614; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 2615; SI-NEXT: s_setpc_b64 s[30:31] 2616; 2617; VI-LABEL: global_atomic_nand_i32_noret_offset_scalar: 2618; VI: ; %bb.0: 2619; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2620; VI-NEXT: s_add_u32 s34, s4, 16 2621; VI-NEXT: s_addc_u32 s35, s5, 0 2622; VI-NEXT: v_mov_b32_e32 v0, s34 2623; VI-NEXT: v_mov_b32_e32 v1, s35 2624; VI-NEXT: flat_load_dword v3, v[0:1] 2625; VI-NEXT: s_mov_b64 s[34:35], 0 2626; VI-NEXT: .LBB56_1: ; %atomicrmw.start 2627; VI-NEXT: ; =>This Inner Loop Header: Depth=1 2628; VI-NEXT: s_waitcnt vmcnt(0) 2629; VI-NEXT: v_and_b32_e32 v2, s6, v3 2630; VI-NEXT: v_not_b32_e32 v2, v2 2631; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2632; VI-NEXT: s_waitcnt vmcnt(0) 2633; VI-NEXT: buffer_wbinvl1_vol 2634; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 2635; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 2636; VI-NEXT: v_mov_b32_e32 v3, v2 2637; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] 2638; VI-NEXT: s_cbranch_execnz .LBB56_1 2639; VI-NEXT: ; %bb.2: ; %atomicrmw.end 2640; VI-NEXT: s_or_b64 exec, exec, s[34:35] 2641; VI-NEXT: s_setpc_b64 s[30:31] 2642; 2643; GFX9-LABEL: global_atomic_nand_i32_noret_offset_scalar: 2644; GFX9: ; %bb.0: 2645; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2646; GFX9-NEXT: v_mov_b32_e32 v2, 0 2647; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16 2648; GFX9-NEXT: s_mov_b64 s[34:35], 0 2649; GFX9-NEXT: .LBB56_1: ; %atomicrmw.start 2650; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 2651; GFX9-NEXT: s_waitcnt vmcnt(0) 2652; GFX9-NEXT: v_and_b32_e32 v0, s6, v1 2653; GFX9-NEXT: v_not_b32_e32 v0, v0 2654; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 glc 2655; GFX9-NEXT: s_waitcnt vmcnt(0) 2656; GFX9-NEXT: 
buffer_wbinvl1_vol 2657; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 2658; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 2659; GFX9-NEXT: v_mov_b32_e32 v1, v0 2660; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] 2661; GFX9-NEXT: s_cbranch_execnz .LBB56_1 2662; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 2663; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] 2664; GFX9-NEXT: s_setpc_b64 s[30:31] 2665 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 2666 %tmp0 = atomicrmw nand ptr addrspace(1) %gep, i32 %in seq_cst 2667 ret void 2668} 2669 2670define amdgpu_gfx i32 @global_atomic_nand_i32_ret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) { 2671; SI-LABEL: global_atomic_nand_i32_ret_scalar: 2672; SI: ; %bb.0: 2673; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2674; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 2675; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill 2676; SI-NEXT: s_mov_b64 exec, s[34:35] 2677; SI-NEXT: s_waitcnt expcnt(0) 2678; SI-NEXT: v_writelane_b32 v3, s6, 0 2679; SI-NEXT: v_writelane_b32 v3, s7, 1 2680; SI-NEXT: s_mov_b32 s34, s6 2681; SI-NEXT: s_mov_b32 s7, 0xf000 2682; SI-NEXT: s_mov_b32 s6, -1 2683; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 2684; SI-NEXT: s_mov_b64 s[36:37], 0 2685; SI-NEXT: .LBB57_1: ; %atomicrmw.start 2686; SI-NEXT: ; =>This Inner Loop Header: Depth=1 2687; SI-NEXT: s_waitcnt vmcnt(0) 2688; SI-NEXT: v_mov_b32_e32 v2, v0 2689; SI-NEXT: s_waitcnt expcnt(0) 2690; SI-NEXT: v_and_b32_e32 v0, s34, v2 2691; SI-NEXT: v_not_b32_e32 v1, v0 2692; SI-NEXT: v_mov_b32_e32 v0, v1 2693; SI-NEXT: v_mov_b32_e32 v1, v2 2694; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc 2695; SI-NEXT: s_waitcnt vmcnt(0) 2696; SI-NEXT: buffer_wbinvl1 2697; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 2698; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] 2699; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] 2700; SI-NEXT: s_cbranch_execnz .LBB57_1 2701; SI-NEXT: ; %bb.2: ; %atomicrmw.end 2702; SI-NEXT: s_or_b64 exec, exec, s[36:37] 2703; SI-NEXT: 
v_readlane_b32 s7, v3, 1 2704; SI-NEXT: v_readlane_b32 s6, v3, 0 2705; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 2706; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload 2707; SI-NEXT: s_mov_b64 exec, s[34:35] 2708; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 2709; SI-NEXT: s_setpc_b64 s[30:31] 2710; 2711; VI-LABEL: global_atomic_nand_i32_ret_scalar: 2712; VI: ; %bb.0: 2713; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2714; VI-NEXT: v_mov_b32_e32 v0, s4 2715; VI-NEXT: v_mov_b32_e32 v1, s5 2716; VI-NEXT: flat_load_dword v0, v[0:1] 2717; VI-NEXT: v_mov_b32_e32 v1, s4 2718; VI-NEXT: s_mov_b64 s[34:35], 0 2719; VI-NEXT: v_mov_b32_e32 v2, s5 2720; VI-NEXT: .LBB57_1: ; %atomicrmw.start 2721; VI-NEXT: ; =>This Inner Loop Header: Depth=1 2722; VI-NEXT: s_waitcnt vmcnt(0) 2723; VI-NEXT: v_mov_b32_e32 v4, v0 2724; VI-NEXT: v_and_b32_e32 v0, s6, v4 2725; VI-NEXT: v_not_b32_e32 v3, v0 2726; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc 2727; VI-NEXT: s_waitcnt vmcnt(0) 2728; VI-NEXT: buffer_wbinvl1_vol 2729; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 2730; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 2731; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] 2732; VI-NEXT: s_cbranch_execnz .LBB57_1 2733; VI-NEXT: ; %bb.2: ; %atomicrmw.end 2734; VI-NEXT: s_or_b64 exec, exec, s[34:35] 2735; VI-NEXT: s_setpc_b64 s[30:31] 2736; 2737; GFX9-LABEL: global_atomic_nand_i32_ret_scalar: 2738; GFX9: ; %bb.0: 2739; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2740; GFX9-NEXT: v_mov_b32_e32 v1, 0 2741; GFX9-NEXT: global_load_dword v0, v1, s[4:5] 2742; GFX9-NEXT: s_mov_b64 s[34:35], 0 2743; GFX9-NEXT: .LBB57_1: ; %atomicrmw.start 2744; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 2745; GFX9-NEXT: s_waitcnt vmcnt(0) 2746; GFX9-NEXT: v_mov_b32_e32 v3, v0 2747; GFX9-NEXT: v_and_b32_e32 v0, s6, v3 2748; GFX9-NEXT: v_not_b32_e32 v2, v0 2749; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc 2750; GFX9-NEXT: s_waitcnt vmcnt(0) 2751; GFX9-NEXT: buffer_wbinvl1_vol 2752; 
GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 2753; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 2754; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] 2755; GFX9-NEXT: s_cbranch_execnz .LBB57_1 2756; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 2757; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] 2758; GFX9-NEXT: s_setpc_b64 s[30:31] 2759 %result = atomicrmw nand ptr addrspace(1) %ptr, i32 %in seq_cst 2760 ret i32 %result 2761} 2762 2763define amdgpu_gfx i32 @global_atomic_nand_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) { 2764; SI-LABEL: global_atomic_nand_i32_ret_offset_scalar: 2765; SI: ; %bb.0: 2766; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2767; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 2768; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill 2769; SI-NEXT: s_mov_b64 exec, s[34:35] 2770; SI-NEXT: s_waitcnt expcnt(0) 2771; SI-NEXT: v_writelane_b32 v3, s6, 0 2772; SI-NEXT: v_writelane_b32 v3, s7, 1 2773; SI-NEXT: s_mov_b32 s34, s6 2774; SI-NEXT: s_mov_b32 s7, 0xf000 2775; SI-NEXT: s_mov_b32 s6, -1 2776; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16 2777; SI-NEXT: s_mov_b64 s[36:37], 0 2778; SI-NEXT: .LBB58_1: ; %atomicrmw.start 2779; SI-NEXT: ; =>This Inner Loop Header: Depth=1 2780; SI-NEXT: s_waitcnt vmcnt(0) 2781; SI-NEXT: v_mov_b32_e32 v2, v0 2782; SI-NEXT: s_waitcnt expcnt(0) 2783; SI-NEXT: v_and_b32_e32 v0, s34, v2 2784; SI-NEXT: v_not_b32_e32 v1, v0 2785; SI-NEXT: v_mov_b32_e32 v0, v1 2786; SI-NEXT: v_mov_b32_e32 v1, v2 2787; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc 2788; SI-NEXT: s_waitcnt vmcnt(0) 2789; SI-NEXT: buffer_wbinvl1 2790; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 2791; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] 2792; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] 2793; SI-NEXT: s_cbranch_execnz .LBB58_1 2794; SI-NEXT: ; %bb.2: ; %atomicrmw.end 2795; SI-NEXT: s_or_b64 exec, exec, s[36:37] 2796; SI-NEXT: v_readlane_b32 s7, v3, 1 2797; SI-NEXT: v_readlane_b32 s6, v3, 0 2798; SI-NEXT: 
s_xor_saveexec_b64 s[34:35], -1 2799; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload 2800; SI-NEXT: s_mov_b64 exec, s[34:35] 2801; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 2802; SI-NEXT: s_setpc_b64 s[30:31] 2803; 2804; VI-LABEL: global_atomic_nand_i32_ret_offset_scalar: 2805; VI: ; %bb.0: 2806; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2807; VI-NEXT: s_add_u32 s34, s4, 16 2808; VI-NEXT: s_addc_u32 s35, s5, 0 2809; VI-NEXT: v_mov_b32_e32 v1, s34 2810; VI-NEXT: v_mov_b32_e32 v2, s35 2811; VI-NEXT: flat_load_dword v0, v[1:2] 2812; VI-NEXT: s_mov_b64 s[34:35], 0 2813; VI-NEXT: .LBB58_1: ; %atomicrmw.start 2814; VI-NEXT: ; =>This Inner Loop Header: Depth=1 2815; VI-NEXT: s_waitcnt vmcnt(0) 2816; VI-NEXT: v_mov_b32_e32 v4, v0 2817; VI-NEXT: v_and_b32_e32 v0, s6, v4 2818; VI-NEXT: v_not_b32_e32 v3, v0 2819; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc 2820; VI-NEXT: s_waitcnt vmcnt(0) 2821; VI-NEXT: buffer_wbinvl1_vol 2822; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 2823; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 2824; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] 2825; VI-NEXT: s_cbranch_execnz .LBB58_1 2826; VI-NEXT: ; %bb.2: ; %atomicrmw.end 2827; VI-NEXT: s_or_b64 exec, exec, s[34:35] 2828; VI-NEXT: s_setpc_b64 s[30:31] 2829; 2830; GFX9-LABEL: global_atomic_nand_i32_ret_offset_scalar: 2831; GFX9: ; %bb.0: 2832; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2833; GFX9-NEXT: v_mov_b32_e32 v1, 0 2834; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16 2835; GFX9-NEXT: s_mov_b64 s[34:35], 0 2836; GFX9-NEXT: .LBB58_1: ; %atomicrmw.start 2837; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 2838; GFX9-NEXT: s_waitcnt vmcnt(0) 2839; GFX9-NEXT: v_mov_b32_e32 v3, v0 2840; GFX9-NEXT: v_and_b32_e32 v0, s6, v3 2841; GFX9-NEXT: v_not_b32_e32 v2, v0 2842; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] offset:16 glc 2843; GFX9-NEXT: s_waitcnt vmcnt(0) 2844; GFX9-NEXT: buffer_wbinvl1_vol 2845; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 2846; 
GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 2847; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] 2848; GFX9-NEXT: s_cbranch_execnz .LBB58_1 2849; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 2850; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] 2851; GFX9-NEXT: s_setpc_b64 s[30:31] 2852 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 2853 %result = atomicrmw nand ptr addrspace(1) %gep, i32 %in seq_cst 2854 ret i32 %result 2855} 2856 2857define void @global_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) { 2858; SI-LABEL: global_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory: 2859; SI: ; %bb.0: 2860; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2861; SI-NEXT: s_mov_b32 s6, 0 2862; SI-NEXT: s_mov_b32 s7, 0xf000 2863; SI-NEXT: s_mov_b32 s4, s6 2864; SI-NEXT: s_mov_b32 s5, s6 2865; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 2866; SI-NEXT: s_mov_b64 s[8:9], 0 2867; SI-NEXT: .LBB59_1: ; %atomicrmw.start 2868; SI-NEXT: ; =>This Inner Loop Header: Depth=1 2869; SI-NEXT: s_waitcnt vmcnt(0) 2870; SI-NEXT: v_and_b32_e32 v3, v4, v2 2871; SI-NEXT: v_not_b32_e32 v3, v3 2872; SI-NEXT: s_waitcnt expcnt(0) 2873; SI-NEXT: v_mov_b32_e32 v6, v4 2874; SI-NEXT: v_mov_b32_e32 v5, v3 2875; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc 2876; SI-NEXT: s_waitcnt vmcnt(0) 2877; SI-NEXT: buffer_wbinvl1 2878; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 2879; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 2880; SI-NEXT: v_mov_b32_e32 v4, v5 2881; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] 2882; SI-NEXT: s_cbranch_execnz .LBB59_1 2883; SI-NEXT: ; %bb.2: ; %atomicrmw.end 2884; SI-NEXT: s_or_b64 exec, exec, s[8:9] 2885; SI-NEXT: s_waitcnt expcnt(0) 2886; SI-NEXT: s_setpc_b64 s[30:31] 2887; 2888; VI-LABEL: global_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory: 2889; VI: ; %bb.0: 2890; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2891; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 2892; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, 
v1, vcc 2893; VI-NEXT: flat_load_dword v4, v[0:1] 2894; VI-NEXT: s_mov_b64 s[4:5], 0 2895; VI-NEXT: .LBB59_1: ; %atomicrmw.start 2896; VI-NEXT: ; =>This Inner Loop Header: Depth=1 2897; VI-NEXT: s_waitcnt vmcnt(0) 2898; VI-NEXT: v_and_b32_e32 v3, v4, v2 2899; VI-NEXT: v_not_b32_e32 v3, v3 2900; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 2901; VI-NEXT: s_waitcnt vmcnt(0) 2902; VI-NEXT: buffer_wbinvl1_vol 2903; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 2904; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2905; VI-NEXT: v_mov_b32_e32 v4, v3 2906; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] 2907; VI-NEXT: s_cbranch_execnz .LBB59_1 2908; VI-NEXT: ; %bb.2: ; %atomicrmw.end 2909; VI-NEXT: s_or_b64 exec, exec, s[4:5] 2910; VI-NEXT: s_setpc_b64 s[30:31] 2911; 2912; GFX9-LABEL: global_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory: 2913; GFX9: ; %bb.0: 2914; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2915; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 2916; GFX9-NEXT: s_mov_b64 s[4:5], 0 2917; GFX9-NEXT: .LBB59_1: ; %atomicrmw.start 2918; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 2919; GFX9-NEXT: s_waitcnt vmcnt(0) 2920; GFX9-NEXT: v_and_b32_e32 v3, v4, v2 2921; GFX9-NEXT: v_not_b32_e32 v3, v3 2922; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc 2923; GFX9-NEXT: s_waitcnt vmcnt(0) 2924; GFX9-NEXT: buffer_wbinvl1_vol 2925; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 2926; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2927; GFX9-NEXT: v_mov_b32_e32 v4, v3 2928; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 2929; GFX9-NEXT: s_cbranch_execnz .LBB59_1 2930; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 2931; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 2932; GFX9-NEXT: s_setpc_b64 s[30:31] 2933 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 2934 %tmp0 = atomicrmw nand ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 2935 ret void 2936} 2937 2938define i32 @global_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, 
i32 %in) { 2939; SI-LABEL: global_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory: 2940; SI: ; %bb.0: 2941; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2942; SI-NEXT: s_mov_b32 s6, 0 2943; SI-NEXT: s_mov_b32 s7, 0xf000 2944; SI-NEXT: s_mov_b32 s4, s6 2945; SI-NEXT: s_mov_b32 s5, s6 2946; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 2947; SI-NEXT: s_mov_b64 s[8:9], 0 2948; SI-NEXT: .LBB60_1: ; %atomicrmw.start 2949; SI-NEXT: ; =>This Inner Loop Header: Depth=1 2950; SI-NEXT: s_waitcnt vmcnt(0) 2951; SI-NEXT: v_mov_b32_e32 v5, v3 2952; SI-NEXT: s_waitcnt expcnt(0) 2953; SI-NEXT: v_and_b32_e32 v3, v5, v2 2954; SI-NEXT: v_not_b32_e32 v4, v3 2955; SI-NEXT: v_mov_b32_e32 v3, v4 2956; SI-NEXT: v_mov_b32_e32 v4, v5 2957; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc 2958; SI-NEXT: s_waitcnt vmcnt(0) 2959; SI-NEXT: buffer_wbinvl1 2960; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 2961; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 2962; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] 2963; SI-NEXT: s_cbranch_execnz .LBB60_1 2964; SI-NEXT: ; %bb.2: ; %atomicrmw.end 2965; SI-NEXT: s_or_b64 exec, exec, s[8:9] 2966; SI-NEXT: v_mov_b32_e32 v0, v3 2967; SI-NEXT: s_waitcnt expcnt(0) 2968; SI-NEXT: s_setpc_b64 s[30:31] 2969; 2970; VI-LABEL: global_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory: 2971; VI: ; %bb.0: 2972; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2973; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0 2974; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 2975; VI-NEXT: flat_load_dword v0, v[3:4] 2976; VI-NEXT: s_mov_b64 s[4:5], 0 2977; VI-NEXT: .LBB60_1: ; %atomicrmw.start 2978; VI-NEXT: ; =>This Inner Loop Header: Depth=1 2979; VI-NEXT: s_waitcnt vmcnt(0) 2980; VI-NEXT: v_mov_b32_e32 v1, v0 2981; VI-NEXT: v_and_b32_e32 v0, v1, v2 2982; VI-NEXT: v_not_b32_e32 v0, v0 2983; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 2984; VI-NEXT: s_waitcnt vmcnt(0) 2985; VI-NEXT: buffer_wbinvl1_vol 2986; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 
2987; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2988; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] 2989; VI-NEXT: s_cbranch_execnz .LBB60_1 2990; VI-NEXT: ; %bb.2: ; %atomicrmw.end 2991; VI-NEXT: s_or_b64 exec, exec, s[4:5] 2992; VI-NEXT: s_setpc_b64 s[30:31] 2993; 2994; GFX9-LABEL: global_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory: 2995; GFX9: ; %bb.0: 2996; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2997; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 2998; GFX9-NEXT: s_mov_b64 s[4:5], 0 2999; GFX9-NEXT: .LBB60_1: ; %atomicrmw.start 3000; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 3001; GFX9-NEXT: s_waitcnt vmcnt(0) 3002; GFX9-NEXT: v_mov_b32_e32 v4, v3 3003; GFX9-NEXT: v_and_b32_e32 v3, v4, v2 3004; GFX9-NEXT: v_not_b32_e32 v3, v3 3005; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc 3006; GFX9-NEXT: s_waitcnt vmcnt(0) 3007; GFX9-NEXT: buffer_wbinvl1_vol 3008; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 3009; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 3010; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 3011; GFX9-NEXT: s_cbranch_execnz .LBB60_1 3012; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 3013; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 3014; GFX9-NEXT: v_mov_b32_e32 v0, v3 3015; GFX9-NEXT: s_setpc_b64 s[30:31] 3016 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 3017 %result = atomicrmw nand ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 3018 ret i32 %result 3019} 3020 3021; --------------------------------------------------------------------- 3022; atomicrmw or 3023; --------------------------------------------------------------------- 3024 3025define void @global_atomic_or_i32_noret(ptr addrspace(1) %ptr, i32 %in) { 3026; SI-LABEL: global_atomic_or_i32_noret: 3027; SI: ; %bb.0: 3028; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3029; SI-NEXT: s_mov_b32 s6, 0 3030; SI-NEXT: s_mov_b32 s7, 0xf000 3031; SI-NEXT: s_mov_b32 s4, s6 3032; SI-NEXT: s_mov_b32 s5, s6 3033; SI-NEXT: buffer_atomic_or v2, v[0:1], 
s[4:7], 0 addr64 3034; SI-NEXT: s_waitcnt vmcnt(0) 3035; SI-NEXT: buffer_wbinvl1 3036; SI-NEXT: s_waitcnt expcnt(0) 3037; SI-NEXT: s_setpc_b64 s[30:31] 3038; 3039; VI-LABEL: global_atomic_or_i32_noret: 3040; VI: ; %bb.0: 3041; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3042; VI-NEXT: flat_atomic_or v[0:1], v2 3043; VI-NEXT: s_waitcnt vmcnt(0) 3044; VI-NEXT: buffer_wbinvl1_vol 3045; VI-NEXT: s_setpc_b64 s[30:31] 3046; 3047; GFX9-LABEL: global_atomic_or_i32_noret: 3048; GFX9: ; %bb.0: 3049; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3050; GFX9-NEXT: global_atomic_or v[0:1], v2, off 3051; GFX9-NEXT: s_waitcnt vmcnt(0) 3052; GFX9-NEXT: buffer_wbinvl1_vol 3053; GFX9-NEXT: s_setpc_b64 s[30:31] 3054 %tmp0 = atomicrmw or ptr addrspace(1) %ptr, i32 %in seq_cst 3055 ret void 3056} 3057 3058define void @global_atomic_or_i32_noret_offset(ptr addrspace(1) %out, i32 %in) { 3059; SI-LABEL: global_atomic_or_i32_noret_offset: 3060; SI: ; %bb.0: 3061; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3062; SI-NEXT: s_mov_b32 s6, 0 3063; SI-NEXT: s_mov_b32 s7, 0xf000 3064; SI-NEXT: s_mov_b32 s4, s6 3065; SI-NEXT: s_mov_b32 s5, s6 3066; SI-NEXT: buffer_atomic_or v2, v[0:1], s[4:7], 0 addr64 offset:16 3067; SI-NEXT: s_waitcnt vmcnt(0) 3068; SI-NEXT: buffer_wbinvl1 3069; SI-NEXT: s_waitcnt expcnt(0) 3070; SI-NEXT: s_setpc_b64 s[30:31] 3071; 3072; VI-LABEL: global_atomic_or_i32_noret_offset: 3073; VI: ; %bb.0: 3074; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3075; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 3076; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3077; VI-NEXT: flat_atomic_or v[0:1], v2 3078; VI-NEXT: s_waitcnt vmcnt(0) 3079; VI-NEXT: buffer_wbinvl1_vol 3080; VI-NEXT: s_setpc_b64 s[30:31] 3081; 3082; GFX9-LABEL: global_atomic_or_i32_noret_offset: 3083; GFX9: ; %bb.0: 3084; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3085; GFX9-NEXT: global_atomic_or v[0:1], v2, off offset:16 3086; GFX9-NEXT: s_waitcnt vmcnt(0) 3087; GFX9-NEXT: buffer_wbinvl1_vol 3088; 
GFX9-NEXT: s_setpc_b64 s[30:31] 3089 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 3090 %tmp0 = atomicrmw or ptr addrspace(1) %gep, i32 %in seq_cst 3091 ret void 3092} 3093 3094define i32 @global_atomic_or_i32_ret(ptr addrspace(1) %ptr, i32 %in) { 3095; SI-LABEL: global_atomic_or_i32_ret: 3096; SI: ; %bb.0: 3097; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3098; SI-NEXT: s_mov_b32 s6, 0 3099; SI-NEXT: s_mov_b32 s7, 0xf000 3100; SI-NEXT: s_mov_b32 s4, s6 3101; SI-NEXT: s_mov_b32 s5, s6 3102; SI-NEXT: buffer_atomic_or v2, v[0:1], s[4:7], 0 addr64 glc 3103; SI-NEXT: s_waitcnt vmcnt(0) 3104; SI-NEXT: buffer_wbinvl1 3105; SI-NEXT: v_mov_b32_e32 v0, v2 3106; SI-NEXT: s_waitcnt expcnt(0) 3107; SI-NEXT: s_setpc_b64 s[30:31] 3108; 3109; VI-LABEL: global_atomic_or_i32_ret: 3110; VI: ; %bb.0: 3111; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3112; VI-NEXT: flat_atomic_or v0, v[0:1], v2 glc 3113; VI-NEXT: s_waitcnt vmcnt(0) 3114; VI-NEXT: buffer_wbinvl1_vol 3115; VI-NEXT: s_setpc_b64 s[30:31] 3116; 3117; GFX9-LABEL: global_atomic_or_i32_ret: 3118; GFX9: ; %bb.0: 3119; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3120; GFX9-NEXT: global_atomic_or v0, v[0:1], v2, off glc 3121; GFX9-NEXT: s_waitcnt vmcnt(0) 3122; GFX9-NEXT: buffer_wbinvl1_vol 3123; GFX9-NEXT: s_setpc_b64 s[30:31] 3124 %result = atomicrmw or ptr addrspace(1) %ptr, i32 %in seq_cst 3125 ret i32 %result 3126} 3127 3128define i32 @global_atomic_or_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { 3129; SI-LABEL: global_atomic_or_i32_ret_offset: 3130; SI: ; %bb.0: 3131; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3132; SI-NEXT: s_mov_b32 s6, 0 3133; SI-NEXT: s_mov_b32 s7, 0xf000 3134; SI-NEXT: s_mov_b32 s4, s6 3135; SI-NEXT: s_mov_b32 s5, s6 3136; SI-NEXT: buffer_atomic_or v2, v[0:1], s[4:7], 0 addr64 offset:16 glc 3137; SI-NEXT: s_waitcnt vmcnt(0) 3138; SI-NEXT: buffer_wbinvl1 3139; SI-NEXT: v_mov_b32_e32 v0, v2 3140; SI-NEXT: s_waitcnt expcnt(0) 3141; SI-NEXT: s_setpc_b64 s[30:31] 3142; 
3143; VI-LABEL: global_atomic_or_i32_ret_offset: 3144; VI: ; %bb.0: 3145; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3146; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 3147; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3148; VI-NEXT: flat_atomic_or v0, v[0:1], v2 glc 3149; VI-NEXT: s_waitcnt vmcnt(0) 3150; VI-NEXT: buffer_wbinvl1_vol 3151; VI-NEXT: s_setpc_b64 s[30:31] 3152; 3153; GFX9-LABEL: global_atomic_or_i32_ret_offset: 3154; GFX9: ; %bb.0: 3155; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3156; GFX9-NEXT: global_atomic_or v0, v[0:1], v2, off offset:16 glc 3157; GFX9-NEXT: s_waitcnt vmcnt(0) 3158; GFX9-NEXT: buffer_wbinvl1_vol 3159; GFX9-NEXT: s_setpc_b64 s[30:31] 3160 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 3161 %result = atomicrmw or ptr addrspace(1) %gep, i32 %in seq_cst 3162 ret i32 %result 3163} 3164 3165define amdgpu_gfx void @global_atomic_or_i32_noret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) { 3166; SI-LABEL: global_atomic_or_i32_noret_scalar: 3167; SI: ; %bb.0: 3168; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3169; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 3170; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill 3171; SI-NEXT: s_mov_b64 exec, s[34:35] 3172; SI-NEXT: s_waitcnt expcnt(0) 3173; SI-NEXT: v_writelane_b32 v1, s6, 0 3174; SI-NEXT: v_writelane_b32 v1, s7, 1 3175; SI-NEXT: s_mov_b32 s34, s6 3176; SI-NEXT: s_mov_b32 s7, 0xf000 3177; SI-NEXT: s_mov_b32 s6, -1 3178; SI-NEXT: v_mov_b32_e32 v0, s34 3179; SI-NEXT: s_waitcnt vmcnt(0) 3180; SI-NEXT: buffer_atomic_or v0, off, s[4:7], 0 3181; SI-NEXT: s_waitcnt vmcnt(0) 3182; SI-NEXT: buffer_wbinvl1 3183; SI-NEXT: v_readlane_b32 s7, v1, 1 3184; SI-NEXT: v_readlane_b32 s6, v1, 0 3185; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 3186; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload 3187; SI-NEXT: s_mov_b64 exec, s[34:35] 3188; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 3189; SI-NEXT: s_setpc_b64 s[30:31] 3190; 3191; VI-LABEL: 
global_atomic_or_i32_noret_scalar: 3192; VI: ; %bb.0: 3193; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3194; VI-NEXT: v_mov_b32_e32 v0, s4 3195; VI-NEXT: v_mov_b32_e32 v1, s5 3196; VI-NEXT: v_mov_b32_e32 v2, s6 3197; VI-NEXT: flat_atomic_or v[0:1], v2 3198; VI-NEXT: s_waitcnt vmcnt(0) 3199; VI-NEXT: buffer_wbinvl1_vol 3200; VI-NEXT: s_setpc_b64 s[30:31] 3201; 3202; GFX9-LABEL: global_atomic_or_i32_noret_scalar: 3203; GFX9: ; %bb.0: 3204; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3205; GFX9-NEXT: v_mov_b32_e32 v0, 0 3206; GFX9-NEXT: v_mov_b32_e32 v1, s6 3207; GFX9-NEXT: global_atomic_or v0, v1, s[4:5] 3208; GFX9-NEXT: s_waitcnt vmcnt(0) 3209; GFX9-NEXT: buffer_wbinvl1_vol 3210; GFX9-NEXT: s_setpc_b64 s[30:31] 3211 %tmp0 = atomicrmw or ptr addrspace(1) %ptr, i32 %in seq_cst 3212 ret void 3213} 3214 3215define amdgpu_gfx void @global_atomic_or_i32_noret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) { 3216; SI-LABEL: global_atomic_or_i32_noret_offset_scalar: 3217; SI: ; %bb.0: 3218; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3219; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 3220; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill 3221; SI-NEXT: s_mov_b64 exec, s[34:35] 3222; SI-NEXT: s_waitcnt expcnt(0) 3223; SI-NEXT: v_writelane_b32 v1, s6, 0 3224; SI-NEXT: v_writelane_b32 v1, s7, 1 3225; SI-NEXT: s_mov_b32 s34, s6 3226; SI-NEXT: s_mov_b32 s7, 0xf000 3227; SI-NEXT: s_mov_b32 s6, -1 3228; SI-NEXT: v_mov_b32_e32 v0, s34 3229; SI-NEXT: s_waitcnt vmcnt(0) 3230; SI-NEXT: buffer_atomic_or v0, off, s[4:7], 0 offset:16 3231; SI-NEXT: s_waitcnt vmcnt(0) 3232; SI-NEXT: buffer_wbinvl1 3233; SI-NEXT: v_readlane_b32 s7, v1, 1 3234; SI-NEXT: v_readlane_b32 s6, v1, 0 3235; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 3236; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload 3237; SI-NEXT: s_mov_b64 exec, s[34:35] 3238; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 3239; SI-NEXT: s_setpc_b64 s[30:31] 3240; 3241; VI-LABEL: 
global_atomic_or_i32_noret_offset_scalar: 3242; VI: ; %bb.0: 3243; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3244; VI-NEXT: s_add_u32 s34, s4, 16 3245; VI-NEXT: s_addc_u32 s35, s5, 0 3246; VI-NEXT: v_mov_b32_e32 v0, s34 3247; VI-NEXT: v_mov_b32_e32 v1, s35 3248; VI-NEXT: v_mov_b32_e32 v2, s6 3249; VI-NEXT: flat_atomic_or v[0:1], v2 3250; VI-NEXT: s_waitcnt vmcnt(0) 3251; VI-NEXT: buffer_wbinvl1_vol 3252; VI-NEXT: s_setpc_b64 s[30:31] 3253; 3254; GFX9-LABEL: global_atomic_or_i32_noret_offset_scalar: 3255; GFX9: ; %bb.0: 3256; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3257; GFX9-NEXT: v_mov_b32_e32 v0, 0 3258; GFX9-NEXT: v_mov_b32_e32 v1, s6 3259; GFX9-NEXT: global_atomic_or v0, v1, s[4:5] offset:16 3260; GFX9-NEXT: s_waitcnt vmcnt(0) 3261; GFX9-NEXT: buffer_wbinvl1_vol 3262; GFX9-NEXT: s_setpc_b64 s[30:31] 3263 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 3264 %tmp0 = atomicrmw or ptr addrspace(1) %gep, i32 %in seq_cst 3265 ret void 3266} 3267 3268define amdgpu_gfx i32 @global_atomic_or_i32_ret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) { 3269; SI-LABEL: global_atomic_or_i32_ret_scalar: 3270; SI: ; %bb.0: 3271; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3272; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 3273; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill 3274; SI-NEXT: s_mov_b64 exec, s[34:35] 3275; SI-NEXT: s_waitcnt expcnt(0) 3276; SI-NEXT: v_writelane_b32 v1, s6, 0 3277; SI-NEXT: v_writelane_b32 v1, s7, 1 3278; SI-NEXT: s_mov_b32 s34, s6 3279; SI-NEXT: s_mov_b32 s7, 0xf000 3280; SI-NEXT: s_mov_b32 s6, -1 3281; SI-NEXT: v_mov_b32_e32 v0, s34 3282; SI-NEXT: s_waitcnt vmcnt(0) 3283; SI-NEXT: buffer_atomic_or v0, off, s[4:7], 0 glc 3284; SI-NEXT: s_waitcnt vmcnt(0) 3285; SI-NEXT: buffer_wbinvl1 3286; SI-NEXT: v_readlane_b32 s7, v1, 1 3287; SI-NEXT: v_readlane_b32 s6, v1, 0 3288; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 3289; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload 3290; 
SI-NEXT: s_mov_b64 exec, s[34:35] 3291; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 3292; SI-NEXT: s_setpc_b64 s[30:31] 3293; 3294; VI-LABEL: global_atomic_or_i32_ret_scalar: 3295; VI: ; %bb.0: 3296; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3297; VI-NEXT: v_mov_b32_e32 v0, s4 3298; VI-NEXT: v_mov_b32_e32 v1, s5 3299; VI-NEXT: v_mov_b32_e32 v2, s6 3300; VI-NEXT: flat_atomic_or v0, v[0:1], v2 glc 3301; VI-NEXT: s_waitcnt vmcnt(0) 3302; VI-NEXT: buffer_wbinvl1_vol 3303; VI-NEXT: s_setpc_b64 s[30:31] 3304; 3305; GFX9-LABEL: global_atomic_or_i32_ret_scalar: 3306; GFX9: ; %bb.0: 3307; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3308; GFX9-NEXT: v_mov_b32_e32 v0, 0 3309; GFX9-NEXT: v_mov_b32_e32 v1, s6 3310; GFX9-NEXT: global_atomic_or v0, v0, v1, s[4:5] glc 3311; GFX9-NEXT: s_waitcnt vmcnt(0) 3312; GFX9-NEXT: buffer_wbinvl1_vol 3313; GFX9-NEXT: s_setpc_b64 s[30:31] 3314 %result = atomicrmw or ptr addrspace(1) %ptr, i32 %in seq_cst 3315 ret i32 %result 3316} 3317 3318define amdgpu_gfx i32 @global_atomic_or_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) { 3319; SI-LABEL: global_atomic_or_i32_ret_offset_scalar: 3320; SI: ; %bb.0: 3321; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3322; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 3323; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill 3324; SI-NEXT: s_mov_b64 exec, s[34:35] 3325; SI-NEXT: s_waitcnt expcnt(0) 3326; SI-NEXT: v_writelane_b32 v1, s6, 0 3327; SI-NEXT: v_writelane_b32 v1, s7, 1 3328; SI-NEXT: s_mov_b32 s34, s6 3329; SI-NEXT: s_mov_b32 s7, 0xf000 3330; SI-NEXT: s_mov_b32 s6, -1 3331; SI-NEXT: v_mov_b32_e32 v0, s34 3332; SI-NEXT: s_waitcnt vmcnt(0) 3333; SI-NEXT: buffer_atomic_or v0, off, s[4:7], 0 offset:16 glc 3334; SI-NEXT: s_waitcnt vmcnt(0) 3335; SI-NEXT: buffer_wbinvl1 3336; SI-NEXT: v_readlane_b32 s7, v1, 1 3337; SI-NEXT: v_readlane_b32 s6, v1, 0 3338; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 3339; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte 
Folded Reload 3340; SI-NEXT: s_mov_b64 exec, s[34:35] 3341; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 3342; SI-NEXT: s_setpc_b64 s[30:31] 3343; 3344; VI-LABEL: global_atomic_or_i32_ret_offset_scalar: 3345; VI: ; %bb.0: 3346; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3347; VI-NEXT: s_add_u32 s34, s4, 16 3348; VI-NEXT: s_addc_u32 s35, s5, 0 3349; VI-NEXT: v_mov_b32_e32 v0, s34 3350; VI-NEXT: v_mov_b32_e32 v1, s35 3351; VI-NEXT: v_mov_b32_e32 v2, s6 3352; VI-NEXT: flat_atomic_or v0, v[0:1], v2 glc 3353; VI-NEXT: s_waitcnt vmcnt(0) 3354; VI-NEXT: buffer_wbinvl1_vol 3355; VI-NEXT: s_setpc_b64 s[30:31] 3356; 3357; GFX9-LABEL: global_atomic_or_i32_ret_offset_scalar: 3358; GFX9: ; %bb.0: 3359; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3360; GFX9-NEXT: v_mov_b32_e32 v0, 0 3361; GFX9-NEXT: v_mov_b32_e32 v1, s6 3362; GFX9-NEXT: global_atomic_or v0, v0, v1, s[4:5] offset:16 glc 3363; GFX9-NEXT: s_waitcnt vmcnt(0) 3364; GFX9-NEXT: buffer_wbinvl1_vol 3365; GFX9-NEXT: s_setpc_b64 s[30:31] 3366 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 3367 %result = atomicrmw or ptr addrspace(1) %gep, i32 %in seq_cst 3368 ret i32 %result 3369} 3370 3371define i32 @global_atomic_or_0_i32_ret(ptr addrspace(1) %ptr) { 3372; SI-LABEL: global_atomic_or_0_i32_ret: 3373; SI: ; %bb.0: 3374; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3375; SI-NEXT: s_mov_b32 s7, 0xf000 3376; SI-NEXT: s_mov_b32 s6, 0 3377; SI-NEXT: v_mov_b32_e32 v2, 0 3378; SI-NEXT: s_mov_b32 s4, s6 3379; SI-NEXT: s_mov_b32 s5, s6 3380; SI-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 addr64 glc 3381; SI-NEXT: s_waitcnt vmcnt(0) 3382; SI-NEXT: buffer_wbinvl1 3383; SI-NEXT: v_mov_b32_e32 v0, v2 3384; SI-NEXT: s_waitcnt expcnt(0) 3385; SI-NEXT: s_setpc_b64 s[30:31] 3386; 3387; VI-LABEL: global_atomic_or_0_i32_ret: 3388; VI: ; %bb.0: 3389; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3390; VI-NEXT: v_mov_b32_e32 v2, 0 3391; VI-NEXT: flat_atomic_add v0, v[0:1], v2 glc 3392; VI-NEXT: s_waitcnt vmcnt(0) 3393; 
VI-NEXT: buffer_wbinvl1_vol 3394; VI-NEXT: s_setpc_b64 s[30:31] 3395; 3396; GFX9-LABEL: global_atomic_or_0_i32_ret: 3397; GFX9: ; %bb.0: 3398; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3399; GFX9-NEXT: v_mov_b32_e32 v2, 0 3400; GFX9-NEXT: global_atomic_add v0, v[0:1], v2, off glc 3401; GFX9-NEXT: s_waitcnt vmcnt(0) 3402; GFX9-NEXT: buffer_wbinvl1_vol 3403; GFX9-NEXT: s_setpc_b64 s[30:31] 3404 %result = atomicrmw or ptr addrspace(1) %ptr, i32 0 seq_cst 3405 ret i32 %result 3406} 3407 3408define void @global_atomic_or_i32_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) { 3409; SI-LABEL: global_atomic_or_i32_noret_offset__amdgpu_no_remote_memory: 3410; SI: ; %bb.0: 3411; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3412; SI-NEXT: s_mov_b32 s6, 0 3413; SI-NEXT: s_mov_b32 s7, 0xf000 3414; SI-NEXT: s_mov_b32 s4, s6 3415; SI-NEXT: s_mov_b32 s5, s6 3416; SI-NEXT: buffer_atomic_or v2, v[0:1], s[4:7], 0 addr64 offset:16 3417; SI-NEXT: s_waitcnt vmcnt(0) 3418; SI-NEXT: buffer_wbinvl1 3419; SI-NEXT: s_waitcnt expcnt(0) 3420; SI-NEXT: s_setpc_b64 s[30:31] 3421; 3422; VI-LABEL: global_atomic_or_i32_noret_offset__amdgpu_no_remote_memory: 3423; VI: ; %bb.0: 3424; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3425; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 3426; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3427; VI-NEXT: flat_atomic_or v[0:1], v2 3428; VI-NEXT: s_waitcnt vmcnt(0) 3429; VI-NEXT: buffer_wbinvl1_vol 3430; VI-NEXT: s_setpc_b64 s[30:31] 3431; 3432; GFX9-LABEL: global_atomic_or_i32_noret_offset__amdgpu_no_remote_memory: 3433; GFX9: ; %bb.0: 3434; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3435; GFX9-NEXT: global_atomic_or v[0:1], v2, off offset:16 3436; GFX9-NEXT: s_waitcnt vmcnt(0) 3437; GFX9-NEXT: buffer_wbinvl1_vol 3438; GFX9-NEXT: s_setpc_b64 s[30:31] 3439 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 3440 %tmp0 = atomicrmw or ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 3441 ret void 3442} 
3443 3444define i32 @global_atomic_or_i32_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) { 3445; SI-LABEL: global_atomic_or_i32_ret_offset__amdgpu_no_remote_memory: 3446; SI: ; %bb.0: 3447; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3448; SI-NEXT: s_mov_b32 s6, 0 3449; SI-NEXT: s_mov_b32 s7, 0xf000 3450; SI-NEXT: s_mov_b32 s4, s6 3451; SI-NEXT: s_mov_b32 s5, s6 3452; SI-NEXT: buffer_atomic_or v2, v[0:1], s[4:7], 0 addr64 offset:16 glc 3453; SI-NEXT: s_waitcnt vmcnt(0) 3454; SI-NEXT: buffer_wbinvl1 3455; SI-NEXT: v_mov_b32_e32 v0, v2 3456; SI-NEXT: s_waitcnt expcnt(0) 3457; SI-NEXT: s_setpc_b64 s[30:31] 3458; 3459; VI-LABEL: global_atomic_or_i32_ret_offset__amdgpu_no_remote_memory: 3460; VI: ; %bb.0: 3461; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3462; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 3463; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3464; VI-NEXT: flat_atomic_or v0, v[0:1], v2 glc 3465; VI-NEXT: s_waitcnt vmcnt(0) 3466; VI-NEXT: buffer_wbinvl1_vol 3467; VI-NEXT: s_setpc_b64 s[30:31] 3468; 3469; GFX9-LABEL: global_atomic_or_i32_ret_offset__amdgpu_no_remote_memory: 3470; GFX9: ; %bb.0: 3471; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3472; GFX9-NEXT: global_atomic_or v0, v[0:1], v2, off offset:16 glc 3473; GFX9-NEXT: s_waitcnt vmcnt(0) 3474; GFX9-NEXT: buffer_wbinvl1_vol 3475; GFX9-NEXT: s_setpc_b64 s[30:31] 3476 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 3477 %result = atomicrmw or ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 3478 ret i32 %result 3479} 3480 3481; --------------------------------------------------------------------- 3482; atomicrmw xor 3483; --------------------------------------------------------------------- 3484 3485define void @global_atomic_xor_i32_noret(ptr addrspace(1) %ptr, i32 %in) { 3486; SI-LABEL: global_atomic_xor_i32_noret: 3487; SI: ; %bb.0: 3488; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3489; SI-NEXT: s_mov_b32 s6, 0 3490; SI-NEXT: s_mov_b32 s7, 
0xf000 3491; SI-NEXT: s_mov_b32 s4, s6 3492; SI-NEXT: s_mov_b32 s5, s6 3493; SI-NEXT: buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64 3494; SI-NEXT: s_waitcnt vmcnt(0) 3495; SI-NEXT: buffer_wbinvl1 3496; SI-NEXT: s_waitcnt expcnt(0) 3497; SI-NEXT: s_setpc_b64 s[30:31] 3498; 3499; VI-LABEL: global_atomic_xor_i32_noret: 3500; VI: ; %bb.0: 3501; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3502; VI-NEXT: flat_atomic_xor v[0:1], v2 3503; VI-NEXT: s_waitcnt vmcnt(0) 3504; VI-NEXT: buffer_wbinvl1_vol 3505; VI-NEXT: s_setpc_b64 s[30:31] 3506; 3507; GFX9-LABEL: global_atomic_xor_i32_noret: 3508; GFX9: ; %bb.0: 3509; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3510; GFX9-NEXT: global_atomic_xor v[0:1], v2, off 3511; GFX9-NEXT: s_waitcnt vmcnt(0) 3512; GFX9-NEXT: buffer_wbinvl1_vol 3513; GFX9-NEXT: s_setpc_b64 s[30:31] 3514 %tmp0 = atomicrmw xor ptr addrspace(1) %ptr, i32 %in seq_cst 3515 ret void 3516} 3517 3518define void @global_atomic_xor_i32_noret_offset(ptr addrspace(1) %out, i32 %in) { 3519; SI-LABEL: global_atomic_xor_i32_noret_offset: 3520; SI: ; %bb.0: 3521; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3522; SI-NEXT: s_mov_b32 s6, 0 3523; SI-NEXT: s_mov_b32 s7, 0xf000 3524; SI-NEXT: s_mov_b32 s4, s6 3525; SI-NEXT: s_mov_b32 s5, s6 3526; SI-NEXT: buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64 offset:16 3527; SI-NEXT: s_waitcnt vmcnt(0) 3528; SI-NEXT: buffer_wbinvl1 3529; SI-NEXT: s_waitcnt expcnt(0) 3530; SI-NEXT: s_setpc_b64 s[30:31] 3531; 3532; VI-LABEL: global_atomic_xor_i32_noret_offset: 3533; VI: ; %bb.0: 3534; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3535; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 3536; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3537; VI-NEXT: flat_atomic_xor v[0:1], v2 3538; VI-NEXT: s_waitcnt vmcnt(0) 3539; VI-NEXT: buffer_wbinvl1_vol 3540; VI-NEXT: s_setpc_b64 s[30:31] 3541; 3542; GFX9-LABEL: global_atomic_xor_i32_noret_offset: 3543; GFX9: ; %bb.0: 3544; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3545; GFX9-NEXT: 
global_atomic_xor v[0:1], v2, off offset:16 3546; GFX9-NEXT: s_waitcnt vmcnt(0) 3547; GFX9-NEXT: buffer_wbinvl1_vol 3548; GFX9-NEXT: s_setpc_b64 s[30:31] 3549 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 3550 %tmp0 = atomicrmw xor ptr addrspace(1) %gep, i32 %in seq_cst 3551 ret void 3552} 3553 3554define i32 @global_atomic_xor_i32_ret(ptr addrspace(1) %ptr, i32 %in) { 3555; SI-LABEL: global_atomic_xor_i32_ret: 3556; SI: ; %bb.0: 3557; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3558; SI-NEXT: s_mov_b32 s6, 0 3559; SI-NEXT: s_mov_b32 s7, 0xf000 3560; SI-NEXT: s_mov_b32 s4, s6 3561; SI-NEXT: s_mov_b32 s5, s6 3562; SI-NEXT: buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64 glc 3563; SI-NEXT: s_waitcnt vmcnt(0) 3564; SI-NEXT: buffer_wbinvl1 3565; SI-NEXT: v_mov_b32_e32 v0, v2 3566; SI-NEXT: s_waitcnt expcnt(0) 3567; SI-NEXT: s_setpc_b64 s[30:31] 3568; 3569; VI-LABEL: global_atomic_xor_i32_ret: 3570; VI: ; %bb.0: 3571; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3572; VI-NEXT: flat_atomic_xor v0, v[0:1], v2 glc 3573; VI-NEXT: s_waitcnt vmcnt(0) 3574; VI-NEXT: buffer_wbinvl1_vol 3575; VI-NEXT: s_setpc_b64 s[30:31] 3576; 3577; GFX9-LABEL: global_atomic_xor_i32_ret: 3578; GFX9: ; %bb.0: 3579; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3580; GFX9-NEXT: global_atomic_xor v0, v[0:1], v2, off glc 3581; GFX9-NEXT: s_waitcnt vmcnt(0) 3582; GFX9-NEXT: buffer_wbinvl1_vol 3583; GFX9-NEXT: s_setpc_b64 s[30:31] 3584 %result = atomicrmw xor ptr addrspace(1) %ptr, i32 %in seq_cst 3585 ret i32 %result 3586} 3587 3588define i32 @global_atomic_xor_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { 3589; SI-LABEL: global_atomic_xor_i32_ret_offset: 3590; SI: ; %bb.0: 3591; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3592; SI-NEXT: s_mov_b32 s6, 0 3593; SI-NEXT: s_mov_b32 s7, 0xf000 3594; SI-NEXT: s_mov_b32 s4, s6 3595; SI-NEXT: s_mov_b32 s5, s6 3596; SI-NEXT: buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64 offset:16 glc 3597; SI-NEXT: s_waitcnt vmcnt(0) 3598; 
SI-NEXT: buffer_wbinvl1 3599; SI-NEXT: v_mov_b32_e32 v0, v2 3600; SI-NEXT: s_waitcnt expcnt(0) 3601; SI-NEXT: s_setpc_b64 s[30:31] 3602; 3603; VI-LABEL: global_atomic_xor_i32_ret_offset: 3604; VI: ; %bb.0: 3605; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3606; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 3607; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3608; VI-NEXT: flat_atomic_xor v0, v[0:1], v2 glc 3609; VI-NEXT: s_waitcnt vmcnt(0) 3610; VI-NEXT: buffer_wbinvl1_vol 3611; VI-NEXT: s_setpc_b64 s[30:31] 3612; 3613; GFX9-LABEL: global_atomic_xor_i32_ret_offset: 3614; GFX9: ; %bb.0: 3615; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3616; GFX9-NEXT: global_atomic_xor v0, v[0:1], v2, off offset:16 glc 3617; GFX9-NEXT: s_waitcnt vmcnt(0) 3618; GFX9-NEXT: buffer_wbinvl1_vol 3619; GFX9-NEXT: s_setpc_b64 s[30:31] 3620 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 3621 %result = atomicrmw xor ptr addrspace(1) %gep, i32 %in seq_cst 3622 ret i32 %result 3623} 3624 3625define amdgpu_gfx void @global_atomic_xor_i32_noret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) { 3626; SI-LABEL: global_atomic_xor_i32_noret_scalar: 3627; SI: ; %bb.0: 3628; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3629; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 3630; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill 3631; SI-NEXT: s_mov_b64 exec, s[34:35] 3632; SI-NEXT: s_waitcnt expcnt(0) 3633; SI-NEXT: v_writelane_b32 v1, s6, 0 3634; SI-NEXT: v_writelane_b32 v1, s7, 1 3635; SI-NEXT: s_mov_b32 s34, s6 3636; SI-NEXT: s_mov_b32 s7, 0xf000 3637; SI-NEXT: s_mov_b32 s6, -1 3638; SI-NEXT: v_mov_b32_e32 v0, s34 3639; SI-NEXT: s_waitcnt vmcnt(0) 3640; SI-NEXT: buffer_atomic_xor v0, off, s[4:7], 0 3641; SI-NEXT: s_waitcnt vmcnt(0) 3642; SI-NEXT: buffer_wbinvl1 3643; SI-NEXT: v_readlane_b32 s7, v1, 1 3644; SI-NEXT: v_readlane_b32 s6, v1, 0 3645; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 3646; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload 3647; 
SI-NEXT: s_mov_b64 exec, s[34:35] 3648; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 3649; SI-NEXT: s_setpc_b64 s[30:31] 3650; 3651; VI-LABEL: global_atomic_xor_i32_noret_scalar: 3652; VI: ; %bb.0: 3653; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3654; VI-NEXT: v_mov_b32_e32 v0, s4 3655; VI-NEXT: v_mov_b32_e32 v1, s5 3656; VI-NEXT: v_mov_b32_e32 v2, s6 3657; VI-NEXT: flat_atomic_xor v[0:1], v2 3658; VI-NEXT: s_waitcnt vmcnt(0) 3659; VI-NEXT: buffer_wbinvl1_vol 3660; VI-NEXT: s_setpc_b64 s[30:31] 3661; 3662; GFX9-LABEL: global_atomic_xor_i32_noret_scalar: 3663; GFX9: ; %bb.0: 3664; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3665; GFX9-NEXT: v_mov_b32_e32 v0, 0 3666; GFX9-NEXT: v_mov_b32_e32 v1, s6 3667; GFX9-NEXT: global_atomic_xor v0, v1, s[4:5] 3668; GFX9-NEXT: s_waitcnt vmcnt(0) 3669; GFX9-NEXT: buffer_wbinvl1_vol 3670; GFX9-NEXT: s_setpc_b64 s[30:31] 3671 %tmp0 = atomicrmw xor ptr addrspace(1) %ptr, i32 %in seq_cst 3672 ret void 3673} 3674 3675define amdgpu_gfx void @global_atomic_xor_i32_noret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) { 3676; SI-LABEL: global_atomic_xor_i32_noret_offset_scalar: 3677; SI: ; %bb.0: 3678; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3679; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 3680; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill 3681; SI-NEXT: s_mov_b64 exec, s[34:35] 3682; SI-NEXT: s_waitcnt expcnt(0) 3683; SI-NEXT: v_writelane_b32 v1, s6, 0 3684; SI-NEXT: v_writelane_b32 v1, s7, 1 3685; SI-NEXT: s_mov_b32 s34, s6 3686; SI-NEXT: s_mov_b32 s7, 0xf000 3687; SI-NEXT: s_mov_b32 s6, -1 3688; SI-NEXT: v_mov_b32_e32 v0, s34 3689; SI-NEXT: s_waitcnt vmcnt(0) 3690; SI-NEXT: buffer_atomic_xor v0, off, s[4:7], 0 offset:16 3691; SI-NEXT: s_waitcnt vmcnt(0) 3692; SI-NEXT: buffer_wbinvl1 3693; SI-NEXT: v_readlane_b32 s7, v1, 1 3694; SI-NEXT: v_readlane_b32 s6, v1, 0 3695; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 3696; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload 
3697; SI-NEXT: s_mov_b64 exec, s[34:35] 3698; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 3699; SI-NEXT: s_setpc_b64 s[30:31] 3700; 3701; VI-LABEL: global_atomic_xor_i32_noret_offset_scalar: 3702; VI: ; %bb.0: 3703; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3704; VI-NEXT: s_add_u32 s34, s4, 16 3705; VI-NEXT: s_addc_u32 s35, s5, 0 3706; VI-NEXT: v_mov_b32_e32 v0, s34 3707; VI-NEXT: v_mov_b32_e32 v1, s35 3708; VI-NEXT: v_mov_b32_e32 v2, s6 3709; VI-NEXT: flat_atomic_xor v[0:1], v2 3710; VI-NEXT: s_waitcnt vmcnt(0) 3711; VI-NEXT: buffer_wbinvl1_vol 3712; VI-NEXT: s_setpc_b64 s[30:31] 3713; 3714; GFX9-LABEL: global_atomic_xor_i32_noret_offset_scalar: 3715; GFX9: ; %bb.0: 3716; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3717; GFX9-NEXT: v_mov_b32_e32 v0, 0 3718; GFX9-NEXT: v_mov_b32_e32 v1, s6 3719; GFX9-NEXT: global_atomic_xor v0, v1, s[4:5] offset:16 3720; GFX9-NEXT: s_waitcnt vmcnt(0) 3721; GFX9-NEXT: buffer_wbinvl1_vol 3722; GFX9-NEXT: s_setpc_b64 s[30:31] 3723 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 3724 %tmp0 = atomicrmw xor ptr addrspace(1) %gep, i32 %in seq_cst 3725 ret void 3726} 3727 3728define amdgpu_gfx i32 @global_atomic_xor_i32_ret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) { 3729; SI-LABEL: global_atomic_xor_i32_ret_scalar: 3730; SI: ; %bb.0: 3731; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3732; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 3733; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill 3734; SI-NEXT: s_mov_b64 exec, s[34:35] 3735; SI-NEXT: s_waitcnt expcnt(0) 3736; SI-NEXT: v_writelane_b32 v1, s6, 0 3737; SI-NEXT: v_writelane_b32 v1, s7, 1 3738; SI-NEXT: s_mov_b32 s34, s6 3739; SI-NEXT: s_mov_b32 s7, 0xf000 3740; SI-NEXT: s_mov_b32 s6, -1 3741; SI-NEXT: v_mov_b32_e32 v0, s34 3742; SI-NEXT: s_waitcnt vmcnt(0) 3743; SI-NEXT: buffer_atomic_xor v0, off, s[4:7], 0 glc 3744; SI-NEXT: s_waitcnt vmcnt(0) 3745; SI-NEXT: buffer_wbinvl1 3746; SI-NEXT: v_readlane_b32 s7, v1, 1 3747; SI-NEXT: 
v_readlane_b32 s6, v1, 0 3748; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 3749; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload 3750; SI-NEXT: s_mov_b64 exec, s[34:35] 3751; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 3752; SI-NEXT: s_setpc_b64 s[30:31] 3753; 3754; VI-LABEL: global_atomic_xor_i32_ret_scalar: 3755; VI: ; %bb.0: 3756; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3757; VI-NEXT: v_mov_b32_e32 v0, s4 3758; VI-NEXT: v_mov_b32_e32 v1, s5 3759; VI-NEXT: v_mov_b32_e32 v2, s6 3760; VI-NEXT: flat_atomic_xor v0, v[0:1], v2 glc 3761; VI-NEXT: s_waitcnt vmcnt(0) 3762; VI-NEXT: buffer_wbinvl1_vol 3763; VI-NEXT: s_setpc_b64 s[30:31] 3764; 3765; GFX9-LABEL: global_atomic_xor_i32_ret_scalar: 3766; GFX9: ; %bb.0: 3767; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3768; GFX9-NEXT: v_mov_b32_e32 v0, 0 3769; GFX9-NEXT: v_mov_b32_e32 v1, s6 3770; GFX9-NEXT: global_atomic_xor v0, v0, v1, s[4:5] glc 3771; GFX9-NEXT: s_waitcnt vmcnt(0) 3772; GFX9-NEXT: buffer_wbinvl1_vol 3773; GFX9-NEXT: s_setpc_b64 s[30:31] 3774 %result = atomicrmw xor ptr addrspace(1) %ptr, i32 %in seq_cst 3775 ret i32 %result 3776} 3777 3778define amdgpu_gfx i32 @global_atomic_xor_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) { 3779; SI-LABEL: global_atomic_xor_i32_ret_offset_scalar: 3780; SI: ; %bb.0: 3781; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3782; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 3783; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill 3784; SI-NEXT: s_mov_b64 exec, s[34:35] 3785; SI-NEXT: s_waitcnt expcnt(0) 3786; SI-NEXT: v_writelane_b32 v1, s6, 0 3787; SI-NEXT: v_writelane_b32 v1, s7, 1 3788; SI-NEXT: s_mov_b32 s34, s6 3789; SI-NEXT: s_mov_b32 s7, 0xf000 3790; SI-NEXT: s_mov_b32 s6, -1 3791; SI-NEXT: v_mov_b32_e32 v0, s34 3792; SI-NEXT: s_waitcnt vmcnt(0) 3793; SI-NEXT: buffer_atomic_xor v0, off, s[4:7], 0 offset:16 glc 3794; SI-NEXT: s_waitcnt vmcnt(0) 3795; SI-NEXT: buffer_wbinvl1 3796; SI-NEXT: v_readlane_b32 
s7, v1, 1 3797; SI-NEXT: v_readlane_b32 s6, v1, 0 3798; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 3799; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload 3800; SI-NEXT: s_mov_b64 exec, s[34:35] 3801; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 3802; SI-NEXT: s_setpc_b64 s[30:31] 3803; 3804; VI-LABEL: global_atomic_xor_i32_ret_offset_scalar: 3805; VI: ; %bb.0: 3806; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3807; VI-NEXT: s_add_u32 s34, s4, 16 3808; VI-NEXT: s_addc_u32 s35, s5, 0 3809; VI-NEXT: v_mov_b32_e32 v0, s34 3810; VI-NEXT: v_mov_b32_e32 v1, s35 3811; VI-NEXT: v_mov_b32_e32 v2, s6 3812; VI-NEXT: flat_atomic_xor v0, v[0:1], v2 glc 3813; VI-NEXT: s_waitcnt vmcnt(0) 3814; VI-NEXT: buffer_wbinvl1_vol 3815; VI-NEXT: s_setpc_b64 s[30:31] 3816; 3817; GFX9-LABEL: global_atomic_xor_i32_ret_offset_scalar: 3818; GFX9: ; %bb.0: 3819; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3820; GFX9-NEXT: v_mov_b32_e32 v0, 0 3821; GFX9-NEXT: v_mov_b32_e32 v1, s6 3822; GFX9-NEXT: global_atomic_xor v0, v0, v1, s[4:5] offset:16 glc 3823; GFX9-NEXT: s_waitcnt vmcnt(0) 3824; GFX9-NEXT: buffer_wbinvl1_vol 3825; GFX9-NEXT: s_setpc_b64 s[30:31] 3826 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 3827 %result = atomicrmw xor ptr addrspace(1) %gep, i32 %in seq_cst 3828 ret i32 %result 3829} 3830 3831define i32 @global_atomic_xor_0_i32_ret(ptr addrspace(1) %ptr) { 3832; SI-LABEL: global_atomic_xor_0_i32_ret: 3833; SI: ; %bb.0: 3834; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3835; SI-NEXT: s_mov_b32 s7, 0xf000 3836; SI-NEXT: s_mov_b32 s6, 0 3837; SI-NEXT: v_mov_b32_e32 v2, 0 3838; SI-NEXT: s_mov_b32 s4, s6 3839; SI-NEXT: s_mov_b32 s5, s6 3840; SI-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 addr64 glc 3841; SI-NEXT: s_waitcnt vmcnt(0) 3842; SI-NEXT: buffer_wbinvl1 3843; SI-NEXT: v_mov_b32_e32 v0, v2 3844; SI-NEXT: s_waitcnt expcnt(0) 3845; SI-NEXT: s_setpc_b64 s[30:31] 3846; 3847; VI-LABEL: global_atomic_xor_0_i32_ret: 3848; VI: ; %bb.0: 3849; 
VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3850; VI-NEXT: v_mov_b32_e32 v2, 0 3851; VI-NEXT: flat_atomic_add v0, v[0:1], v2 glc 3852; VI-NEXT: s_waitcnt vmcnt(0) 3853; VI-NEXT: buffer_wbinvl1_vol 3854; VI-NEXT: s_setpc_b64 s[30:31] 3855; 3856; GFX9-LABEL: global_atomic_xor_0_i32_ret: 3857; GFX9: ; %bb.0: 3858; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3859; GFX9-NEXT: v_mov_b32_e32 v2, 0 3860; GFX9-NEXT: global_atomic_add v0, v[0:1], v2, off glc 3861; GFX9-NEXT: s_waitcnt vmcnt(0) 3862; GFX9-NEXT: buffer_wbinvl1_vol 3863; GFX9-NEXT: s_setpc_b64 s[30:31] 3864 %result = atomicrmw xor ptr addrspace(1) %ptr, i32 0 seq_cst 3865 ret i32 %result 3866} 3867 3868define void @global_atomic_xor_i32_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) { 3869; SI-LABEL: global_atomic_xor_i32_noret_offset__amdgpu_no_remote_memory: 3870; SI: ; %bb.0: 3871; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3872; SI-NEXT: s_mov_b32 s6, 0 3873; SI-NEXT: s_mov_b32 s7, 0xf000 3874; SI-NEXT: s_mov_b32 s4, s6 3875; SI-NEXT: s_mov_b32 s5, s6 3876; SI-NEXT: buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64 offset:16 3877; SI-NEXT: s_waitcnt vmcnt(0) 3878; SI-NEXT: buffer_wbinvl1 3879; SI-NEXT: s_waitcnt expcnt(0) 3880; SI-NEXT: s_setpc_b64 s[30:31] 3881; 3882; VI-LABEL: global_atomic_xor_i32_noret_offset__amdgpu_no_remote_memory: 3883; VI: ; %bb.0: 3884; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3885; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 3886; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3887; VI-NEXT: flat_atomic_xor v[0:1], v2 3888; VI-NEXT: s_waitcnt vmcnt(0) 3889; VI-NEXT: buffer_wbinvl1_vol 3890; VI-NEXT: s_setpc_b64 s[30:31] 3891; 3892; GFX9-LABEL: global_atomic_xor_i32_noret_offset__amdgpu_no_remote_memory: 3893; GFX9: ; %bb.0: 3894; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3895; GFX9-NEXT: global_atomic_xor v[0:1], v2, off offset:16 3896; GFX9-NEXT: s_waitcnt vmcnt(0) 3897; GFX9-NEXT: buffer_wbinvl1_vol 3898; GFX9-NEXT: s_setpc_b64 
s[30:31] 3899 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 3900 %tmp0 = atomicrmw xor ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 3901 ret void 3902} 3903 3904define i32 @global_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) { 3905; SI-LABEL: global_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory: 3906; SI: ; %bb.0: 3907; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3908; SI-NEXT: s_mov_b32 s6, 0 3909; SI-NEXT: s_mov_b32 s7, 0xf000 3910; SI-NEXT: s_mov_b32 s4, s6 3911; SI-NEXT: s_mov_b32 s5, s6 3912; SI-NEXT: buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64 offset:16 glc 3913; SI-NEXT: s_waitcnt vmcnt(0) 3914; SI-NEXT: buffer_wbinvl1 3915; SI-NEXT: v_mov_b32_e32 v0, v2 3916; SI-NEXT: s_waitcnt expcnt(0) 3917; SI-NEXT: s_setpc_b64 s[30:31] 3918; 3919; VI-LABEL: global_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory: 3920; VI: ; %bb.0: 3921; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3922; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 3923; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3924; VI-NEXT: flat_atomic_xor v0, v[0:1], v2 glc 3925; VI-NEXT: s_waitcnt vmcnt(0) 3926; VI-NEXT: buffer_wbinvl1_vol 3927; VI-NEXT: s_setpc_b64 s[30:31] 3928; 3929; GFX9-LABEL: global_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory: 3930; GFX9: ; %bb.0: 3931; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3932; GFX9-NEXT: global_atomic_xor v0, v[0:1], v2, off offset:16 glc 3933; GFX9-NEXT: s_waitcnt vmcnt(0) 3934; GFX9-NEXT: buffer_wbinvl1_vol 3935; GFX9-NEXT: s_setpc_b64 s[30:31] 3936 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 3937 %result = atomicrmw xor ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 3938 ret i32 %result 3939} 3940 3941; --------------------------------------------------------------------- 3942; atomicrmw max 3943; --------------------------------------------------------------------- 3944 3945define void @global_atomic_max_i32_noret(ptr addrspace(1) 
%ptr, i32 %in) { 3946; SI-LABEL: global_atomic_max_i32_noret: 3947; SI: ; %bb.0: 3948; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3949; SI-NEXT: s_mov_b32 s6, 0 3950; SI-NEXT: s_mov_b32 s7, 0xf000 3951; SI-NEXT: s_mov_b32 s4, s6 3952; SI-NEXT: s_mov_b32 s5, s6 3953; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 3954; SI-NEXT: s_mov_b64 s[8:9], 0 3955; SI-NEXT: .LBB83_1: ; %atomicrmw.start 3956; SI-NEXT: ; =>This Inner Loop Header: Depth=1 3957; SI-NEXT: s_waitcnt vmcnt(0) 3958; SI-NEXT: v_max_i32_e32 v3, v4, v2 3959; SI-NEXT: s_waitcnt expcnt(0) 3960; SI-NEXT: v_mov_b32_e32 v6, v4 3961; SI-NEXT: v_mov_b32_e32 v5, v3 3962; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc 3963; SI-NEXT: s_waitcnt vmcnt(0) 3964; SI-NEXT: buffer_wbinvl1 3965; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 3966; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 3967; SI-NEXT: v_mov_b32_e32 v4, v5 3968; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] 3969; SI-NEXT: s_cbranch_execnz .LBB83_1 3970; SI-NEXT: ; %bb.2: ; %atomicrmw.end 3971; SI-NEXT: s_or_b64 exec, exec, s[8:9] 3972; SI-NEXT: s_waitcnt expcnt(0) 3973; SI-NEXT: s_setpc_b64 s[30:31] 3974; 3975; VI-LABEL: global_atomic_max_i32_noret: 3976; VI: ; %bb.0: 3977; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3978; VI-NEXT: flat_load_dword v4, v[0:1] 3979; VI-NEXT: s_mov_b64 s[4:5], 0 3980; VI-NEXT: .LBB83_1: ; %atomicrmw.start 3981; VI-NEXT: ; =>This Inner Loop Header: Depth=1 3982; VI-NEXT: s_waitcnt vmcnt(0) 3983; VI-NEXT: v_max_i32_e32 v3, v4, v2 3984; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 3985; VI-NEXT: s_waitcnt vmcnt(0) 3986; VI-NEXT: buffer_wbinvl1_vol 3987; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 3988; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 3989; VI-NEXT: v_mov_b32_e32 v4, v3 3990; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] 3991; VI-NEXT: s_cbranch_execnz .LBB83_1 3992; VI-NEXT: ; %bb.2: ; %atomicrmw.end 3993; VI-NEXT: s_or_b64 exec, exec, s[4:5] 3994; VI-NEXT: s_setpc_b64 s[30:31] 3995; 3996; GFX9-LABEL: 
global_atomic_max_i32_noret: 3997; GFX9: ; %bb.0: 3998; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3999; GFX9-NEXT: global_load_dword v4, v[0:1], off 4000; GFX9-NEXT: s_mov_b64 s[4:5], 0 4001; GFX9-NEXT: .LBB83_1: ; %atomicrmw.start 4002; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 4003; GFX9-NEXT: s_waitcnt vmcnt(0) 4004; GFX9-NEXT: v_max_i32_e32 v3, v4, v2 4005; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc 4006; GFX9-NEXT: s_waitcnt vmcnt(0) 4007; GFX9-NEXT: buffer_wbinvl1_vol 4008; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 4009; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 4010; GFX9-NEXT: v_mov_b32_e32 v4, v3 4011; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 4012; GFX9-NEXT: s_cbranch_execnz .LBB83_1 4013; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 4014; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 4015; GFX9-NEXT: s_setpc_b64 s[30:31] 4016 %tmp0 = atomicrmw max ptr addrspace(1) %ptr, i32 %in seq_cst 4017 ret void 4018} 4019 4020define void @global_atomic_max_i32_noret_offset(ptr addrspace(1) %out, i32 %in) { 4021; SI-LABEL: global_atomic_max_i32_noret_offset: 4022; SI: ; %bb.0: 4023; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4024; SI-NEXT: s_mov_b32 s6, 0 4025; SI-NEXT: s_mov_b32 s7, 0xf000 4026; SI-NEXT: s_mov_b32 s4, s6 4027; SI-NEXT: s_mov_b32 s5, s6 4028; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 4029; SI-NEXT: s_mov_b64 s[8:9], 0 4030; SI-NEXT: .LBB84_1: ; %atomicrmw.start 4031; SI-NEXT: ; =>This Inner Loop Header: Depth=1 4032; SI-NEXT: s_waitcnt vmcnt(0) 4033; SI-NEXT: v_max_i32_e32 v3, v4, v2 4034; SI-NEXT: s_waitcnt expcnt(0) 4035; SI-NEXT: v_mov_b32_e32 v6, v4 4036; SI-NEXT: v_mov_b32_e32 v5, v3 4037; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc 4038; SI-NEXT: s_waitcnt vmcnt(0) 4039; SI-NEXT: buffer_wbinvl1 4040; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 4041; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 4042; SI-NEXT: v_mov_b32_e32 v4, v5 4043; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] 4044; 
SI-NEXT: s_cbranch_execnz .LBB84_1 4045; SI-NEXT: ; %bb.2: ; %atomicrmw.end 4046; SI-NEXT: s_or_b64 exec, exec, s[8:9] 4047; SI-NEXT: s_waitcnt expcnt(0) 4048; SI-NEXT: s_setpc_b64 s[30:31] 4049; 4050; VI-LABEL: global_atomic_max_i32_noret_offset: 4051; VI: ; %bb.0: 4052; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4053; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 4054; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 4055; VI-NEXT: flat_load_dword v4, v[0:1] 4056; VI-NEXT: s_mov_b64 s[4:5], 0 4057; VI-NEXT: .LBB84_1: ; %atomicrmw.start 4058; VI-NEXT: ; =>This Inner Loop Header: Depth=1 4059; VI-NEXT: s_waitcnt vmcnt(0) 4060; VI-NEXT: v_max_i32_e32 v3, v4, v2 4061; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 4062; VI-NEXT: s_waitcnt vmcnt(0) 4063; VI-NEXT: buffer_wbinvl1_vol 4064; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 4065; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 4066; VI-NEXT: v_mov_b32_e32 v4, v3 4067; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] 4068; VI-NEXT: s_cbranch_execnz .LBB84_1 4069; VI-NEXT: ; %bb.2: ; %atomicrmw.end 4070; VI-NEXT: s_or_b64 exec, exec, s[4:5] 4071; VI-NEXT: s_setpc_b64 s[30:31] 4072; 4073; GFX9-LABEL: global_atomic_max_i32_noret_offset: 4074; GFX9: ; %bb.0: 4075; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4076; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 4077; GFX9-NEXT: s_mov_b64 s[4:5], 0 4078; GFX9-NEXT: .LBB84_1: ; %atomicrmw.start 4079; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 4080; GFX9-NEXT: s_waitcnt vmcnt(0) 4081; GFX9-NEXT: v_max_i32_e32 v3, v4, v2 4082; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc 4083; GFX9-NEXT: s_waitcnt vmcnt(0) 4084; GFX9-NEXT: buffer_wbinvl1_vol 4085; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 4086; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 4087; GFX9-NEXT: v_mov_b32_e32 v4, v3 4088; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 4089; GFX9-NEXT: s_cbranch_execnz .LBB84_1 4090; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 4091; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 4092; 
GFX9-NEXT: s_setpc_b64 s[30:31] 4093 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 4094 %tmp0 = atomicrmw max ptr addrspace(1) %gep, i32 %in seq_cst 4095 ret void 4096} 4097 4098define i32 @global_atomic_max_i32_ret(ptr addrspace(1) %ptr, i32 %in) { 4099; SI-LABEL: global_atomic_max_i32_ret: 4100; SI: ; %bb.0: 4101; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4102; SI-NEXT: s_mov_b32 s6, 0 4103; SI-NEXT: s_mov_b32 s7, 0xf000 4104; SI-NEXT: s_mov_b32 s4, s6 4105; SI-NEXT: s_mov_b32 s5, s6 4106; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 4107; SI-NEXT: s_mov_b64 s[8:9], 0 4108; SI-NEXT: .LBB85_1: ; %atomicrmw.start 4109; SI-NEXT: ; =>This Inner Loop Header: Depth=1 4110; SI-NEXT: s_waitcnt vmcnt(0) 4111; SI-NEXT: v_mov_b32_e32 v5, v3 4112; SI-NEXT: s_waitcnt expcnt(0) 4113; SI-NEXT: v_max_i32_e32 v4, v5, v2 4114; SI-NEXT: v_mov_b32_e32 v3, v4 4115; SI-NEXT: v_mov_b32_e32 v4, v5 4116; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc 4117; SI-NEXT: s_waitcnt vmcnt(0) 4118; SI-NEXT: buffer_wbinvl1 4119; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 4120; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 4121; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] 4122; SI-NEXT: s_cbranch_execnz .LBB85_1 4123; SI-NEXT: ; %bb.2: ; %atomicrmw.end 4124; SI-NEXT: s_or_b64 exec, exec, s[8:9] 4125; SI-NEXT: v_mov_b32_e32 v0, v3 4126; SI-NEXT: s_waitcnt expcnt(0) 4127; SI-NEXT: s_setpc_b64 s[30:31] 4128; 4129; VI-LABEL: global_atomic_max_i32_ret: 4130; VI: ; %bb.0: 4131; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4132; VI-NEXT: flat_load_dword v3, v[0:1] 4133; VI-NEXT: s_mov_b64 s[4:5], 0 4134; VI-NEXT: .LBB85_1: ; %atomicrmw.start 4135; VI-NEXT: ; =>This Inner Loop Header: Depth=1 4136; VI-NEXT: s_waitcnt vmcnt(0) 4137; VI-NEXT: v_mov_b32_e32 v4, v3 4138; VI-NEXT: v_max_i32_e32 v3, v4, v2 4139; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 4140; VI-NEXT: s_waitcnt vmcnt(0) 4141; VI-NEXT: buffer_wbinvl1_vol 4142; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 
4143; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 4144; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] 4145; VI-NEXT: s_cbranch_execnz .LBB85_1 4146; VI-NEXT: ; %bb.2: ; %atomicrmw.end 4147; VI-NEXT: s_or_b64 exec, exec, s[4:5] 4148; VI-NEXT: v_mov_b32_e32 v0, v3 4149; VI-NEXT: s_setpc_b64 s[30:31] 4150; 4151; GFX9-LABEL: global_atomic_max_i32_ret: 4152; GFX9: ; %bb.0: 4153; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4154; GFX9-NEXT: global_load_dword v3, v[0:1], off 4155; GFX9-NEXT: s_mov_b64 s[4:5], 0 4156; GFX9-NEXT: .LBB85_1: ; %atomicrmw.start 4157; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 4158; GFX9-NEXT: s_waitcnt vmcnt(0) 4159; GFX9-NEXT: v_mov_b32_e32 v4, v3 4160; GFX9-NEXT: v_max_i32_e32 v3, v4, v2 4161; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc 4162; GFX9-NEXT: s_waitcnt vmcnt(0) 4163; GFX9-NEXT: buffer_wbinvl1_vol 4164; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 4165; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 4166; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 4167; GFX9-NEXT: s_cbranch_execnz .LBB85_1 4168; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 4169; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 4170; GFX9-NEXT: v_mov_b32_e32 v0, v3 4171; GFX9-NEXT: s_setpc_b64 s[30:31] 4172 %result = atomicrmw max ptr addrspace(1) %ptr, i32 %in seq_cst 4173 ret i32 %result 4174} 4175 4176define i32 @global_atomic_max_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { 4177; SI-LABEL: global_atomic_max_i32_ret_offset: 4178; SI: ; %bb.0: 4179; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4180; SI-NEXT: s_mov_b32 s6, 0 4181; SI-NEXT: s_mov_b32 s7, 0xf000 4182; SI-NEXT: s_mov_b32 s4, s6 4183; SI-NEXT: s_mov_b32 s5, s6 4184; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 4185; SI-NEXT: s_mov_b64 s[8:9], 0 4186; SI-NEXT: .LBB86_1: ; %atomicrmw.start 4187; SI-NEXT: ; =>This Inner Loop Header: Depth=1 4188; SI-NEXT: s_waitcnt vmcnt(0) 4189; SI-NEXT: v_mov_b32_e32 v5, v3 4190; SI-NEXT: s_waitcnt expcnt(0) 4191; SI-NEXT: v_max_i32_e32 v4, v5, v2 4192; 
SI-NEXT: v_mov_b32_e32 v3, v4 4193; SI-NEXT: v_mov_b32_e32 v4, v5 4194; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc 4195; SI-NEXT: s_waitcnt vmcnt(0) 4196; SI-NEXT: buffer_wbinvl1 4197; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 4198; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 4199; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] 4200; SI-NEXT: s_cbranch_execnz .LBB86_1 4201; SI-NEXT: ; %bb.2: ; %atomicrmw.end 4202; SI-NEXT: s_or_b64 exec, exec, s[8:9] 4203; SI-NEXT: v_mov_b32_e32 v0, v3 4204; SI-NEXT: s_waitcnt expcnt(0) 4205; SI-NEXT: s_setpc_b64 s[30:31] 4206; 4207; VI-LABEL: global_atomic_max_i32_ret_offset: 4208; VI: ; %bb.0: 4209; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4210; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0 4211; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 4212; VI-NEXT: flat_load_dword v0, v[3:4] 4213; VI-NEXT: s_mov_b64 s[4:5], 0 4214; VI-NEXT: .LBB86_1: ; %atomicrmw.start 4215; VI-NEXT: ; =>This Inner Loop Header: Depth=1 4216; VI-NEXT: s_waitcnt vmcnt(0) 4217; VI-NEXT: v_mov_b32_e32 v1, v0 4218; VI-NEXT: v_max_i32_e32 v0, v1, v2 4219; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 4220; VI-NEXT: s_waitcnt vmcnt(0) 4221; VI-NEXT: buffer_wbinvl1_vol 4222; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 4223; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 4224; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] 4225; VI-NEXT: s_cbranch_execnz .LBB86_1 4226; VI-NEXT: ; %bb.2: ; %atomicrmw.end 4227; VI-NEXT: s_or_b64 exec, exec, s[4:5] 4228; VI-NEXT: s_setpc_b64 s[30:31] 4229; 4230; GFX9-LABEL: global_atomic_max_i32_ret_offset: 4231; GFX9: ; %bb.0: 4232; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4233; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 4234; GFX9-NEXT: s_mov_b64 s[4:5], 0 4235; GFX9-NEXT: .LBB86_1: ; %atomicrmw.start 4236; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 4237; GFX9-NEXT: s_waitcnt vmcnt(0) 4238; GFX9-NEXT: v_mov_b32_e32 v4, v3 4239; GFX9-NEXT: v_max_i32_e32 v3, v4, v2 4240; GFX9-NEXT: global_atomic_cmpswap v3, 
v[0:1], v[3:4], off offset:16 glc 4241; GFX9-NEXT: s_waitcnt vmcnt(0) 4242; GFX9-NEXT: buffer_wbinvl1_vol 4243; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 4244; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 4245; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 4246; GFX9-NEXT: s_cbranch_execnz .LBB86_1 4247; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 4248; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 4249; GFX9-NEXT: v_mov_b32_e32 v0, v3 4250; GFX9-NEXT: s_setpc_b64 s[30:31] 4251 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 4252 %result = atomicrmw max ptr addrspace(1) %gep, i32 %in seq_cst 4253 ret i32 %result 4254} 4255 4256define amdgpu_gfx void @global_atomic_max_i32_noret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) { 4257; SI-LABEL: global_atomic_max_i32_noret_scalar: 4258; SI: ; %bb.0: 4259; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4260; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 4261; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill 4262; SI-NEXT: s_mov_b64 exec, s[34:35] 4263; SI-NEXT: s_waitcnt expcnt(0) 4264; SI-NEXT: v_writelane_b32 v4, s6, 0 4265; SI-NEXT: v_writelane_b32 v4, s7, 1 4266; SI-NEXT: s_mov_b32 s34, s6 4267; SI-NEXT: s_mov_b32 s7, 0xf000 4268; SI-NEXT: s_mov_b32 s6, -1 4269; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 4270; SI-NEXT: s_mov_b64 s[36:37], 0 4271; SI-NEXT: .LBB87_1: ; %atomicrmw.start 4272; SI-NEXT: ; =>This Inner Loop Header: Depth=1 4273; SI-NEXT: s_waitcnt vmcnt(0) 4274; SI-NEXT: v_max_i32_e32 v0, s34, v1 4275; SI-NEXT: s_waitcnt expcnt(0) 4276; SI-NEXT: v_mov_b32_e32 v3, v1 4277; SI-NEXT: v_mov_b32_e32 v2, v0 4278; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc 4279; SI-NEXT: s_waitcnt vmcnt(0) 4280; SI-NEXT: buffer_wbinvl1 4281; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 4282; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] 4283; SI-NEXT: v_mov_b32_e32 v1, v2 4284; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] 4285; SI-NEXT: s_cbranch_execnz .LBB87_1 4286; SI-NEXT: ; %bb.2: ; %atomicrmw.end 4287; SI-NEXT: 
s_or_b64 exec, exec, s[36:37] 4288; SI-NEXT: v_readlane_b32 s7, v4, 1 4289; SI-NEXT: v_readlane_b32 s6, v4, 0 4290; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 4291; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload 4292; SI-NEXT: s_mov_b64 exec, s[34:35] 4293; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 4294; SI-NEXT: s_setpc_b64 s[30:31] 4295; 4296; VI-LABEL: global_atomic_max_i32_noret_scalar: 4297; VI: ; %bb.0: 4298; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4299; VI-NEXT: v_mov_b32_e32 v0, s4 4300; VI-NEXT: v_mov_b32_e32 v1, s5 4301; VI-NEXT: flat_load_dword v3, v[0:1] 4302; VI-NEXT: s_mov_b64 s[34:35], 0 4303; VI-NEXT: .LBB87_1: ; %atomicrmw.start 4304; VI-NEXT: ; =>This Inner Loop Header: Depth=1 4305; VI-NEXT: s_waitcnt vmcnt(0) 4306; VI-NEXT: v_max_i32_e32 v2, s6, v3 4307; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4308; VI-NEXT: s_waitcnt vmcnt(0) 4309; VI-NEXT: buffer_wbinvl1_vol 4310; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 4311; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 4312; VI-NEXT: v_mov_b32_e32 v3, v2 4313; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] 4314; VI-NEXT: s_cbranch_execnz .LBB87_1 4315; VI-NEXT: ; %bb.2: ; %atomicrmw.end 4316; VI-NEXT: s_or_b64 exec, exec, s[34:35] 4317; VI-NEXT: s_setpc_b64 s[30:31] 4318; 4319; GFX9-LABEL: global_atomic_max_i32_noret_scalar: 4320; GFX9: ; %bb.0: 4321; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4322; GFX9-NEXT: v_mov_b32_e32 v2, 0 4323; GFX9-NEXT: global_load_dword v1, v2, s[4:5] 4324; GFX9-NEXT: s_mov_b64 s[34:35], 0 4325; GFX9-NEXT: .LBB87_1: ; %atomicrmw.start 4326; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 4327; GFX9-NEXT: s_waitcnt vmcnt(0) 4328; GFX9-NEXT: v_max_i32_e32 v0, s6, v1 4329; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] glc 4330; GFX9-NEXT: s_waitcnt vmcnt(0) 4331; GFX9-NEXT: buffer_wbinvl1_vol 4332; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 4333; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 4334; GFX9-NEXT: v_mov_b32_e32 v1, v0 4335; 
GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] 4336; GFX9-NEXT: s_cbranch_execnz .LBB87_1 4337; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 4338; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] 4339; GFX9-NEXT: s_setpc_b64 s[30:31] 4340 %tmp0 = atomicrmw max ptr addrspace(1) %ptr, i32 %in seq_cst 4341 ret void 4342} 4343 4344define amdgpu_gfx void @global_atomic_max_i32_noret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) { 4345; SI-LABEL: global_atomic_max_i32_noret_offset_scalar: 4346; SI: ; %bb.0: 4347; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4348; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 4349; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill 4350; SI-NEXT: s_mov_b64 exec, s[34:35] 4351; SI-NEXT: s_waitcnt expcnt(0) 4352; SI-NEXT: v_writelane_b32 v4, s6, 0 4353; SI-NEXT: v_writelane_b32 v4, s7, 1 4354; SI-NEXT: s_mov_b32 s34, s6 4355; SI-NEXT: s_mov_b32 s7, 0xf000 4356; SI-NEXT: s_mov_b32 s6, -1 4357; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16 4358; SI-NEXT: s_mov_b64 s[36:37], 0 4359; SI-NEXT: .LBB88_1: ; %atomicrmw.start 4360; SI-NEXT: ; =>This Inner Loop Header: Depth=1 4361; SI-NEXT: s_waitcnt vmcnt(0) 4362; SI-NEXT: v_max_i32_e32 v0, s34, v1 4363; SI-NEXT: s_waitcnt expcnt(0) 4364; SI-NEXT: v_mov_b32_e32 v3, v1 4365; SI-NEXT: v_mov_b32_e32 v2, v0 4366; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc 4367; SI-NEXT: s_waitcnt vmcnt(0) 4368; SI-NEXT: buffer_wbinvl1 4369; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 4370; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] 4371; SI-NEXT: v_mov_b32_e32 v1, v2 4372; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] 4373; SI-NEXT: s_cbranch_execnz .LBB88_1 4374; SI-NEXT: ; %bb.2: ; %atomicrmw.end 4375; SI-NEXT: s_or_b64 exec, exec, s[36:37] 4376; SI-NEXT: v_readlane_b32 s7, v4, 1 4377; SI-NEXT: v_readlane_b32 s6, v4, 0 4378; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 4379; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload 4380; SI-NEXT: s_mov_b64 exec, s[34:35] 
4381; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 4382; SI-NEXT: s_setpc_b64 s[30:31] 4383; 4384; VI-LABEL: global_atomic_max_i32_noret_offset_scalar: 4385; VI: ; %bb.0: 4386; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4387; VI-NEXT: s_add_u32 s34, s4, 16 4388; VI-NEXT: s_addc_u32 s35, s5, 0 4389; VI-NEXT: v_mov_b32_e32 v0, s34 4390; VI-NEXT: v_mov_b32_e32 v1, s35 4391; VI-NEXT: flat_load_dword v3, v[0:1] 4392; VI-NEXT: s_mov_b64 s[34:35], 0 4393; VI-NEXT: .LBB88_1: ; %atomicrmw.start 4394; VI-NEXT: ; =>This Inner Loop Header: Depth=1 4395; VI-NEXT: s_waitcnt vmcnt(0) 4396; VI-NEXT: v_max_i32_e32 v2, s6, v3 4397; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4398; VI-NEXT: s_waitcnt vmcnt(0) 4399; VI-NEXT: buffer_wbinvl1_vol 4400; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 4401; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 4402; VI-NEXT: v_mov_b32_e32 v3, v2 4403; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] 4404; VI-NEXT: s_cbranch_execnz .LBB88_1 4405; VI-NEXT: ; %bb.2: ; %atomicrmw.end 4406; VI-NEXT: s_or_b64 exec, exec, s[34:35] 4407; VI-NEXT: s_setpc_b64 s[30:31] 4408; 4409; GFX9-LABEL: global_atomic_max_i32_noret_offset_scalar: 4410; GFX9: ; %bb.0: 4411; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4412; GFX9-NEXT: v_mov_b32_e32 v2, 0 4413; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16 4414; GFX9-NEXT: s_mov_b64 s[34:35], 0 4415; GFX9-NEXT: .LBB88_1: ; %atomicrmw.start 4416; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 4417; GFX9-NEXT: s_waitcnt vmcnt(0) 4418; GFX9-NEXT: v_max_i32_e32 v0, s6, v1 4419; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 glc 4420; GFX9-NEXT: s_waitcnt vmcnt(0) 4421; GFX9-NEXT: buffer_wbinvl1_vol 4422; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 4423; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 4424; GFX9-NEXT: v_mov_b32_e32 v1, v0 4425; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] 4426; GFX9-NEXT: s_cbranch_execnz .LBB88_1 4427; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 4428; GFX9-NEXT: s_or_b64 exec, exec, 
s[34:35] 4429; GFX9-NEXT: s_setpc_b64 s[30:31] 4430 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 4431 %tmp0 = atomicrmw max ptr addrspace(1) %gep, i32 %in seq_cst 4432 ret void 4433} 4434 4435define amdgpu_gfx i32 @global_atomic_max_i32_ret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) { 4436; SI-LABEL: global_atomic_max_i32_ret_scalar: 4437; SI: ; %bb.0: 4438; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4439; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 4440; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill 4441; SI-NEXT: s_mov_b64 exec, s[34:35] 4442; SI-NEXT: s_waitcnt expcnt(0) 4443; SI-NEXT: v_writelane_b32 v3, s6, 0 4444; SI-NEXT: v_writelane_b32 v3, s7, 1 4445; SI-NEXT: s_mov_b32 s34, s6 4446; SI-NEXT: s_mov_b32 s7, 0xf000 4447; SI-NEXT: s_mov_b32 s6, -1 4448; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 4449; SI-NEXT: s_mov_b64 s[36:37], 0 4450; SI-NEXT: .LBB89_1: ; %atomicrmw.start 4451; SI-NEXT: ; =>This Inner Loop Header: Depth=1 4452; SI-NEXT: s_waitcnt vmcnt(0) 4453; SI-NEXT: v_mov_b32_e32 v2, v0 4454; SI-NEXT: s_waitcnt expcnt(0) 4455; SI-NEXT: v_max_i32_e32 v1, s34, v2 4456; SI-NEXT: v_mov_b32_e32 v0, v1 4457; SI-NEXT: v_mov_b32_e32 v1, v2 4458; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc 4459; SI-NEXT: s_waitcnt vmcnt(0) 4460; SI-NEXT: buffer_wbinvl1 4461; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 4462; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] 4463; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] 4464; SI-NEXT: s_cbranch_execnz .LBB89_1 4465; SI-NEXT: ; %bb.2: ; %atomicrmw.end 4466; SI-NEXT: s_or_b64 exec, exec, s[36:37] 4467; SI-NEXT: v_readlane_b32 s7, v3, 1 4468; SI-NEXT: v_readlane_b32 s6, v3, 0 4469; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 4470; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload 4471; SI-NEXT: s_mov_b64 exec, s[34:35] 4472; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 4473; SI-NEXT: s_setpc_b64 s[30:31] 4474; 4475; VI-LABEL: global_atomic_max_i32_ret_scalar: 4476; VI: ; 
%bb.0: 4477; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4478; VI-NEXT: v_mov_b32_e32 v0, s4 4479; VI-NEXT: v_mov_b32_e32 v1, s5 4480; VI-NEXT: flat_load_dword v0, v[0:1] 4481; VI-NEXT: v_mov_b32_e32 v1, s4 4482; VI-NEXT: s_mov_b64 s[34:35], 0 4483; VI-NEXT: v_mov_b32_e32 v2, s5 4484; VI-NEXT: .LBB89_1: ; %atomicrmw.start 4485; VI-NEXT: ; =>This Inner Loop Header: Depth=1 4486; VI-NEXT: s_waitcnt vmcnt(0) 4487; VI-NEXT: v_mov_b32_e32 v4, v0 4488; VI-NEXT: v_max_i32_e32 v3, s6, v4 4489; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc 4490; VI-NEXT: s_waitcnt vmcnt(0) 4491; VI-NEXT: buffer_wbinvl1_vol 4492; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 4493; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 4494; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] 4495; VI-NEXT: s_cbranch_execnz .LBB89_1 4496; VI-NEXT: ; %bb.2: ; %atomicrmw.end 4497; VI-NEXT: s_or_b64 exec, exec, s[34:35] 4498; VI-NEXT: s_setpc_b64 s[30:31] 4499; 4500; GFX9-LABEL: global_atomic_max_i32_ret_scalar: 4501; GFX9: ; %bb.0: 4502; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4503; GFX9-NEXT: v_mov_b32_e32 v1, 0 4504; GFX9-NEXT: global_load_dword v0, v1, s[4:5] 4505; GFX9-NEXT: s_mov_b64 s[34:35], 0 4506; GFX9-NEXT: .LBB89_1: ; %atomicrmw.start 4507; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 4508; GFX9-NEXT: s_waitcnt vmcnt(0) 4509; GFX9-NEXT: v_mov_b32_e32 v3, v0 4510; GFX9-NEXT: v_max_i32_e32 v2, s6, v3 4511; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc 4512; GFX9-NEXT: s_waitcnt vmcnt(0) 4513; GFX9-NEXT: buffer_wbinvl1_vol 4514; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 4515; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 4516; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] 4517; GFX9-NEXT: s_cbranch_execnz .LBB89_1 4518; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 4519; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] 4520; GFX9-NEXT: s_setpc_b64 s[30:31] 4521 %result = atomicrmw max ptr addrspace(1) %ptr, i32 %in seq_cst 4522 ret i32 %result 4523} 4524 4525define amdgpu_gfx i32 
@global_atomic_max_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) { 4526; SI-LABEL: global_atomic_max_i32_ret_offset_scalar: 4527; SI: ; %bb.0: 4528; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4529; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 4530; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill 4531; SI-NEXT: s_mov_b64 exec, s[34:35] 4532; SI-NEXT: s_waitcnt expcnt(0) 4533; SI-NEXT: v_writelane_b32 v3, s6, 0 4534; SI-NEXT: v_writelane_b32 v3, s7, 1 4535; SI-NEXT: s_mov_b32 s34, s6 4536; SI-NEXT: s_mov_b32 s7, 0xf000 4537; SI-NEXT: s_mov_b32 s6, -1 4538; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16 4539; SI-NEXT: s_mov_b64 s[36:37], 0 4540; SI-NEXT: .LBB90_1: ; %atomicrmw.start 4541; SI-NEXT: ; =>This Inner Loop Header: Depth=1 4542; SI-NEXT: s_waitcnt vmcnt(0) 4543; SI-NEXT: v_mov_b32_e32 v2, v0 4544; SI-NEXT: s_waitcnt expcnt(0) 4545; SI-NEXT: v_max_i32_e32 v1, s34, v2 4546; SI-NEXT: v_mov_b32_e32 v0, v1 4547; SI-NEXT: v_mov_b32_e32 v1, v2 4548; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc 4549; SI-NEXT: s_waitcnt vmcnt(0) 4550; SI-NEXT: buffer_wbinvl1 4551; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 4552; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] 4553; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] 4554; SI-NEXT: s_cbranch_execnz .LBB90_1 4555; SI-NEXT: ; %bb.2: ; %atomicrmw.end 4556; SI-NEXT: s_or_b64 exec, exec, s[36:37] 4557; SI-NEXT: v_readlane_b32 s7, v3, 1 4558; SI-NEXT: v_readlane_b32 s6, v3, 0 4559; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 4560; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload 4561; SI-NEXT: s_mov_b64 exec, s[34:35] 4562; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 4563; SI-NEXT: s_setpc_b64 s[30:31] 4564; 4565; VI-LABEL: global_atomic_max_i32_ret_offset_scalar: 4566; VI: ; %bb.0: 4567; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4568; VI-NEXT: s_add_u32 s34, s4, 16 4569; VI-NEXT: s_addc_u32 s35, s5, 0 4570; VI-NEXT: v_mov_b32_e32 v1, s34 4571; 
VI-NEXT: v_mov_b32_e32 v2, s35 4572; VI-NEXT: flat_load_dword v0, v[1:2] 4573; VI-NEXT: s_mov_b64 s[34:35], 0 4574; VI-NEXT: .LBB90_1: ; %atomicrmw.start 4575; VI-NEXT: ; =>This Inner Loop Header: Depth=1 4576; VI-NEXT: s_waitcnt vmcnt(0) 4577; VI-NEXT: v_mov_b32_e32 v4, v0 4578; VI-NEXT: v_max_i32_e32 v3, s6, v4 4579; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc 4580; VI-NEXT: s_waitcnt vmcnt(0) 4581; VI-NEXT: buffer_wbinvl1_vol 4582; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 4583; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 4584; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] 4585; VI-NEXT: s_cbranch_execnz .LBB90_1 4586; VI-NEXT: ; %bb.2: ; %atomicrmw.end 4587; VI-NEXT: s_or_b64 exec, exec, s[34:35] 4588; VI-NEXT: s_setpc_b64 s[30:31] 4589; 4590; GFX9-LABEL: global_atomic_max_i32_ret_offset_scalar: 4591; GFX9: ; %bb.0: 4592; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4593; GFX9-NEXT: v_mov_b32_e32 v1, 0 4594; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16 4595; GFX9-NEXT: s_mov_b64 s[34:35], 0 4596; GFX9-NEXT: .LBB90_1: ; %atomicrmw.start 4597; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 4598; GFX9-NEXT: s_waitcnt vmcnt(0) 4599; GFX9-NEXT: v_mov_b32_e32 v3, v0 4600; GFX9-NEXT: v_max_i32_e32 v2, s6, v3 4601; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] offset:16 glc 4602; GFX9-NEXT: s_waitcnt vmcnt(0) 4603; GFX9-NEXT: buffer_wbinvl1_vol 4604; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 4605; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 4606; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] 4607; GFX9-NEXT: s_cbranch_execnz .LBB90_1 4608; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 4609; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] 4610; GFX9-NEXT: s_setpc_b64 s[30:31] 4611 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 4612 %result = atomicrmw max ptr addrspace(1) %gep, i32 %in seq_cst 4613 ret i32 %result 4614} 4615 4616define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i32 %index) { 4617; SI-LABEL: 
atomic_max_i32_addr64_offset: 4618; SI: ; %bb.0: ; %entry 4619; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 4620; SI-NEXT: s_waitcnt lgkmcnt(0) 4621; SI-NEXT: s_ashr_i32 s5, s3, 31 4622; SI-NEXT: s_mov_b32 s4, s3 4623; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 4624; SI-NEXT: s_add_u32 s4, s0, s4 4625; SI-NEXT: s_addc_u32 s5, s1, s5 4626; SI-NEXT: s_load_dword s3, s[4:5], 0x4 4627; SI-NEXT: s_mov_b64 s[0:1], 0 4628; SI-NEXT: s_mov_b32 s7, 0xf000 4629; SI-NEXT: s_waitcnt lgkmcnt(0) 4630; SI-NEXT: v_mov_b32_e32 v1, s3 4631; SI-NEXT: s_mov_b32 s6, -1 4632; SI-NEXT: .LBB91_1: ; %atomicrmw.start 4633; SI-NEXT: ; =>This Inner Loop Header: Depth=1 4634; SI-NEXT: v_max_i32_e32 v0, s2, v1 4635; SI-NEXT: s_waitcnt expcnt(0) 4636; SI-NEXT: v_mov_b32_e32 v3, v1 4637; SI-NEXT: v_mov_b32_e32 v2, v0 4638; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc 4639; SI-NEXT: s_waitcnt vmcnt(0) 4640; SI-NEXT: buffer_wbinvl1 4641; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 4642; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 4643; SI-NEXT: v_mov_b32_e32 v1, v2 4644; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] 4645; SI-NEXT: s_cbranch_execnz .LBB91_1 4646; SI-NEXT: ; %bb.2: ; %atomicrmw.end 4647; SI-NEXT: s_endpgm 4648; 4649; VI-LABEL: atomic_max_i32_addr64_offset: 4650; VI: ; %bb.0: ; %entry 4651; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 4652; VI-NEXT: s_waitcnt lgkmcnt(0) 4653; VI-NEXT: s_ashr_i32 s5, s3, 31 4654; VI-NEXT: s_mov_b32 s4, s3 4655; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 4656; VI-NEXT: s_add_u32 s4, s0, s4 4657; VI-NEXT: s_addc_u32 s5, s1, s5 4658; VI-NEXT: s_load_dword s3, s[4:5], 0x10 4659; VI-NEXT: s_add_u32 s4, s4, 16 4660; VI-NEXT: s_addc_u32 s5, s5, 0 4661; VI-NEXT: v_mov_b32_e32 v0, s4 4662; VI-NEXT: s_mov_b64 s[0:1], 0 4663; VI-NEXT: s_waitcnt lgkmcnt(0) 4664; VI-NEXT: v_mov_b32_e32 v3, s3 4665; VI-NEXT: v_mov_b32_e32 v1, s5 4666; VI-NEXT: .LBB91_1: ; %atomicrmw.start 4667; VI-NEXT: ; =>This Inner Loop Header: Depth=1 4668; VI-NEXT: v_max_i32_e32 v2, s2, v3 4669; 
VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4670; VI-NEXT: s_waitcnt vmcnt(0) 4671; VI-NEXT: buffer_wbinvl1_vol 4672; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 4673; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 4674; VI-NEXT: v_mov_b32_e32 v3, v2 4675; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] 4676; VI-NEXT: s_cbranch_execnz .LBB91_1 4677; VI-NEXT: ; %bb.2: ; %atomicrmw.end 4678; VI-NEXT: s_endpgm 4679; 4680; GFX9-LABEL: atomic_max_i32_addr64_offset: 4681; GFX9: ; %bb.0: ; %entry 4682; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 4683; GFX9-NEXT: v_mov_b32_e32 v2, 0 4684; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4685; GFX9-NEXT: s_ashr_i32 s5, s3, 31 4686; GFX9-NEXT: s_mov_b32 s4, s3 4687; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 4688; GFX9-NEXT: s_add_u32 s0, s0, s4 4689; GFX9-NEXT: s_addc_u32 s1, s1, s5 4690; GFX9-NEXT: s_load_dword s3, s[0:1], 0x10 4691; GFX9-NEXT: s_mov_b64 s[4:5], 0 4692; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4693; GFX9-NEXT: v_mov_b32_e32 v1, s3 4694; GFX9-NEXT: .LBB91_1: ; %atomicrmw.start 4695; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 4696; GFX9-NEXT: v_max_i32_e32 v0, s2, v1 4697; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc 4698; GFX9-NEXT: s_waitcnt vmcnt(0) 4699; GFX9-NEXT: buffer_wbinvl1_vol 4700; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 4701; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 4702; GFX9-NEXT: v_mov_b32_e32 v1, v0 4703; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 4704; GFX9-NEXT: s_cbranch_execnz .LBB91_1 4705; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 4706; GFX9-NEXT: s_endpgm 4707entry: 4708 %ptr = getelementptr i32, ptr addrspace(1) %out, i32 %index 4709 %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4 4710 %tmp0 = atomicrmw max ptr addrspace(1) %gep, i32 %in seq_cst 4711 ret void 4712} 4713 4714define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %index) { 4715; SI-LABEL: atomic_max_i32_ret_addr64_offset: 4716; SI: ; %bb.0: ; %entry 4717; SI-NEXT: 
s_load_dwordx2 s[8:9], s[4:5], 0xd 4718; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 4719; SI-NEXT: s_waitcnt lgkmcnt(0) 4720; SI-NEXT: s_ashr_i32 s5, s9, 31 4721; SI-NEXT: s_mov_b32 s4, s9 4722; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 4723; SI-NEXT: s_add_u32 s4, s0, s4 4724; SI-NEXT: s_addc_u32 s5, s1, s5 4725; SI-NEXT: s_load_dword s6, s[4:5], 0x4 4726; SI-NEXT: s_mov_b64 s[0:1], 0 4727; SI-NEXT: s_mov_b32 s7, 0xf000 4728; SI-NEXT: s_waitcnt lgkmcnt(0) 4729; SI-NEXT: v_mov_b32_e32 v1, s6 4730; SI-NEXT: s_mov_b32 s6, -1 4731; SI-NEXT: .LBB92_1: ; %atomicrmw.start 4732; SI-NEXT: ; =>This Inner Loop Header: Depth=1 4733; SI-NEXT: v_max_i32_e32 v0, s8, v1 4734; SI-NEXT: s_waitcnt expcnt(0) 4735; SI-NEXT: v_mov_b32_e32 v3, v1 4736; SI-NEXT: v_mov_b32_e32 v2, v0 4737; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc 4738; SI-NEXT: s_waitcnt vmcnt(0) 4739; SI-NEXT: buffer_wbinvl1 4740; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 4741; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 4742; SI-NEXT: v_mov_b32_e32 v1, v2 4743; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] 4744; SI-NEXT: s_cbranch_execnz .LBB92_1 4745; SI-NEXT: ; %bb.2: ; %atomicrmw.end 4746; SI-NEXT: s_or_b64 exec, exec, s[0:1] 4747; SI-NEXT: s_mov_b32 s7, 0xf000 4748; SI-NEXT: s_mov_b32 s6, -1 4749; SI-NEXT: s_mov_b32 s4, s2 4750; SI-NEXT: s_mov_b32 s5, s3 4751; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 4752; SI-NEXT: s_endpgm 4753; 4754; VI-LABEL: atomic_max_i32_ret_addr64_offset: 4755; VI: ; %bb.0: ; %entry 4756; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 4757; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 4758; VI-NEXT: s_waitcnt lgkmcnt(0) 4759; VI-NEXT: s_ashr_i32 s5, s7, 31 4760; VI-NEXT: s_mov_b32 s4, s7 4761; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 4762; VI-NEXT: s_add_u32 s4, s0, s4 4763; VI-NEXT: s_addc_u32 s5, s1, s5 4764; VI-NEXT: s_load_dword s7, s[4:5], 0x10 4765; VI-NEXT: s_add_u32 s4, s4, 16 4766; VI-NEXT: s_addc_u32 s5, s5, 0 4767; VI-NEXT: v_mov_b32_e32 v0, s4 4768; VI-NEXT: s_mov_b64 
s[0:1], 0 4769; VI-NEXT: s_waitcnt lgkmcnt(0) 4770; VI-NEXT: v_mov_b32_e32 v2, s7 4771; VI-NEXT: v_mov_b32_e32 v1, s5 4772; VI-NEXT: .LBB92_1: ; %atomicrmw.start 4773; VI-NEXT: ; =>This Inner Loop Header: Depth=1 4774; VI-NEXT: v_mov_b32_e32 v3, v2 4775; VI-NEXT: v_max_i32_e32 v2, s6, v3 4776; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4777; VI-NEXT: s_waitcnt vmcnt(0) 4778; VI-NEXT: buffer_wbinvl1_vol 4779; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 4780; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 4781; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] 4782; VI-NEXT: s_cbranch_execnz .LBB92_1 4783; VI-NEXT: ; %bb.2: ; %atomicrmw.end 4784; VI-NEXT: s_or_b64 exec, exec, s[0:1] 4785; VI-NEXT: v_mov_b32_e32 v0, s2 4786; VI-NEXT: v_mov_b32_e32 v1, s3 4787; VI-NEXT: flat_store_dword v[0:1], v2 4788; VI-NEXT: s_endpgm 4789; 4790; GFX9-LABEL: atomic_max_i32_ret_addr64_offset: 4791; GFX9: ; %bb.0: ; %entry 4792; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 4793; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 4794; GFX9-NEXT: v_mov_b32_e32 v1, 0 4795; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4796; GFX9-NEXT: s_ashr_i32 s5, s7, 31 4797; GFX9-NEXT: s_mov_b32 s4, s7 4798; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 4799; GFX9-NEXT: s_add_u32 s0, s0, s4 4800; GFX9-NEXT: s_addc_u32 s1, s1, s5 4801; GFX9-NEXT: s_load_dword s7, s[0:1], 0x10 4802; GFX9-NEXT: s_mov_b64 s[4:5], 0 4803; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4804; GFX9-NEXT: v_mov_b32_e32 v0, s7 4805; GFX9-NEXT: .LBB92_1: ; %atomicrmw.start 4806; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 4807; GFX9-NEXT: v_mov_b32_e32 v3, v0 4808; GFX9-NEXT: v_max_i32_e32 v2, s6, v3 4809; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] offset:16 glc 4810; GFX9-NEXT: s_waitcnt vmcnt(0) 4811; GFX9-NEXT: buffer_wbinvl1_vol 4812; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 4813; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 4814; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 4815; GFX9-NEXT: s_cbranch_execnz .LBB92_1 4816; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 4817; 
GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 4818; GFX9-NEXT: v_mov_b32_e32 v1, 0 4819; GFX9-NEXT: global_store_dword v1, v0, s[2:3] 4820; GFX9-NEXT: s_endpgm 4821entry: 4822 %ptr = getelementptr i32, ptr addrspace(1) %out, i32 %index 4823 %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4 4824 %tmp0 = atomicrmw max ptr addrspace(1) %gep, i32 %in seq_cst 4825 store i32 %tmp0, ptr addrspace(1) %out2 4826 ret void 4827} 4828 4829define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, i32 %index) { 4830; SI-LABEL: atomic_max_i32_addr64: 4831; SI: ; %bb.0: ; %entry 4832; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 4833; SI-NEXT: s_waitcnt lgkmcnt(0) 4834; SI-NEXT: s_ashr_i32 s5, s3, 31 4835; SI-NEXT: s_mov_b32 s4, s3 4836; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 4837; SI-NEXT: s_add_u32 s4, s0, s4 4838; SI-NEXT: s_addc_u32 s5, s1, s5 4839; SI-NEXT: s_load_dword s3, s[4:5], 0x0 4840; SI-NEXT: s_mov_b64 s[0:1], 0 4841; SI-NEXT: s_mov_b32 s7, 0xf000 4842; SI-NEXT: s_waitcnt lgkmcnt(0) 4843; SI-NEXT: v_mov_b32_e32 v1, s3 4844; SI-NEXT: s_mov_b32 s6, -1 4845; SI-NEXT: .LBB93_1: ; %atomicrmw.start 4846; SI-NEXT: ; =>This Inner Loop Header: Depth=1 4847; SI-NEXT: v_max_i32_e32 v0, s2, v1 4848; SI-NEXT: s_waitcnt expcnt(0) 4849; SI-NEXT: v_mov_b32_e32 v3, v1 4850; SI-NEXT: v_mov_b32_e32 v2, v0 4851; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc 4852; SI-NEXT: s_waitcnt vmcnt(0) 4853; SI-NEXT: buffer_wbinvl1 4854; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 4855; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 4856; SI-NEXT: v_mov_b32_e32 v1, v2 4857; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] 4858; SI-NEXT: s_cbranch_execnz .LBB93_1 4859; SI-NEXT: ; %bb.2: ; %atomicrmw.end 4860; SI-NEXT: s_endpgm 4861; 4862; VI-LABEL: atomic_max_i32_addr64: 4863; VI: ; %bb.0: ; %entry 4864; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 4865; VI-NEXT: s_waitcnt lgkmcnt(0) 4866; VI-NEXT: s_ashr_i32 s5, s3, 31 4867; VI-NEXT: s_mov_b32 s4, s3 4868; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 
2 4869; VI-NEXT: s_add_u32 s4, s0, s4 4870; VI-NEXT: s_addc_u32 s5, s1, s5 4871; VI-NEXT: s_load_dword s3, s[4:5], 0x0 4872; VI-NEXT: v_mov_b32_e32 v0, s4 4873; VI-NEXT: s_mov_b64 s[0:1], 0 4874; VI-NEXT: v_mov_b32_e32 v1, s5 4875; VI-NEXT: s_waitcnt lgkmcnt(0) 4876; VI-NEXT: v_mov_b32_e32 v3, s3 4877; VI-NEXT: .LBB93_1: ; %atomicrmw.start 4878; VI-NEXT: ; =>This Inner Loop Header: Depth=1 4879; VI-NEXT: v_max_i32_e32 v2, s2, v3 4880; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4881; VI-NEXT: s_waitcnt vmcnt(0) 4882; VI-NEXT: buffer_wbinvl1_vol 4883; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 4884; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 4885; VI-NEXT: v_mov_b32_e32 v3, v2 4886; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] 4887; VI-NEXT: s_cbranch_execnz .LBB93_1 4888; VI-NEXT: ; %bb.2: ; %atomicrmw.end 4889; VI-NEXT: s_endpgm 4890; 4891; GFX9-LABEL: atomic_max_i32_addr64: 4892; GFX9: ; %bb.0: ; %entry 4893; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 4894; GFX9-NEXT: v_mov_b32_e32 v2, 0 4895; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4896; GFX9-NEXT: s_ashr_i32 s5, s3, 31 4897; GFX9-NEXT: s_mov_b32 s4, s3 4898; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 4899; GFX9-NEXT: s_add_u32 s0, s0, s4 4900; GFX9-NEXT: s_addc_u32 s1, s1, s5 4901; GFX9-NEXT: s_load_dword s3, s[0:1], 0x0 4902; GFX9-NEXT: s_mov_b64 s[4:5], 0 4903; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4904; GFX9-NEXT: v_mov_b32_e32 v1, s3 4905; GFX9-NEXT: .LBB93_1: ; %atomicrmw.start 4906; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 4907; GFX9-NEXT: v_max_i32_e32 v0, s2, v1 4908; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc 4909; GFX9-NEXT: s_waitcnt vmcnt(0) 4910; GFX9-NEXT: buffer_wbinvl1_vol 4911; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 4912; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 4913; GFX9-NEXT: v_mov_b32_e32 v1, v0 4914; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 4915; GFX9-NEXT: s_cbranch_execnz .LBB93_1 4916; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 4917; GFX9-NEXT: s_endpgm 4918entry: 4919 %ptr = 
getelementptr i32, ptr addrspace(1) %out, i32 %index 4920 %tmp0 = atomicrmw max ptr addrspace(1) %ptr, i32 %in seq_cst 4921 ret void 4922} 4923 4924define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %index) { 4925; SI-LABEL: atomic_max_i32_ret_addr64: 4926; SI: ; %bb.0: ; %entry 4927; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd 4928; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 4929; SI-NEXT: s_waitcnt lgkmcnt(0) 4930; SI-NEXT: s_ashr_i32 s5, s9, 31 4931; SI-NEXT: s_mov_b32 s4, s9 4932; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 4933; SI-NEXT: s_add_u32 s4, s0, s4 4934; SI-NEXT: s_addc_u32 s5, s1, s5 4935; SI-NEXT: s_load_dword s6, s[4:5], 0x0 4936; SI-NEXT: s_mov_b64 s[0:1], 0 4937; SI-NEXT: s_mov_b32 s7, 0xf000 4938; SI-NEXT: s_waitcnt lgkmcnt(0) 4939; SI-NEXT: v_mov_b32_e32 v1, s6 4940; SI-NEXT: s_mov_b32 s6, -1 4941; SI-NEXT: .LBB94_1: ; %atomicrmw.start 4942; SI-NEXT: ; =>This Inner Loop Header: Depth=1 4943; SI-NEXT: v_max_i32_e32 v0, s8, v1 4944; SI-NEXT: s_waitcnt expcnt(0) 4945; SI-NEXT: v_mov_b32_e32 v3, v1 4946; SI-NEXT: v_mov_b32_e32 v2, v0 4947; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc 4948; SI-NEXT: s_waitcnt vmcnt(0) 4949; SI-NEXT: buffer_wbinvl1 4950; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 4951; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 4952; SI-NEXT: v_mov_b32_e32 v1, v2 4953; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] 4954; SI-NEXT: s_cbranch_execnz .LBB94_1 4955; SI-NEXT: ; %bb.2: ; %atomicrmw.end 4956; SI-NEXT: s_or_b64 exec, exec, s[0:1] 4957; SI-NEXT: s_mov_b32 s7, 0xf000 4958; SI-NEXT: s_mov_b32 s6, -1 4959; SI-NEXT: s_mov_b32 s4, s2 4960; SI-NEXT: s_mov_b32 s5, s3 4961; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 4962; SI-NEXT: s_endpgm 4963; 4964; VI-LABEL: atomic_max_i32_ret_addr64: 4965; VI: ; %bb.0: ; %entry 4966; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 4967; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 4968; VI-NEXT: s_waitcnt lgkmcnt(0) 4969; VI-NEXT: s_ashr_i32 
s5, s7, 31 4970; VI-NEXT: s_mov_b32 s4, s7 4971; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 4972; VI-NEXT: s_add_u32 s4, s0, s4 4973; VI-NEXT: s_addc_u32 s5, s1, s5 4974; VI-NEXT: s_load_dword s7, s[4:5], 0x0 4975; VI-NEXT: v_mov_b32_e32 v0, s4 4976; VI-NEXT: s_mov_b64 s[0:1], 0 4977; VI-NEXT: v_mov_b32_e32 v1, s5 4978; VI-NEXT: s_waitcnt lgkmcnt(0) 4979; VI-NEXT: v_mov_b32_e32 v2, s7 4980; VI-NEXT: .LBB94_1: ; %atomicrmw.start 4981; VI-NEXT: ; =>This Inner Loop Header: Depth=1 4982; VI-NEXT: v_mov_b32_e32 v3, v2 4983; VI-NEXT: v_max_i32_e32 v2, s6, v3 4984; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4985; VI-NEXT: s_waitcnt vmcnt(0) 4986; VI-NEXT: buffer_wbinvl1_vol 4987; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 4988; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 4989; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] 4990; VI-NEXT: s_cbranch_execnz .LBB94_1 4991; VI-NEXT: ; %bb.2: ; %atomicrmw.end 4992; VI-NEXT: s_or_b64 exec, exec, s[0:1] 4993; VI-NEXT: v_mov_b32_e32 v0, s2 4994; VI-NEXT: v_mov_b32_e32 v1, s3 4995; VI-NEXT: flat_store_dword v[0:1], v2 4996; VI-NEXT: s_endpgm 4997; 4998; GFX9-LABEL: atomic_max_i32_ret_addr64: 4999; GFX9: ; %bb.0: ; %entry 5000; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 5001; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 5002; GFX9-NEXT: v_mov_b32_e32 v1, 0 5003; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5004; GFX9-NEXT: s_ashr_i32 s5, s7, 31 5005; GFX9-NEXT: s_mov_b32 s4, s7 5006; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 5007; GFX9-NEXT: s_add_u32 s0, s0, s4 5008; GFX9-NEXT: s_addc_u32 s1, s1, s5 5009; GFX9-NEXT: s_load_dword s7, s[0:1], 0x0 5010; GFX9-NEXT: s_mov_b64 s[4:5], 0 5011; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5012; GFX9-NEXT: v_mov_b32_e32 v0, s7 5013; GFX9-NEXT: .LBB94_1: ; %atomicrmw.start 5014; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 5015; GFX9-NEXT: v_mov_b32_e32 v3, v0 5016; GFX9-NEXT: v_max_i32_e32 v2, s6, v3 5017; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc 5018; GFX9-NEXT: s_waitcnt vmcnt(0) 5019; GFX9-NEXT: 
buffer_wbinvl1_vol 5020; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 5021; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5022; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 5023; GFX9-NEXT: s_cbranch_execnz .LBB94_1 5024; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 5025; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 5026; GFX9-NEXT: v_mov_b32_e32 v1, 0 5027; GFX9-NEXT: global_store_dword v1, v0, s[2:3] 5028; GFX9-NEXT: s_endpgm 5029entry: 5030 %ptr = getelementptr i32, ptr addrspace(1) %out, i32 %index 5031 %tmp0 = atomicrmw max ptr addrspace(1) %ptr, i32 %in seq_cst 5032 store i32 %tmp0, ptr addrspace(1) %out2 5033 ret void 5034} 5035 5036define void @global_atomic_max_i32_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) { 5037; SI-LABEL: global_atomic_max_i32_noret_offset__amdgpu_no_remote_memory: 5038; SI: ; %bb.0: 5039; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5040; SI-NEXT: s_mov_b32 s6, 0 5041; SI-NEXT: s_mov_b32 s7, 0xf000 5042; SI-NEXT: s_mov_b32 s4, s6 5043; SI-NEXT: s_mov_b32 s5, s6 5044; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 5045; SI-NEXT: s_mov_b64 s[8:9], 0 5046; SI-NEXT: .LBB95_1: ; %atomicrmw.start 5047; SI-NEXT: ; =>This Inner Loop Header: Depth=1 5048; SI-NEXT: s_waitcnt vmcnt(0) 5049; SI-NEXT: v_max_i32_e32 v3, v4, v2 5050; SI-NEXT: s_waitcnt expcnt(0) 5051; SI-NEXT: v_mov_b32_e32 v6, v4 5052; SI-NEXT: v_mov_b32_e32 v5, v3 5053; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc 5054; SI-NEXT: s_waitcnt vmcnt(0) 5055; SI-NEXT: buffer_wbinvl1 5056; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 5057; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 5058; SI-NEXT: v_mov_b32_e32 v4, v5 5059; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] 5060; SI-NEXT: s_cbranch_execnz .LBB95_1 5061; SI-NEXT: ; %bb.2: ; %atomicrmw.end 5062; SI-NEXT: s_or_b64 exec, exec, s[8:9] 5063; SI-NEXT: s_waitcnt expcnt(0) 5064; SI-NEXT: s_setpc_b64 s[30:31] 5065; 5066; VI-LABEL: global_atomic_max_i32_noret_offset__amdgpu_no_remote_memory: 5067; VI: 
; %bb.0: 5068; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5069; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 5070; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 5071; VI-NEXT: flat_load_dword v4, v[0:1] 5072; VI-NEXT: s_mov_b64 s[4:5], 0 5073; VI-NEXT: .LBB95_1: ; %atomicrmw.start 5074; VI-NEXT: ; =>This Inner Loop Header: Depth=1 5075; VI-NEXT: s_waitcnt vmcnt(0) 5076; VI-NEXT: v_max_i32_e32 v3, v4, v2 5077; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 5078; VI-NEXT: s_waitcnt vmcnt(0) 5079; VI-NEXT: buffer_wbinvl1_vol 5080; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 5081; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5082; VI-NEXT: v_mov_b32_e32 v4, v3 5083; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] 5084; VI-NEXT: s_cbranch_execnz .LBB95_1 5085; VI-NEXT: ; %bb.2: ; %atomicrmw.end 5086; VI-NEXT: s_or_b64 exec, exec, s[4:5] 5087; VI-NEXT: s_setpc_b64 s[30:31] 5088; 5089; GFX9-LABEL: global_atomic_max_i32_noret_offset__amdgpu_no_remote_memory: 5090; GFX9: ; %bb.0: 5091; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5092; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 5093; GFX9-NEXT: s_mov_b64 s[4:5], 0 5094; GFX9-NEXT: .LBB95_1: ; %atomicrmw.start 5095; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 5096; GFX9-NEXT: s_waitcnt vmcnt(0) 5097; GFX9-NEXT: v_max_i32_e32 v3, v4, v2 5098; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc 5099; GFX9-NEXT: s_waitcnt vmcnt(0) 5100; GFX9-NEXT: buffer_wbinvl1_vol 5101; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 5102; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5103; GFX9-NEXT: v_mov_b32_e32 v4, v3 5104; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 5105; GFX9-NEXT: s_cbranch_execnz .LBB95_1 5106; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 5107; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 5108; GFX9-NEXT: s_setpc_b64 s[30:31] 5109 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 5110 %tmp0 = atomicrmw max ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 5111 ret void 5112} 5113 5114define i32 
@global_atomic_max_i32_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) { 5115; SI-LABEL: global_atomic_max_i32_ret_offset__amdgpu_no_remote_memory: 5116; SI: ; %bb.0: 5117; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5118; SI-NEXT: s_mov_b32 s6, 0 5119; SI-NEXT: s_mov_b32 s7, 0xf000 5120; SI-NEXT: s_mov_b32 s4, s6 5121; SI-NEXT: s_mov_b32 s5, s6 5122; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 5123; SI-NEXT: s_mov_b64 s[8:9], 0 5124; SI-NEXT: .LBB96_1: ; %atomicrmw.start 5125; SI-NEXT: ; =>This Inner Loop Header: Depth=1 5126; SI-NEXT: s_waitcnt vmcnt(0) 5127; SI-NEXT: v_mov_b32_e32 v5, v3 5128; SI-NEXT: s_waitcnt expcnt(0) 5129; SI-NEXT: v_max_i32_e32 v4, v5, v2 5130; SI-NEXT: v_mov_b32_e32 v3, v4 5131; SI-NEXT: v_mov_b32_e32 v4, v5 5132; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc 5133; SI-NEXT: s_waitcnt vmcnt(0) 5134; SI-NEXT: buffer_wbinvl1 5135; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 5136; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 5137; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] 5138; SI-NEXT: s_cbranch_execnz .LBB96_1 5139; SI-NEXT: ; %bb.2: ; %atomicrmw.end 5140; SI-NEXT: s_or_b64 exec, exec, s[8:9] 5141; SI-NEXT: v_mov_b32_e32 v0, v3 5142; SI-NEXT: s_waitcnt expcnt(0) 5143; SI-NEXT: s_setpc_b64 s[30:31] 5144; 5145; VI-LABEL: global_atomic_max_i32_ret_offset__amdgpu_no_remote_memory: 5146; VI: ; %bb.0: 5147; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5148; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0 5149; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 5150; VI-NEXT: flat_load_dword v0, v[3:4] 5151; VI-NEXT: s_mov_b64 s[4:5], 0 5152; VI-NEXT: .LBB96_1: ; %atomicrmw.start 5153; VI-NEXT: ; =>This Inner Loop Header: Depth=1 5154; VI-NEXT: s_waitcnt vmcnt(0) 5155; VI-NEXT: v_mov_b32_e32 v1, v0 5156; VI-NEXT: v_max_i32_e32 v0, v1, v2 5157; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 5158; VI-NEXT: s_waitcnt vmcnt(0) 5159; VI-NEXT: buffer_wbinvl1_vol 5160; VI-NEXT: v_cmp_eq_u32_e32 
vcc, v0, v1 5161; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5162; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] 5163; VI-NEXT: s_cbranch_execnz .LBB96_1 5164; VI-NEXT: ; %bb.2: ; %atomicrmw.end 5165; VI-NEXT: s_or_b64 exec, exec, s[4:5] 5166; VI-NEXT: s_setpc_b64 s[30:31] 5167; 5168; GFX9-LABEL: global_atomic_max_i32_ret_offset__amdgpu_no_remote_memory: 5169; GFX9: ; %bb.0: 5170; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5171; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 5172; GFX9-NEXT: s_mov_b64 s[4:5], 0 5173; GFX9-NEXT: .LBB96_1: ; %atomicrmw.start 5174; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 5175; GFX9-NEXT: s_waitcnt vmcnt(0) 5176; GFX9-NEXT: v_mov_b32_e32 v4, v3 5177; GFX9-NEXT: v_max_i32_e32 v3, v4, v2 5178; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc 5179; GFX9-NEXT: s_waitcnt vmcnt(0) 5180; GFX9-NEXT: buffer_wbinvl1_vol 5181; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 5182; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5183; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 5184; GFX9-NEXT: s_cbranch_execnz .LBB96_1 5185; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 5186; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 5187; GFX9-NEXT: v_mov_b32_e32 v0, v3 5188; GFX9-NEXT: s_setpc_b64 s[30:31] 5189 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 5190 %result = atomicrmw max ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 5191 ret i32 %result 5192} 5193 5194; --------------------------------------------------------------------- 5195; atomicrmw umax 5196; --------------------------------------------------------------------- 5197 5198define void @global_atomic_umax_i32_noret(ptr addrspace(1) %ptr, i32 %in) { 5199; SI-LABEL: global_atomic_umax_i32_noret: 5200; SI: ; %bb.0: 5201; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5202; SI-NEXT: s_mov_b32 s6, 0 5203; SI-NEXT: s_mov_b32 s7, 0xf000 5204; SI-NEXT: s_mov_b32 s4, s6 5205; SI-NEXT: s_mov_b32 s5, s6 5206; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 5207; 
SI-NEXT: s_mov_b64 s[8:9], 0 5208; SI-NEXT: .LBB97_1: ; %atomicrmw.start 5209; SI-NEXT: ; =>This Inner Loop Header: Depth=1 5210; SI-NEXT: s_waitcnt vmcnt(0) 5211; SI-NEXT: v_max_u32_e32 v3, v4, v2 5212; SI-NEXT: s_waitcnt expcnt(0) 5213; SI-NEXT: v_mov_b32_e32 v6, v4 5214; SI-NEXT: v_mov_b32_e32 v5, v3 5215; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc 5216; SI-NEXT: s_waitcnt vmcnt(0) 5217; SI-NEXT: buffer_wbinvl1 5218; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 5219; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 5220; SI-NEXT: v_mov_b32_e32 v4, v5 5221; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] 5222; SI-NEXT: s_cbranch_execnz .LBB97_1 5223; SI-NEXT: ; %bb.2: ; %atomicrmw.end 5224; SI-NEXT: s_or_b64 exec, exec, s[8:9] 5225; SI-NEXT: s_waitcnt expcnt(0) 5226; SI-NEXT: s_setpc_b64 s[30:31] 5227; 5228; VI-LABEL: global_atomic_umax_i32_noret: 5229; VI: ; %bb.0: 5230; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5231; VI-NEXT: flat_load_dword v4, v[0:1] 5232; VI-NEXT: s_mov_b64 s[4:5], 0 5233; VI-NEXT: .LBB97_1: ; %atomicrmw.start 5234; VI-NEXT: ; =>This Inner Loop Header: Depth=1 5235; VI-NEXT: s_waitcnt vmcnt(0) 5236; VI-NEXT: v_max_u32_e32 v3, v4, v2 5237; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 5238; VI-NEXT: s_waitcnt vmcnt(0) 5239; VI-NEXT: buffer_wbinvl1_vol 5240; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 5241; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5242; VI-NEXT: v_mov_b32_e32 v4, v3 5243; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] 5244; VI-NEXT: s_cbranch_execnz .LBB97_1 5245; VI-NEXT: ; %bb.2: ; %atomicrmw.end 5246; VI-NEXT: s_or_b64 exec, exec, s[4:5] 5247; VI-NEXT: s_setpc_b64 s[30:31] 5248; 5249; GFX9-LABEL: global_atomic_umax_i32_noret: 5250; GFX9: ; %bb.0: 5251; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5252; GFX9-NEXT: global_load_dword v4, v[0:1], off 5253; GFX9-NEXT: s_mov_b64 s[4:5], 0 5254; GFX9-NEXT: .LBB97_1: ; %atomicrmw.start 5255; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 5256; GFX9-NEXT: s_waitcnt vmcnt(0) 5257; 
GFX9-NEXT: v_max_u32_e32 v3, v4, v2 5258; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc 5259; GFX9-NEXT: s_waitcnt vmcnt(0) 5260; GFX9-NEXT: buffer_wbinvl1_vol 5261; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 5262; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5263; GFX9-NEXT: v_mov_b32_e32 v4, v3 5264; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 5265; GFX9-NEXT: s_cbranch_execnz .LBB97_1 5266; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 5267; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 5268; GFX9-NEXT: s_setpc_b64 s[30:31] 5269 %tmp0 = atomicrmw umax ptr addrspace(1) %ptr, i32 %in seq_cst 5270 ret void 5271} 5272 5273define void @global_atomic_umax_i32_noret_offset(ptr addrspace(1) %out, i32 %in) { 5274; SI-LABEL: global_atomic_umax_i32_noret_offset: 5275; SI: ; %bb.0: 5276; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5277; SI-NEXT: s_mov_b32 s6, 0 5278; SI-NEXT: s_mov_b32 s7, 0xf000 5279; SI-NEXT: s_mov_b32 s4, s6 5280; SI-NEXT: s_mov_b32 s5, s6 5281; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 5282; SI-NEXT: s_mov_b64 s[8:9], 0 5283; SI-NEXT: .LBB98_1: ; %atomicrmw.start 5284; SI-NEXT: ; =>This Inner Loop Header: Depth=1 5285; SI-NEXT: s_waitcnt vmcnt(0) 5286; SI-NEXT: v_max_u32_e32 v3, v4, v2 5287; SI-NEXT: s_waitcnt expcnt(0) 5288; SI-NEXT: v_mov_b32_e32 v6, v4 5289; SI-NEXT: v_mov_b32_e32 v5, v3 5290; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc 5291; SI-NEXT: s_waitcnt vmcnt(0) 5292; SI-NEXT: buffer_wbinvl1 5293; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 5294; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 5295; SI-NEXT: v_mov_b32_e32 v4, v5 5296; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] 5297; SI-NEXT: s_cbranch_execnz .LBB98_1 5298; SI-NEXT: ; %bb.2: ; %atomicrmw.end 5299; SI-NEXT: s_or_b64 exec, exec, s[8:9] 5300; SI-NEXT: s_waitcnt expcnt(0) 5301; SI-NEXT: s_setpc_b64 s[30:31] 5302; 5303; VI-LABEL: global_atomic_umax_i32_noret_offset: 5304; VI: ; %bb.0: 5305; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5306; 
VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 5307; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 5308; VI-NEXT: flat_load_dword v4, v[0:1] 5309; VI-NEXT: s_mov_b64 s[4:5], 0 5310; VI-NEXT: .LBB98_1: ; %atomicrmw.start 5311; VI-NEXT: ; =>This Inner Loop Header: Depth=1 5312; VI-NEXT: s_waitcnt vmcnt(0) 5313; VI-NEXT: v_max_u32_e32 v3, v4, v2 5314; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 5315; VI-NEXT: s_waitcnt vmcnt(0) 5316; VI-NEXT: buffer_wbinvl1_vol 5317; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 5318; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5319; VI-NEXT: v_mov_b32_e32 v4, v3 5320; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] 5321; VI-NEXT: s_cbranch_execnz .LBB98_1 5322; VI-NEXT: ; %bb.2: ; %atomicrmw.end 5323; VI-NEXT: s_or_b64 exec, exec, s[4:5] 5324; VI-NEXT: s_setpc_b64 s[30:31] 5325; 5326; GFX9-LABEL: global_atomic_umax_i32_noret_offset: 5327; GFX9: ; %bb.0: 5328; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5329; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 5330; GFX9-NEXT: s_mov_b64 s[4:5], 0 5331; GFX9-NEXT: .LBB98_1: ; %atomicrmw.start 5332; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 5333; GFX9-NEXT: s_waitcnt vmcnt(0) 5334; GFX9-NEXT: v_max_u32_e32 v3, v4, v2 5335; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc 5336; GFX9-NEXT: s_waitcnt vmcnt(0) 5337; GFX9-NEXT: buffer_wbinvl1_vol 5338; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 5339; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5340; GFX9-NEXT: v_mov_b32_e32 v4, v3 5341; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 5342; GFX9-NEXT: s_cbranch_execnz .LBB98_1 5343; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 5344; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 5345; GFX9-NEXT: s_setpc_b64 s[30:31] 5346 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 5347 %tmp0 = atomicrmw umax ptr addrspace(1) %gep, i32 %in seq_cst 5348 ret void 5349} 5350 5351define i32 @global_atomic_umax_i32_ret(ptr addrspace(1) %ptr, i32 %in) { 5352; SI-LABEL: global_atomic_umax_i32_ret: 5353; SI: ; %bb.0: 5354; 
SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5355; SI-NEXT: s_mov_b32 s6, 0 5356; SI-NEXT: s_mov_b32 s7, 0xf000 5357; SI-NEXT: s_mov_b32 s4, s6 5358; SI-NEXT: s_mov_b32 s5, s6 5359; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 5360; SI-NEXT: s_mov_b64 s[8:9], 0 5361; SI-NEXT: .LBB99_1: ; %atomicrmw.start 5362; SI-NEXT: ; =>This Inner Loop Header: Depth=1 5363; SI-NEXT: s_waitcnt vmcnt(0) 5364; SI-NEXT: v_mov_b32_e32 v5, v3 5365; SI-NEXT: s_waitcnt expcnt(0) 5366; SI-NEXT: v_max_u32_e32 v4, v5, v2 5367; SI-NEXT: v_mov_b32_e32 v3, v4 5368; SI-NEXT: v_mov_b32_e32 v4, v5 5369; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc 5370; SI-NEXT: s_waitcnt vmcnt(0) 5371; SI-NEXT: buffer_wbinvl1 5372; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 5373; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 5374; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] 5375; SI-NEXT: s_cbranch_execnz .LBB99_1 5376; SI-NEXT: ; %bb.2: ; %atomicrmw.end 5377; SI-NEXT: s_or_b64 exec, exec, s[8:9] 5378; SI-NEXT: v_mov_b32_e32 v0, v3 5379; SI-NEXT: s_waitcnt expcnt(0) 5380; SI-NEXT: s_setpc_b64 s[30:31] 5381; 5382; VI-LABEL: global_atomic_umax_i32_ret: 5383; VI: ; %bb.0: 5384; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5385; VI-NEXT: flat_load_dword v3, v[0:1] 5386; VI-NEXT: s_mov_b64 s[4:5], 0 5387; VI-NEXT: .LBB99_1: ; %atomicrmw.start 5388; VI-NEXT: ; =>This Inner Loop Header: Depth=1 5389; VI-NEXT: s_waitcnt vmcnt(0) 5390; VI-NEXT: v_mov_b32_e32 v4, v3 5391; VI-NEXT: v_max_u32_e32 v3, v4, v2 5392; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 5393; VI-NEXT: s_waitcnt vmcnt(0) 5394; VI-NEXT: buffer_wbinvl1_vol 5395; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 5396; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5397; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] 5398; VI-NEXT: s_cbranch_execnz .LBB99_1 5399; VI-NEXT: ; %bb.2: ; %atomicrmw.end 5400; VI-NEXT: s_or_b64 exec, exec, s[4:5] 5401; VI-NEXT: v_mov_b32_e32 v0, v3 5402; VI-NEXT: s_setpc_b64 s[30:31] 5403; 5404; GFX9-LABEL: 
global_atomic_umax_i32_ret: 5405; GFX9: ; %bb.0: 5406; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5407; GFX9-NEXT: global_load_dword v3, v[0:1], off 5408; GFX9-NEXT: s_mov_b64 s[4:5], 0 5409; GFX9-NEXT: .LBB99_1: ; %atomicrmw.start 5410; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 5411; GFX9-NEXT: s_waitcnt vmcnt(0) 5412; GFX9-NEXT: v_mov_b32_e32 v4, v3 5413; GFX9-NEXT: v_max_u32_e32 v3, v4, v2 5414; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc 5415; GFX9-NEXT: s_waitcnt vmcnt(0) 5416; GFX9-NEXT: buffer_wbinvl1_vol 5417; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 5418; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5419; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 5420; GFX9-NEXT: s_cbranch_execnz .LBB99_1 5421; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 5422; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 5423; GFX9-NEXT: v_mov_b32_e32 v0, v3 5424; GFX9-NEXT: s_setpc_b64 s[30:31] 5425 %result = atomicrmw umax ptr addrspace(1) %ptr, i32 %in seq_cst 5426 ret i32 %result 5427} 5428 5429define i32 @global_atomic_umax_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { 5430; SI-LABEL: global_atomic_umax_i32_ret_offset: 5431; SI: ; %bb.0: 5432; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5433; SI-NEXT: s_mov_b32 s6, 0 5434; SI-NEXT: s_mov_b32 s7, 0xf000 5435; SI-NEXT: s_mov_b32 s4, s6 5436; SI-NEXT: s_mov_b32 s5, s6 5437; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 5438; SI-NEXT: s_mov_b64 s[8:9], 0 5439; SI-NEXT: .LBB100_1: ; %atomicrmw.start 5440; SI-NEXT: ; =>This Inner Loop Header: Depth=1 5441; SI-NEXT: s_waitcnt vmcnt(0) 5442; SI-NEXT: v_mov_b32_e32 v5, v3 5443; SI-NEXT: s_waitcnt expcnt(0) 5444; SI-NEXT: v_max_u32_e32 v4, v5, v2 5445; SI-NEXT: v_mov_b32_e32 v3, v4 5446; SI-NEXT: v_mov_b32_e32 v4, v5 5447; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc 5448; SI-NEXT: s_waitcnt vmcnt(0) 5449; SI-NEXT: buffer_wbinvl1 5450; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 5451; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 5452; 
SI-NEXT: s_andn2_b64 exec, exec, s[8:9] 5453; SI-NEXT: s_cbranch_execnz .LBB100_1 5454; SI-NEXT: ; %bb.2: ; %atomicrmw.end 5455; SI-NEXT: s_or_b64 exec, exec, s[8:9] 5456; SI-NEXT: v_mov_b32_e32 v0, v3 5457; SI-NEXT: s_waitcnt expcnt(0) 5458; SI-NEXT: s_setpc_b64 s[30:31] 5459; 5460; VI-LABEL: global_atomic_umax_i32_ret_offset: 5461; VI: ; %bb.0: 5462; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5463; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0 5464; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 5465; VI-NEXT: flat_load_dword v0, v[3:4] 5466; VI-NEXT: s_mov_b64 s[4:5], 0 5467; VI-NEXT: .LBB100_1: ; %atomicrmw.start 5468; VI-NEXT: ; =>This Inner Loop Header: Depth=1 5469; VI-NEXT: s_waitcnt vmcnt(0) 5470; VI-NEXT: v_mov_b32_e32 v1, v0 5471; VI-NEXT: v_max_u32_e32 v0, v1, v2 5472; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 5473; VI-NEXT: s_waitcnt vmcnt(0) 5474; VI-NEXT: buffer_wbinvl1_vol 5475; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 5476; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5477; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] 5478; VI-NEXT: s_cbranch_execnz .LBB100_1 5479; VI-NEXT: ; %bb.2: ; %atomicrmw.end 5480; VI-NEXT: s_or_b64 exec, exec, s[4:5] 5481; VI-NEXT: s_setpc_b64 s[30:31] 5482; 5483; GFX9-LABEL: global_atomic_umax_i32_ret_offset: 5484; GFX9: ; %bb.0: 5485; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5486; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 5487; GFX9-NEXT: s_mov_b64 s[4:5], 0 5488; GFX9-NEXT: .LBB100_1: ; %atomicrmw.start 5489; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 5490; GFX9-NEXT: s_waitcnt vmcnt(0) 5491; GFX9-NEXT: v_mov_b32_e32 v4, v3 5492; GFX9-NEXT: v_max_u32_e32 v3, v4, v2 5493; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc 5494; GFX9-NEXT: s_waitcnt vmcnt(0) 5495; GFX9-NEXT: buffer_wbinvl1_vol 5496; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 5497; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5498; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 5499; GFX9-NEXT: s_cbranch_execnz .LBB100_1 5500; 
GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 5501; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 5502; GFX9-NEXT: v_mov_b32_e32 v0, v3 5503; GFX9-NEXT: s_setpc_b64 s[30:31] 5504 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 5505 %result = atomicrmw umax ptr addrspace(1) %gep, i32 %in seq_cst 5506 ret i32 %result 5507} 5508 5509define amdgpu_gfx void @global_atomic_umax_i32_noret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) { 5510; SI-LABEL: global_atomic_umax_i32_noret_scalar: 5511; SI: ; %bb.0: 5512; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5513; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 5514; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill 5515; SI-NEXT: s_mov_b64 exec, s[34:35] 5516; SI-NEXT: s_waitcnt expcnt(0) 5517; SI-NEXT: v_writelane_b32 v4, s6, 0 5518; SI-NEXT: v_writelane_b32 v4, s7, 1 5519; SI-NEXT: s_mov_b32 s34, s6 5520; SI-NEXT: s_mov_b32 s7, 0xf000 5521; SI-NEXT: s_mov_b32 s6, -1 5522; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 5523; SI-NEXT: s_mov_b64 s[36:37], 0 5524; SI-NEXT: .LBB101_1: ; %atomicrmw.start 5525; SI-NEXT: ; =>This Inner Loop Header: Depth=1 5526; SI-NEXT: s_waitcnt vmcnt(0) 5527; SI-NEXT: v_max_u32_e32 v0, s34, v1 5528; SI-NEXT: s_waitcnt expcnt(0) 5529; SI-NEXT: v_mov_b32_e32 v3, v1 5530; SI-NEXT: v_mov_b32_e32 v2, v0 5531; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc 5532; SI-NEXT: s_waitcnt vmcnt(0) 5533; SI-NEXT: buffer_wbinvl1 5534; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 5535; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] 5536; SI-NEXT: v_mov_b32_e32 v1, v2 5537; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] 5538; SI-NEXT: s_cbranch_execnz .LBB101_1 5539; SI-NEXT: ; %bb.2: ; %atomicrmw.end 5540; SI-NEXT: s_or_b64 exec, exec, s[36:37] 5541; SI-NEXT: v_readlane_b32 s7, v4, 1 5542; SI-NEXT: v_readlane_b32 s6, v4, 0 5543; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 5544; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload 5545; SI-NEXT: s_mov_b64 exec, s[34:35] 5546; SI-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) 5547; SI-NEXT: s_setpc_b64 s[30:31] 5548; 5549; VI-LABEL: global_atomic_umax_i32_noret_scalar: 5550; VI: ; %bb.0: 5551; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5552; VI-NEXT: v_mov_b32_e32 v0, s4 5553; VI-NEXT: v_mov_b32_e32 v1, s5 5554; VI-NEXT: flat_load_dword v3, v[0:1] 5555; VI-NEXT: s_mov_b64 s[34:35], 0 5556; VI-NEXT: .LBB101_1: ; %atomicrmw.start 5557; VI-NEXT: ; =>This Inner Loop Header: Depth=1 5558; VI-NEXT: s_waitcnt vmcnt(0) 5559; VI-NEXT: v_max_u32_e32 v2, s6, v3 5560; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5561; VI-NEXT: s_waitcnt vmcnt(0) 5562; VI-NEXT: buffer_wbinvl1_vol 5563; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 5564; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 5565; VI-NEXT: v_mov_b32_e32 v3, v2 5566; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] 5567; VI-NEXT: s_cbranch_execnz .LBB101_1 5568; VI-NEXT: ; %bb.2: ; %atomicrmw.end 5569; VI-NEXT: s_or_b64 exec, exec, s[34:35] 5570; VI-NEXT: s_setpc_b64 s[30:31] 5571; 5572; GFX9-LABEL: global_atomic_umax_i32_noret_scalar: 5573; GFX9: ; %bb.0: 5574; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5575; GFX9-NEXT: v_mov_b32_e32 v2, 0 5576; GFX9-NEXT: global_load_dword v1, v2, s[4:5] 5577; GFX9-NEXT: s_mov_b64 s[34:35], 0 5578; GFX9-NEXT: .LBB101_1: ; %atomicrmw.start 5579; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 5580; GFX9-NEXT: s_waitcnt vmcnt(0) 5581; GFX9-NEXT: v_max_u32_e32 v0, s6, v1 5582; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] glc 5583; GFX9-NEXT: s_waitcnt vmcnt(0) 5584; GFX9-NEXT: buffer_wbinvl1_vol 5585; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 5586; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 5587; GFX9-NEXT: v_mov_b32_e32 v1, v0 5588; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] 5589; GFX9-NEXT: s_cbranch_execnz .LBB101_1 5590; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 5591; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] 5592; GFX9-NEXT: s_setpc_b64 s[30:31] 5593 %tmp0 = atomicrmw umax ptr addrspace(1) %ptr, i32 %in seq_cst 5594 ret 
void 5595} 5596 5597define amdgpu_gfx void @global_atomic_umax_i32_noret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) { 5598; SI-LABEL: global_atomic_umax_i32_noret_offset_scalar: 5599; SI: ; %bb.0: 5600; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5601; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 5602; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill 5603; SI-NEXT: s_mov_b64 exec, s[34:35] 5604; SI-NEXT: s_waitcnt expcnt(0) 5605; SI-NEXT: v_writelane_b32 v4, s6, 0 5606; SI-NEXT: v_writelane_b32 v4, s7, 1 5607; SI-NEXT: s_mov_b32 s34, s6 5608; SI-NEXT: s_mov_b32 s7, 0xf000 5609; SI-NEXT: s_mov_b32 s6, -1 5610; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16 5611; SI-NEXT: s_mov_b64 s[36:37], 0 5612; SI-NEXT: .LBB102_1: ; %atomicrmw.start 5613; SI-NEXT: ; =>This Inner Loop Header: Depth=1 5614; SI-NEXT: s_waitcnt vmcnt(0) 5615; SI-NEXT: v_max_u32_e32 v0, s34, v1 5616; SI-NEXT: s_waitcnt expcnt(0) 5617; SI-NEXT: v_mov_b32_e32 v3, v1 5618; SI-NEXT: v_mov_b32_e32 v2, v0 5619; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc 5620; SI-NEXT: s_waitcnt vmcnt(0) 5621; SI-NEXT: buffer_wbinvl1 5622; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 5623; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] 5624; SI-NEXT: v_mov_b32_e32 v1, v2 5625; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] 5626; SI-NEXT: s_cbranch_execnz .LBB102_1 5627; SI-NEXT: ; %bb.2: ; %atomicrmw.end 5628; SI-NEXT: s_or_b64 exec, exec, s[36:37] 5629; SI-NEXT: v_readlane_b32 s7, v4, 1 5630; SI-NEXT: v_readlane_b32 s6, v4, 0 5631; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 5632; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload 5633; SI-NEXT: s_mov_b64 exec, s[34:35] 5634; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 5635; SI-NEXT: s_setpc_b64 s[30:31] 5636; 5637; VI-LABEL: global_atomic_umax_i32_noret_offset_scalar: 5638; VI: ; %bb.0: 5639; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5640; VI-NEXT: s_add_u32 s34, s4, 16 5641; VI-NEXT: s_addc_u32 s35, 
s5, 0 5642; VI-NEXT: v_mov_b32_e32 v0, s34 5643; VI-NEXT: v_mov_b32_e32 v1, s35 5644; VI-NEXT: flat_load_dword v3, v[0:1] 5645; VI-NEXT: s_mov_b64 s[34:35], 0 5646; VI-NEXT: .LBB102_1: ; %atomicrmw.start 5647; VI-NEXT: ; =>This Inner Loop Header: Depth=1 5648; VI-NEXT: s_waitcnt vmcnt(0) 5649; VI-NEXT: v_max_u32_e32 v2, s6, v3 5650; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5651; VI-NEXT: s_waitcnt vmcnt(0) 5652; VI-NEXT: buffer_wbinvl1_vol 5653; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 5654; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 5655; VI-NEXT: v_mov_b32_e32 v3, v2 5656; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] 5657; VI-NEXT: s_cbranch_execnz .LBB102_1 5658; VI-NEXT: ; %bb.2: ; %atomicrmw.end 5659; VI-NEXT: s_or_b64 exec, exec, s[34:35] 5660; VI-NEXT: s_setpc_b64 s[30:31] 5661; 5662; GFX9-LABEL: global_atomic_umax_i32_noret_offset_scalar: 5663; GFX9: ; %bb.0: 5664; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5665; GFX9-NEXT: v_mov_b32_e32 v2, 0 5666; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16 5667; GFX9-NEXT: s_mov_b64 s[34:35], 0 5668; GFX9-NEXT: .LBB102_1: ; %atomicrmw.start 5669; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 5670; GFX9-NEXT: s_waitcnt vmcnt(0) 5671; GFX9-NEXT: v_max_u32_e32 v0, s6, v1 5672; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 glc 5673; GFX9-NEXT: s_waitcnt vmcnt(0) 5674; GFX9-NEXT: buffer_wbinvl1_vol 5675; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 5676; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 5677; GFX9-NEXT: v_mov_b32_e32 v1, v0 5678; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] 5679; GFX9-NEXT: s_cbranch_execnz .LBB102_1 5680; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 5681; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] 5682; GFX9-NEXT: s_setpc_b64 s[30:31] 5683 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 5684 %tmp0 = atomicrmw umax ptr addrspace(1) %gep, i32 %in seq_cst 5685 ret void 5686} 5687 5688define amdgpu_gfx i32 @global_atomic_umax_i32_ret_scalar(ptr addrspace(1) inreg 
%ptr, i32 inreg %in) { 5689; SI-LABEL: global_atomic_umax_i32_ret_scalar: 5690; SI: ; %bb.0: 5691; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5692; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 5693; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill 5694; SI-NEXT: s_mov_b64 exec, s[34:35] 5695; SI-NEXT: s_waitcnt expcnt(0) 5696; SI-NEXT: v_writelane_b32 v3, s6, 0 5697; SI-NEXT: v_writelane_b32 v3, s7, 1 5698; SI-NEXT: s_mov_b32 s34, s6 5699; SI-NEXT: s_mov_b32 s7, 0xf000 5700; SI-NEXT: s_mov_b32 s6, -1 5701; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 5702; SI-NEXT: s_mov_b64 s[36:37], 0 5703; SI-NEXT: .LBB103_1: ; %atomicrmw.start 5704; SI-NEXT: ; =>This Inner Loop Header: Depth=1 5705; SI-NEXT: s_waitcnt vmcnt(0) 5706; SI-NEXT: v_mov_b32_e32 v2, v0 5707; SI-NEXT: s_waitcnt expcnt(0) 5708; SI-NEXT: v_max_u32_e32 v1, s34, v2 5709; SI-NEXT: v_mov_b32_e32 v0, v1 5710; SI-NEXT: v_mov_b32_e32 v1, v2 5711; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc 5712; SI-NEXT: s_waitcnt vmcnt(0) 5713; SI-NEXT: buffer_wbinvl1 5714; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 5715; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] 5716; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] 5717; SI-NEXT: s_cbranch_execnz .LBB103_1 5718; SI-NEXT: ; %bb.2: ; %atomicrmw.end 5719; SI-NEXT: s_or_b64 exec, exec, s[36:37] 5720; SI-NEXT: v_readlane_b32 s7, v3, 1 5721; SI-NEXT: v_readlane_b32 s6, v3, 0 5722; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 5723; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload 5724; SI-NEXT: s_mov_b64 exec, s[34:35] 5725; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 5726; SI-NEXT: s_setpc_b64 s[30:31] 5727; 5728; VI-LABEL: global_atomic_umax_i32_ret_scalar: 5729; VI: ; %bb.0: 5730; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5731; VI-NEXT: v_mov_b32_e32 v0, s4 5732; VI-NEXT: v_mov_b32_e32 v1, s5 5733; VI-NEXT: flat_load_dword v0, v[0:1] 5734; VI-NEXT: v_mov_b32_e32 v1, s4 5735; VI-NEXT: s_mov_b64 s[34:35], 0 5736; VI-NEXT: v_mov_b32_e32 
v2, s5 5737; VI-NEXT: .LBB103_1: ; %atomicrmw.start 5738; VI-NEXT: ; =>This Inner Loop Header: Depth=1 5739; VI-NEXT: s_waitcnt vmcnt(0) 5740; VI-NEXT: v_mov_b32_e32 v4, v0 5741; VI-NEXT: v_max_u32_e32 v3, s6, v4 5742; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc 5743; VI-NEXT: s_waitcnt vmcnt(0) 5744; VI-NEXT: buffer_wbinvl1_vol 5745; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 5746; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 5747; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] 5748; VI-NEXT: s_cbranch_execnz .LBB103_1 5749; VI-NEXT: ; %bb.2: ; %atomicrmw.end 5750; VI-NEXT: s_or_b64 exec, exec, s[34:35] 5751; VI-NEXT: s_setpc_b64 s[30:31] 5752; 5753; GFX9-LABEL: global_atomic_umax_i32_ret_scalar: 5754; GFX9: ; %bb.0: 5755; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5756; GFX9-NEXT: v_mov_b32_e32 v1, 0 5757; GFX9-NEXT: global_load_dword v0, v1, s[4:5] 5758; GFX9-NEXT: s_mov_b64 s[34:35], 0 5759; GFX9-NEXT: .LBB103_1: ; %atomicrmw.start 5760; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 5761; GFX9-NEXT: s_waitcnt vmcnt(0) 5762; GFX9-NEXT: v_mov_b32_e32 v3, v0 5763; GFX9-NEXT: v_max_u32_e32 v2, s6, v3 5764; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc 5765; GFX9-NEXT: s_waitcnt vmcnt(0) 5766; GFX9-NEXT: buffer_wbinvl1_vol 5767; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 5768; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 5769; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] 5770; GFX9-NEXT: s_cbranch_execnz .LBB103_1 5771; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 5772; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] 5773; GFX9-NEXT: s_setpc_b64 s[30:31] 5774 %result = atomicrmw umax ptr addrspace(1) %ptr, i32 %in seq_cst 5775 ret i32 %result 5776} 5777 5778define amdgpu_gfx i32 @global_atomic_umax_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) { 5779; SI-LABEL: global_atomic_umax_i32_ret_offset_scalar: 5780; SI: ; %bb.0: 5781; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5782; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 5783; SI-NEXT: 
buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill 5784; SI-NEXT: s_mov_b64 exec, s[34:35] 5785; SI-NEXT: s_waitcnt expcnt(0) 5786; SI-NEXT: v_writelane_b32 v3, s6, 0 5787; SI-NEXT: v_writelane_b32 v3, s7, 1 5788; SI-NEXT: s_mov_b32 s34, s6 5789; SI-NEXT: s_mov_b32 s7, 0xf000 5790; SI-NEXT: s_mov_b32 s6, -1 5791; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16 5792; SI-NEXT: s_mov_b64 s[36:37], 0 5793; SI-NEXT: .LBB104_1: ; %atomicrmw.start 5794; SI-NEXT: ; =>This Inner Loop Header: Depth=1 5795; SI-NEXT: s_waitcnt vmcnt(0) 5796; SI-NEXT: v_mov_b32_e32 v2, v0 5797; SI-NEXT: s_waitcnt expcnt(0) 5798; SI-NEXT: v_max_u32_e32 v1, s34, v2 5799; SI-NEXT: v_mov_b32_e32 v0, v1 5800; SI-NEXT: v_mov_b32_e32 v1, v2 5801; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc 5802; SI-NEXT: s_waitcnt vmcnt(0) 5803; SI-NEXT: buffer_wbinvl1 5804; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 5805; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] 5806; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] 5807; SI-NEXT: s_cbranch_execnz .LBB104_1 5808; SI-NEXT: ; %bb.2: ; %atomicrmw.end 5809; SI-NEXT: s_or_b64 exec, exec, s[36:37] 5810; SI-NEXT: v_readlane_b32 s7, v3, 1 5811; SI-NEXT: v_readlane_b32 s6, v3, 0 5812; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 5813; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload 5814; SI-NEXT: s_mov_b64 exec, s[34:35] 5815; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 5816; SI-NEXT: s_setpc_b64 s[30:31] 5817; 5818; VI-LABEL: global_atomic_umax_i32_ret_offset_scalar: 5819; VI: ; %bb.0: 5820; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5821; VI-NEXT: s_add_u32 s34, s4, 16 5822; VI-NEXT: s_addc_u32 s35, s5, 0 5823; VI-NEXT: v_mov_b32_e32 v1, s34 5824; VI-NEXT: v_mov_b32_e32 v2, s35 5825; VI-NEXT: flat_load_dword v0, v[1:2] 5826; VI-NEXT: s_mov_b64 s[34:35], 0 5827; VI-NEXT: .LBB104_1: ; %atomicrmw.start 5828; VI-NEXT: ; =>This Inner Loop Header: Depth=1 5829; VI-NEXT: s_waitcnt vmcnt(0) 5830; VI-NEXT: v_mov_b32_e32 v4, v0 5831; 
VI-NEXT: v_max_u32_e32 v3, s6, v4 5832; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc 5833; VI-NEXT: s_waitcnt vmcnt(0) 5834; VI-NEXT: buffer_wbinvl1_vol 5835; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 5836; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 5837; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] 5838; VI-NEXT: s_cbranch_execnz .LBB104_1 5839; VI-NEXT: ; %bb.2: ; %atomicrmw.end 5840; VI-NEXT: s_or_b64 exec, exec, s[34:35] 5841; VI-NEXT: s_setpc_b64 s[30:31] 5842; 5843; GFX9-LABEL: global_atomic_umax_i32_ret_offset_scalar: 5844; GFX9: ; %bb.0: 5845; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5846; GFX9-NEXT: v_mov_b32_e32 v1, 0 5847; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16 5848; GFX9-NEXT: s_mov_b64 s[34:35], 0 5849; GFX9-NEXT: .LBB104_1: ; %atomicrmw.start 5850; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 5851; GFX9-NEXT: s_waitcnt vmcnt(0) 5852; GFX9-NEXT: v_mov_b32_e32 v3, v0 5853; GFX9-NEXT: v_max_u32_e32 v2, s6, v3 5854; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] offset:16 glc 5855; GFX9-NEXT: s_waitcnt vmcnt(0) 5856; GFX9-NEXT: buffer_wbinvl1_vol 5857; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 5858; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 5859; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] 5860; GFX9-NEXT: s_cbranch_execnz .LBB104_1 5861; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 5862; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] 5863; GFX9-NEXT: s_setpc_b64 s[30:31] 5864 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 5865 %result = atomicrmw umax ptr addrspace(1) %gep, i32 %in seq_cst 5866 ret i32 %result 5867} 5868 5869define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i32 %index) { 5870; SI-LABEL: atomic_umax_i32_addr64_offset: 5871; SI: ; %bb.0: ; %entry 5872; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 5873; SI-NEXT: s_waitcnt lgkmcnt(0) 5874; SI-NEXT: s_ashr_i32 s5, s3, 31 5875; SI-NEXT: s_mov_b32 s4, s3 5876; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 5877; SI-NEXT: s_add_u32 s4, 
s0, s4 5878; SI-NEXT: s_addc_u32 s5, s1, s5 5879; SI-NEXT: s_load_dword s3, s[4:5], 0x4 5880; SI-NEXT: s_mov_b64 s[0:1], 0 5881; SI-NEXT: s_mov_b32 s7, 0xf000 5882; SI-NEXT: s_waitcnt lgkmcnt(0) 5883; SI-NEXT: v_mov_b32_e32 v1, s3 5884; SI-NEXT: s_mov_b32 s6, -1 5885; SI-NEXT: .LBB105_1: ; %atomicrmw.start 5886; SI-NEXT: ; =>This Inner Loop Header: Depth=1 5887; SI-NEXT: v_max_u32_e32 v0, s2, v1 5888; SI-NEXT: s_waitcnt expcnt(0) 5889; SI-NEXT: v_mov_b32_e32 v3, v1 5890; SI-NEXT: v_mov_b32_e32 v2, v0 5891; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc 5892; SI-NEXT: s_waitcnt vmcnt(0) 5893; SI-NEXT: buffer_wbinvl1 5894; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 5895; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 5896; SI-NEXT: v_mov_b32_e32 v1, v2 5897; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] 5898; SI-NEXT: s_cbranch_execnz .LBB105_1 5899; SI-NEXT: ; %bb.2: ; %atomicrmw.end 5900; SI-NEXT: s_endpgm 5901; 5902; VI-LABEL: atomic_umax_i32_addr64_offset: 5903; VI: ; %bb.0: ; %entry 5904; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 5905; VI-NEXT: s_waitcnt lgkmcnt(0) 5906; VI-NEXT: s_ashr_i32 s5, s3, 31 5907; VI-NEXT: s_mov_b32 s4, s3 5908; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 5909; VI-NEXT: s_add_u32 s4, s0, s4 5910; VI-NEXT: s_addc_u32 s5, s1, s5 5911; VI-NEXT: s_load_dword s3, s[4:5], 0x10 5912; VI-NEXT: s_add_u32 s4, s4, 16 5913; VI-NEXT: s_addc_u32 s5, s5, 0 5914; VI-NEXT: v_mov_b32_e32 v0, s4 5915; VI-NEXT: s_mov_b64 s[0:1], 0 5916; VI-NEXT: s_waitcnt lgkmcnt(0) 5917; VI-NEXT: v_mov_b32_e32 v3, s3 5918; VI-NEXT: v_mov_b32_e32 v1, s5 5919; VI-NEXT: .LBB105_1: ; %atomicrmw.start 5920; VI-NEXT: ; =>This Inner Loop Header: Depth=1 5921; VI-NEXT: v_max_u32_e32 v2, s2, v3 5922; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5923; VI-NEXT: s_waitcnt vmcnt(0) 5924; VI-NEXT: buffer_wbinvl1_vol 5925; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 5926; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 5927; VI-NEXT: v_mov_b32_e32 v3, v2 5928; VI-NEXT: s_andn2_b64 exec, exec, 
s[0:1] 5929; VI-NEXT: s_cbranch_execnz .LBB105_1 5930; VI-NEXT: ; %bb.2: ; %atomicrmw.end 5931; VI-NEXT: s_endpgm 5932; 5933; GFX9-LABEL: atomic_umax_i32_addr64_offset: 5934; GFX9: ; %bb.0: ; %entry 5935; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 5936; GFX9-NEXT: v_mov_b32_e32 v2, 0 5937; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5938; GFX9-NEXT: s_ashr_i32 s5, s3, 31 5939; GFX9-NEXT: s_mov_b32 s4, s3 5940; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 5941; GFX9-NEXT: s_add_u32 s0, s0, s4 5942; GFX9-NEXT: s_addc_u32 s1, s1, s5 5943; GFX9-NEXT: s_load_dword s3, s[0:1], 0x10 5944; GFX9-NEXT: s_mov_b64 s[4:5], 0 5945; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5946; GFX9-NEXT: v_mov_b32_e32 v1, s3 5947; GFX9-NEXT: .LBB105_1: ; %atomicrmw.start 5948; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 5949; GFX9-NEXT: v_max_u32_e32 v0, s2, v1 5950; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc 5951; GFX9-NEXT: s_waitcnt vmcnt(0) 5952; GFX9-NEXT: buffer_wbinvl1_vol 5953; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 5954; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5955; GFX9-NEXT: v_mov_b32_e32 v1, v0 5956; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 5957; GFX9-NEXT: s_cbranch_execnz .LBB105_1 5958; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 5959; GFX9-NEXT: s_endpgm 5960entry: 5961 %ptr = getelementptr i32, ptr addrspace(1) %out, i32 %index 5962 %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4 5963 %tmp0 = atomicrmw umax ptr addrspace(1) %gep, i32 %in seq_cst 5964 ret void 5965} 5966 5967define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %index) { 5968; SI-LABEL: atomic_umax_i32_ret_addr64_offset: 5969; SI: ; %bb.0: ; %entry 5970; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd 5971; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 5972; SI-NEXT: s_waitcnt lgkmcnt(0) 5973; SI-NEXT: s_ashr_i32 s5, s9, 31 5974; SI-NEXT: s_mov_b32 s4, s9 5975; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 5976; SI-NEXT: s_add_u32 s4, s0, s4 5977; 
SI-NEXT: s_addc_u32 s5, s1, s5 5978; SI-NEXT: s_load_dword s6, s[4:5], 0x4 5979; SI-NEXT: s_mov_b64 s[0:1], 0 5980; SI-NEXT: s_mov_b32 s7, 0xf000 5981; SI-NEXT: s_waitcnt lgkmcnt(0) 5982; SI-NEXT: v_mov_b32_e32 v1, s6 5983; SI-NEXT: s_mov_b32 s6, -1 5984; SI-NEXT: .LBB106_1: ; %atomicrmw.start 5985; SI-NEXT: ; =>This Inner Loop Header: Depth=1 5986; SI-NEXT: v_max_u32_e32 v0, s8, v1 5987; SI-NEXT: s_waitcnt expcnt(0) 5988; SI-NEXT: v_mov_b32_e32 v3, v1 5989; SI-NEXT: v_mov_b32_e32 v2, v0 5990; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc 5991; SI-NEXT: s_waitcnt vmcnt(0) 5992; SI-NEXT: buffer_wbinvl1 5993; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 5994; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 5995; SI-NEXT: v_mov_b32_e32 v1, v2 5996; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] 5997; SI-NEXT: s_cbranch_execnz .LBB106_1 5998; SI-NEXT: ; %bb.2: ; %atomicrmw.end 5999; SI-NEXT: s_or_b64 exec, exec, s[0:1] 6000; SI-NEXT: s_mov_b32 s7, 0xf000 6001; SI-NEXT: s_mov_b32 s6, -1 6002; SI-NEXT: s_mov_b32 s4, s2 6003; SI-NEXT: s_mov_b32 s5, s3 6004; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 6005; SI-NEXT: s_endpgm 6006; 6007; VI-LABEL: atomic_umax_i32_ret_addr64_offset: 6008; VI: ; %bb.0: ; %entry 6009; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 6010; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 6011; VI-NEXT: s_waitcnt lgkmcnt(0) 6012; VI-NEXT: s_ashr_i32 s5, s7, 31 6013; VI-NEXT: s_mov_b32 s4, s7 6014; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 6015; VI-NEXT: s_add_u32 s4, s0, s4 6016; VI-NEXT: s_addc_u32 s5, s1, s5 6017; VI-NEXT: s_load_dword s7, s[4:5], 0x10 6018; VI-NEXT: s_add_u32 s4, s4, 16 6019; VI-NEXT: s_addc_u32 s5, s5, 0 6020; VI-NEXT: v_mov_b32_e32 v0, s4 6021; VI-NEXT: s_mov_b64 s[0:1], 0 6022; VI-NEXT: s_waitcnt lgkmcnt(0) 6023; VI-NEXT: v_mov_b32_e32 v2, s7 6024; VI-NEXT: v_mov_b32_e32 v1, s5 6025; VI-NEXT: .LBB106_1: ; %atomicrmw.start 6026; VI-NEXT: ; =>This Inner Loop Header: Depth=1 6027; VI-NEXT: v_mov_b32_e32 v3, v2 6028; VI-NEXT: 
v_max_u32_e32 v2, s6, v3 6029; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 6030; VI-NEXT: s_waitcnt vmcnt(0) 6031; VI-NEXT: buffer_wbinvl1_vol 6032; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 6033; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 6034; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] 6035; VI-NEXT: s_cbranch_execnz .LBB106_1 6036; VI-NEXT: ; %bb.2: ; %atomicrmw.end 6037; VI-NEXT: s_or_b64 exec, exec, s[0:1] 6038; VI-NEXT: v_mov_b32_e32 v0, s2 6039; VI-NEXT: v_mov_b32_e32 v1, s3 6040; VI-NEXT: flat_store_dword v[0:1], v2 6041; VI-NEXT: s_endpgm 6042; 6043; GFX9-LABEL: atomic_umax_i32_ret_addr64_offset: 6044; GFX9: ; %bb.0: ; %entry 6045; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 6046; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 6047; GFX9-NEXT: v_mov_b32_e32 v1, 0 6048; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6049; GFX9-NEXT: s_ashr_i32 s5, s7, 31 6050; GFX9-NEXT: s_mov_b32 s4, s7 6051; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 6052; GFX9-NEXT: s_add_u32 s0, s0, s4 6053; GFX9-NEXT: s_addc_u32 s1, s1, s5 6054; GFX9-NEXT: s_load_dword s7, s[0:1], 0x10 6055; GFX9-NEXT: s_mov_b64 s[4:5], 0 6056; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6057; GFX9-NEXT: v_mov_b32_e32 v0, s7 6058; GFX9-NEXT: .LBB106_1: ; %atomicrmw.start 6059; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 6060; GFX9-NEXT: v_mov_b32_e32 v3, v0 6061; GFX9-NEXT: v_max_u32_e32 v2, s6, v3 6062; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] offset:16 glc 6063; GFX9-NEXT: s_waitcnt vmcnt(0) 6064; GFX9-NEXT: buffer_wbinvl1_vol 6065; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 6066; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6067; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 6068; GFX9-NEXT: s_cbranch_execnz .LBB106_1 6069; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 6070; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 6071; GFX9-NEXT: v_mov_b32_e32 v1, 0 6072; GFX9-NEXT: global_store_dword v1, v0, s[2:3] 6073; GFX9-NEXT: s_endpgm 6074entry: 6075 %ptr = getelementptr i32, ptr addrspace(1) %out, i32 %index 6076 %gep = getelementptr i32, 
ptr addrspace(1) %ptr, i32 4 6077 %tmp0 = atomicrmw umax ptr addrspace(1) %gep, i32 %in seq_cst 6078 store i32 %tmp0, ptr addrspace(1) %out2 6079 ret void 6080} 6081 6082define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %index) { 6083; SI-LABEL: atomic_umax_i32_ret_addr64: 6084; SI: ; %bb.0: ; %entry 6085; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd 6086; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 6087; SI-NEXT: s_waitcnt lgkmcnt(0) 6088; SI-NEXT: s_ashr_i32 s5, s9, 31 6089; SI-NEXT: s_mov_b32 s4, s9 6090; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 6091; SI-NEXT: s_add_u32 s4, s0, s4 6092; SI-NEXT: s_addc_u32 s5, s1, s5 6093; SI-NEXT: s_load_dword s6, s[4:5], 0x0 6094; SI-NEXT: s_mov_b64 s[0:1], 0 6095; SI-NEXT: s_mov_b32 s7, 0xf000 6096; SI-NEXT: s_waitcnt lgkmcnt(0) 6097; SI-NEXT: v_mov_b32_e32 v1, s6 6098; SI-NEXT: s_mov_b32 s6, -1 6099; SI-NEXT: .LBB107_1: ; %atomicrmw.start 6100; SI-NEXT: ; =>This Inner Loop Header: Depth=1 6101; SI-NEXT: v_max_u32_e32 v0, s8, v1 6102; SI-NEXT: s_waitcnt expcnt(0) 6103; SI-NEXT: v_mov_b32_e32 v3, v1 6104; SI-NEXT: v_mov_b32_e32 v2, v0 6105; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc 6106; SI-NEXT: s_waitcnt vmcnt(0) 6107; SI-NEXT: buffer_wbinvl1 6108; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 6109; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 6110; SI-NEXT: v_mov_b32_e32 v1, v2 6111; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] 6112; SI-NEXT: s_cbranch_execnz .LBB107_1 6113; SI-NEXT: ; %bb.2: ; %atomicrmw.end 6114; SI-NEXT: s_or_b64 exec, exec, s[0:1] 6115; SI-NEXT: s_mov_b32 s7, 0xf000 6116; SI-NEXT: s_mov_b32 s6, -1 6117; SI-NEXT: s_mov_b32 s4, s2 6118; SI-NEXT: s_mov_b32 s5, s3 6119; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 6120; SI-NEXT: s_endpgm 6121; 6122; VI-LABEL: atomic_umax_i32_ret_addr64: 6123; VI: ; %bb.0: ; %entry 6124; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 6125; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 6126; VI-NEXT: s_waitcnt lgkmcnt(0) 
6127; VI-NEXT: s_ashr_i32 s5, s7, 31 6128; VI-NEXT: s_mov_b32 s4, s7 6129; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 6130; VI-NEXT: s_add_u32 s4, s0, s4 6131; VI-NEXT: s_addc_u32 s5, s1, s5 6132; VI-NEXT: s_load_dword s7, s[4:5], 0x0 6133; VI-NEXT: v_mov_b32_e32 v0, s4 6134; VI-NEXT: s_mov_b64 s[0:1], 0 6135; VI-NEXT: v_mov_b32_e32 v1, s5 6136; VI-NEXT: s_waitcnt lgkmcnt(0) 6137; VI-NEXT: v_mov_b32_e32 v2, s7 6138; VI-NEXT: .LBB107_1: ; %atomicrmw.start 6139; VI-NEXT: ; =>This Inner Loop Header: Depth=1 6140; VI-NEXT: v_mov_b32_e32 v3, v2 6141; VI-NEXT: v_max_u32_e32 v2, s6, v3 6142; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 6143; VI-NEXT: s_waitcnt vmcnt(0) 6144; VI-NEXT: buffer_wbinvl1_vol 6145; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 6146; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 6147; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] 6148; VI-NEXT: s_cbranch_execnz .LBB107_1 6149; VI-NEXT: ; %bb.2: ; %atomicrmw.end 6150; VI-NEXT: s_or_b64 exec, exec, s[0:1] 6151; VI-NEXT: v_mov_b32_e32 v0, s2 6152; VI-NEXT: v_mov_b32_e32 v1, s3 6153; VI-NEXT: flat_store_dword v[0:1], v2 6154; VI-NEXT: s_endpgm 6155; 6156; GFX9-LABEL: atomic_umax_i32_ret_addr64: 6157; GFX9: ; %bb.0: ; %entry 6158; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 6159; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 6160; GFX9-NEXT: v_mov_b32_e32 v1, 0 6161; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6162; GFX9-NEXT: s_ashr_i32 s5, s7, 31 6163; GFX9-NEXT: s_mov_b32 s4, s7 6164; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 6165; GFX9-NEXT: s_add_u32 s0, s0, s4 6166; GFX9-NEXT: s_addc_u32 s1, s1, s5 6167; GFX9-NEXT: s_load_dword s7, s[0:1], 0x0 6168; GFX9-NEXT: s_mov_b64 s[4:5], 0 6169; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6170; GFX9-NEXT: v_mov_b32_e32 v0, s7 6171; GFX9-NEXT: .LBB107_1: ; %atomicrmw.start 6172; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 6173; GFX9-NEXT: v_mov_b32_e32 v3, v0 6174; GFX9-NEXT: v_max_u32_e32 v2, s6, v3 6175; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc 6176; GFX9-NEXT: s_waitcnt 
vmcnt(0) 6177; GFX9-NEXT: buffer_wbinvl1_vol 6178; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 6179; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6180; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 6181; GFX9-NEXT: s_cbranch_execnz .LBB107_1 6182; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 6183; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 6184; GFX9-NEXT: v_mov_b32_e32 v1, 0 6185; GFX9-NEXT: global_store_dword v1, v0, s[2:3] 6186; GFX9-NEXT: s_endpgm 6187entry: 6188 %ptr = getelementptr i32, ptr addrspace(1) %out, i32 %index 6189 %tmp0 = atomicrmw umax ptr addrspace(1) %ptr, i32 %in seq_cst 6190 store i32 %tmp0, ptr addrspace(1) %out2 6191 ret void 6192} 6193 6194define void @global_atomic_umax_i32_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) { 6195; SI-LABEL: global_atomic_umax_i32_noret_offset__amdgpu_no_remote_memory: 6196; SI: ; %bb.0: 6197; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6198; SI-NEXT: s_mov_b32 s6, 0 6199; SI-NEXT: s_mov_b32 s7, 0xf000 6200; SI-NEXT: s_mov_b32 s4, s6 6201; SI-NEXT: s_mov_b32 s5, s6 6202; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 6203; SI-NEXT: s_mov_b64 s[8:9], 0 6204; SI-NEXT: .LBB108_1: ; %atomicrmw.start 6205; SI-NEXT: ; =>This Inner Loop Header: Depth=1 6206; SI-NEXT: s_waitcnt vmcnt(0) 6207; SI-NEXT: v_max_u32_e32 v3, v4, v2 6208; SI-NEXT: s_waitcnt expcnt(0) 6209; SI-NEXT: v_mov_b32_e32 v6, v4 6210; SI-NEXT: v_mov_b32_e32 v5, v3 6211; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc 6212; SI-NEXT: s_waitcnt vmcnt(0) 6213; SI-NEXT: buffer_wbinvl1 6214; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 6215; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 6216; SI-NEXT: v_mov_b32_e32 v4, v5 6217; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] 6218; SI-NEXT: s_cbranch_execnz .LBB108_1 6219; SI-NEXT: ; %bb.2: ; %atomicrmw.end 6220; SI-NEXT: s_or_b64 exec, exec, s[8:9] 6221; SI-NEXT: s_waitcnt expcnt(0) 6222; SI-NEXT: s_setpc_b64 s[30:31] 6223; 6224; VI-LABEL: 
global_atomic_umax_i32_noret_offset__amdgpu_no_remote_memory: 6225; VI: ; %bb.0: 6226; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6227; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 6228; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 6229; VI-NEXT: flat_load_dword v4, v[0:1] 6230; VI-NEXT: s_mov_b64 s[4:5], 0 6231; VI-NEXT: .LBB108_1: ; %atomicrmw.start 6232; VI-NEXT: ; =>This Inner Loop Header: Depth=1 6233; VI-NEXT: s_waitcnt vmcnt(0) 6234; VI-NEXT: v_max_u32_e32 v3, v4, v2 6235; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 6236; VI-NEXT: s_waitcnt vmcnt(0) 6237; VI-NEXT: buffer_wbinvl1_vol 6238; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 6239; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6240; VI-NEXT: v_mov_b32_e32 v4, v3 6241; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] 6242; VI-NEXT: s_cbranch_execnz .LBB108_1 6243; VI-NEXT: ; %bb.2: ; %atomicrmw.end 6244; VI-NEXT: s_or_b64 exec, exec, s[4:5] 6245; VI-NEXT: s_setpc_b64 s[30:31] 6246; 6247; GFX9-LABEL: global_atomic_umax_i32_noret_offset__amdgpu_no_remote_memory: 6248; GFX9: ; %bb.0: 6249; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6250; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 6251; GFX9-NEXT: s_mov_b64 s[4:5], 0 6252; GFX9-NEXT: .LBB108_1: ; %atomicrmw.start 6253; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 6254; GFX9-NEXT: s_waitcnt vmcnt(0) 6255; GFX9-NEXT: v_max_u32_e32 v3, v4, v2 6256; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc 6257; GFX9-NEXT: s_waitcnt vmcnt(0) 6258; GFX9-NEXT: buffer_wbinvl1_vol 6259; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 6260; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6261; GFX9-NEXT: v_mov_b32_e32 v4, v3 6262; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 6263; GFX9-NEXT: s_cbranch_execnz .LBB108_1 6264; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 6265; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 6266; GFX9-NEXT: s_setpc_b64 s[30:31] 6267 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 6268 %tmp0 = atomicrmw umax ptr addrspace(1) %gep, i32 %in 
seq_cst, !amdgpu.no.remote.memory !0 6269 ret void 6270} 6271 6272define i32 @global_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) { 6273; SI-LABEL: global_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory: 6274; SI: ; %bb.0: 6275; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6276; SI-NEXT: s_mov_b32 s6, 0 6277; SI-NEXT: s_mov_b32 s7, 0xf000 6278; SI-NEXT: s_mov_b32 s4, s6 6279; SI-NEXT: s_mov_b32 s5, s6 6280; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 6281; SI-NEXT: s_mov_b64 s[8:9], 0 6282; SI-NEXT: .LBB109_1: ; %atomicrmw.start 6283; SI-NEXT: ; =>This Inner Loop Header: Depth=1 6284; SI-NEXT: s_waitcnt vmcnt(0) 6285; SI-NEXT: v_mov_b32_e32 v5, v3 6286; SI-NEXT: s_waitcnt expcnt(0) 6287; SI-NEXT: v_max_u32_e32 v4, v5, v2 6288; SI-NEXT: v_mov_b32_e32 v3, v4 6289; SI-NEXT: v_mov_b32_e32 v4, v5 6290; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc 6291; SI-NEXT: s_waitcnt vmcnt(0) 6292; SI-NEXT: buffer_wbinvl1 6293; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 6294; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 6295; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] 6296; SI-NEXT: s_cbranch_execnz .LBB109_1 6297; SI-NEXT: ; %bb.2: ; %atomicrmw.end 6298; SI-NEXT: s_or_b64 exec, exec, s[8:9] 6299; SI-NEXT: v_mov_b32_e32 v0, v3 6300; SI-NEXT: s_waitcnt expcnt(0) 6301; SI-NEXT: s_setpc_b64 s[30:31] 6302; 6303; VI-LABEL: global_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory: 6304; VI: ; %bb.0: 6305; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6306; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0 6307; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 6308; VI-NEXT: flat_load_dword v0, v[3:4] 6309; VI-NEXT: s_mov_b64 s[4:5], 0 6310; VI-NEXT: .LBB109_1: ; %atomicrmw.start 6311; VI-NEXT: ; =>This Inner Loop Header: Depth=1 6312; VI-NEXT: s_waitcnt vmcnt(0) 6313; VI-NEXT: v_mov_b32_e32 v1, v0 6314; VI-NEXT: v_max_u32_e32 v0, v1, v2 6315; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 6316; VI-NEXT: 
s_waitcnt vmcnt(0) 6317; VI-NEXT: buffer_wbinvl1_vol 6318; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 6319; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6320; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] 6321; VI-NEXT: s_cbranch_execnz .LBB109_1 6322; VI-NEXT: ; %bb.2: ; %atomicrmw.end 6323; VI-NEXT: s_or_b64 exec, exec, s[4:5] 6324; VI-NEXT: s_setpc_b64 s[30:31] 6325; 6326; GFX9-LABEL: global_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory: 6327; GFX9: ; %bb.0: 6328; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6329; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 6330; GFX9-NEXT: s_mov_b64 s[4:5], 0 6331; GFX9-NEXT: .LBB109_1: ; %atomicrmw.start 6332; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 6333; GFX9-NEXT: s_waitcnt vmcnt(0) 6334; GFX9-NEXT: v_mov_b32_e32 v4, v3 6335; GFX9-NEXT: v_max_u32_e32 v3, v4, v2 6336; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc 6337; GFX9-NEXT: s_waitcnt vmcnt(0) 6338; GFX9-NEXT: buffer_wbinvl1_vol 6339; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 6340; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6341; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 6342; GFX9-NEXT: s_cbranch_execnz .LBB109_1 6343; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 6344; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 6345; GFX9-NEXT: v_mov_b32_e32 v0, v3 6346; GFX9-NEXT: s_setpc_b64 s[30:31] 6347 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 6348 %result = atomicrmw umax ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 6349 ret i32 %result 6350} 6351 6352; --------------------------------------------------------------------- 6353; atomicrmw umin 6354; --------------------------------------------------------------------- 6355 6356define void @global_atomic_umin_i32_noret(ptr addrspace(1) %ptr, i32 %in) { 6357; SI-LABEL: global_atomic_umin_i32_noret: 6358; SI: ; %bb.0: 6359; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6360; SI-NEXT: s_mov_b32 s6, 0 6361; SI-NEXT: s_mov_b32 s7, 0xf000 6362; SI-NEXT: s_mov_b32 s4, s6 6363; 
SI-NEXT: s_mov_b32 s5, s6 6364; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 6365; SI-NEXT: s_mov_b64 s[8:9], 0 6366; SI-NEXT: .LBB110_1: ; %atomicrmw.start 6367; SI-NEXT: ; =>This Inner Loop Header: Depth=1 6368; SI-NEXT: s_waitcnt vmcnt(0) 6369; SI-NEXT: v_min_u32_e32 v3, v4, v2 6370; SI-NEXT: s_waitcnt expcnt(0) 6371; SI-NEXT: v_mov_b32_e32 v6, v4 6372; SI-NEXT: v_mov_b32_e32 v5, v3 6373; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc 6374; SI-NEXT: s_waitcnt vmcnt(0) 6375; SI-NEXT: buffer_wbinvl1 6376; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 6377; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 6378; SI-NEXT: v_mov_b32_e32 v4, v5 6379; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] 6380; SI-NEXT: s_cbranch_execnz .LBB110_1 6381; SI-NEXT: ; %bb.2: ; %atomicrmw.end 6382; SI-NEXT: s_or_b64 exec, exec, s[8:9] 6383; SI-NEXT: s_waitcnt expcnt(0) 6384; SI-NEXT: s_setpc_b64 s[30:31] 6385; 6386; VI-LABEL: global_atomic_umin_i32_noret: 6387; VI: ; %bb.0: 6388; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6389; VI-NEXT: flat_load_dword v4, v[0:1] 6390; VI-NEXT: s_mov_b64 s[4:5], 0 6391; VI-NEXT: .LBB110_1: ; %atomicrmw.start 6392; VI-NEXT: ; =>This Inner Loop Header: Depth=1 6393; VI-NEXT: s_waitcnt vmcnt(0) 6394; VI-NEXT: v_min_u32_e32 v3, v4, v2 6395; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 6396; VI-NEXT: s_waitcnt vmcnt(0) 6397; VI-NEXT: buffer_wbinvl1_vol 6398; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 6399; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6400; VI-NEXT: v_mov_b32_e32 v4, v3 6401; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] 6402; VI-NEXT: s_cbranch_execnz .LBB110_1 6403; VI-NEXT: ; %bb.2: ; %atomicrmw.end 6404; VI-NEXT: s_or_b64 exec, exec, s[4:5] 6405; VI-NEXT: s_setpc_b64 s[30:31] 6406; 6407; GFX9-LABEL: global_atomic_umin_i32_noret: 6408; GFX9: ; %bb.0: 6409; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6410; GFX9-NEXT: global_load_dword v4, v[0:1], off 6411; GFX9-NEXT: s_mov_b64 s[4:5], 0 6412; GFX9-NEXT: .LBB110_1: ; 
%atomicrmw.start 6413; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 6414; GFX9-NEXT: s_waitcnt vmcnt(0) 6415; GFX9-NEXT: v_min_u32_e32 v3, v4, v2 6416; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc 6417; GFX9-NEXT: s_waitcnt vmcnt(0) 6418; GFX9-NEXT: buffer_wbinvl1_vol 6419; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 6420; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6421; GFX9-NEXT: v_mov_b32_e32 v4, v3 6422; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 6423; GFX9-NEXT: s_cbranch_execnz .LBB110_1 6424; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 6425; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 6426; GFX9-NEXT: s_setpc_b64 s[30:31] 6427 %tmp0 = atomicrmw umin ptr addrspace(1) %ptr, i32 %in seq_cst 6428 ret void 6429} 6430 6431define void @global_atomic_umin_i32_noret_offset(ptr addrspace(1) %out, i32 %in) { 6432; SI-LABEL: global_atomic_umin_i32_noret_offset: 6433; SI: ; %bb.0: 6434; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6435; SI-NEXT: s_mov_b32 s6, 0 6436; SI-NEXT: s_mov_b32 s7, 0xf000 6437; SI-NEXT: s_mov_b32 s4, s6 6438; SI-NEXT: s_mov_b32 s5, s6 6439; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 6440; SI-NEXT: s_mov_b64 s[8:9], 0 6441; SI-NEXT: .LBB111_1: ; %atomicrmw.start 6442; SI-NEXT: ; =>This Inner Loop Header: Depth=1 6443; SI-NEXT: s_waitcnt vmcnt(0) 6444; SI-NEXT: v_min_u32_e32 v3, v4, v2 6445; SI-NEXT: s_waitcnt expcnt(0) 6446; SI-NEXT: v_mov_b32_e32 v6, v4 6447; SI-NEXT: v_mov_b32_e32 v5, v3 6448; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc 6449; SI-NEXT: s_waitcnt vmcnt(0) 6450; SI-NEXT: buffer_wbinvl1 6451; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 6452; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 6453; SI-NEXT: v_mov_b32_e32 v4, v5 6454; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] 6455; SI-NEXT: s_cbranch_execnz .LBB111_1 6456; SI-NEXT: ; %bb.2: ; %atomicrmw.end 6457; SI-NEXT: s_or_b64 exec, exec, s[8:9] 6458; SI-NEXT: s_waitcnt expcnt(0) 6459; SI-NEXT: s_setpc_b64 s[30:31] 6460; 6461; VI-LABEL: 
global_atomic_umin_i32_noret_offset: 6462; VI: ; %bb.0: 6463; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6464; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 6465; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 6466; VI-NEXT: flat_load_dword v4, v[0:1] 6467; VI-NEXT: s_mov_b64 s[4:5], 0 6468; VI-NEXT: .LBB111_1: ; %atomicrmw.start 6469; VI-NEXT: ; =>This Inner Loop Header: Depth=1 6470; VI-NEXT: s_waitcnt vmcnt(0) 6471; VI-NEXT: v_min_u32_e32 v3, v4, v2 6472; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 6473; VI-NEXT: s_waitcnt vmcnt(0) 6474; VI-NEXT: buffer_wbinvl1_vol 6475; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 6476; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6477; VI-NEXT: v_mov_b32_e32 v4, v3 6478; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] 6479; VI-NEXT: s_cbranch_execnz .LBB111_1 6480; VI-NEXT: ; %bb.2: ; %atomicrmw.end 6481; VI-NEXT: s_or_b64 exec, exec, s[4:5] 6482; VI-NEXT: s_setpc_b64 s[30:31] 6483; 6484; GFX9-LABEL: global_atomic_umin_i32_noret_offset: 6485; GFX9: ; %bb.0: 6486; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6487; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 6488; GFX9-NEXT: s_mov_b64 s[4:5], 0 6489; GFX9-NEXT: .LBB111_1: ; %atomicrmw.start 6490; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 6491; GFX9-NEXT: s_waitcnt vmcnt(0) 6492; GFX9-NEXT: v_min_u32_e32 v3, v4, v2 6493; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc 6494; GFX9-NEXT: s_waitcnt vmcnt(0) 6495; GFX9-NEXT: buffer_wbinvl1_vol 6496; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 6497; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6498; GFX9-NEXT: v_mov_b32_e32 v4, v3 6499; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 6500; GFX9-NEXT: s_cbranch_execnz .LBB111_1 6501; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 6502; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 6503; GFX9-NEXT: s_setpc_b64 s[30:31] 6504 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 6505 %tmp0 = atomicrmw umin ptr addrspace(1) %gep, i32 %in seq_cst 6506 ret void 6507} 6508 6509define i32 
@global_atomic_umin_i32_ret(ptr addrspace(1) %ptr, i32 %in) { 6510; SI-LABEL: global_atomic_umin_i32_ret: 6511; SI: ; %bb.0: 6512; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6513; SI-NEXT: s_mov_b32 s6, 0 6514; SI-NEXT: s_mov_b32 s7, 0xf000 6515; SI-NEXT: s_mov_b32 s4, s6 6516; SI-NEXT: s_mov_b32 s5, s6 6517; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 6518; SI-NEXT: s_mov_b64 s[8:9], 0 6519; SI-NEXT: .LBB112_1: ; %atomicrmw.start 6520; SI-NEXT: ; =>This Inner Loop Header: Depth=1 6521; SI-NEXT: s_waitcnt vmcnt(0) 6522; SI-NEXT: v_mov_b32_e32 v5, v3 6523; SI-NEXT: s_waitcnt expcnt(0) 6524; SI-NEXT: v_min_u32_e32 v4, v5, v2 6525; SI-NEXT: v_mov_b32_e32 v3, v4 6526; SI-NEXT: v_mov_b32_e32 v4, v5 6527; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc 6528; SI-NEXT: s_waitcnt vmcnt(0) 6529; SI-NEXT: buffer_wbinvl1 6530; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 6531; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 6532; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] 6533; SI-NEXT: s_cbranch_execnz .LBB112_1 6534; SI-NEXT: ; %bb.2: ; %atomicrmw.end 6535; SI-NEXT: s_or_b64 exec, exec, s[8:9] 6536; SI-NEXT: v_mov_b32_e32 v0, v3 6537; SI-NEXT: s_waitcnt expcnt(0) 6538; SI-NEXT: s_setpc_b64 s[30:31] 6539; 6540; VI-LABEL: global_atomic_umin_i32_ret: 6541; VI: ; %bb.0: 6542; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6543; VI-NEXT: flat_load_dword v3, v[0:1] 6544; VI-NEXT: s_mov_b64 s[4:5], 0 6545; VI-NEXT: .LBB112_1: ; %atomicrmw.start 6546; VI-NEXT: ; =>This Inner Loop Header: Depth=1 6547; VI-NEXT: s_waitcnt vmcnt(0) 6548; VI-NEXT: v_mov_b32_e32 v4, v3 6549; VI-NEXT: v_min_u32_e32 v3, v4, v2 6550; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 6551; VI-NEXT: s_waitcnt vmcnt(0) 6552; VI-NEXT: buffer_wbinvl1_vol 6553; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 6554; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6555; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] 6556; VI-NEXT: s_cbranch_execnz .LBB112_1 6557; VI-NEXT: ; %bb.2: ; %atomicrmw.end 6558; VI-NEXT: s_or_b64 
exec, exec, s[4:5] 6559; VI-NEXT: v_mov_b32_e32 v0, v3 6560; VI-NEXT: s_setpc_b64 s[30:31] 6561; 6562; GFX9-LABEL: global_atomic_umin_i32_ret: 6563; GFX9: ; %bb.0: 6564; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6565; GFX9-NEXT: global_load_dword v3, v[0:1], off 6566; GFX9-NEXT: s_mov_b64 s[4:5], 0 6567; GFX9-NEXT: .LBB112_1: ; %atomicrmw.start 6568; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 6569; GFX9-NEXT: s_waitcnt vmcnt(0) 6570; GFX9-NEXT: v_mov_b32_e32 v4, v3 6571; GFX9-NEXT: v_min_u32_e32 v3, v4, v2 6572; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc 6573; GFX9-NEXT: s_waitcnt vmcnt(0) 6574; GFX9-NEXT: buffer_wbinvl1_vol 6575; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 6576; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6577; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 6578; GFX9-NEXT: s_cbranch_execnz .LBB112_1 6579; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 6580; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 6581; GFX9-NEXT: v_mov_b32_e32 v0, v3 6582; GFX9-NEXT: s_setpc_b64 s[30:31] 6583 %result = atomicrmw umin ptr addrspace(1) %ptr, i32 %in seq_cst 6584 ret i32 %result 6585} 6586 6587define i32 @global_atomic_umin_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { 6588; SI-LABEL: global_atomic_umin_i32_ret_offset: 6589; SI: ; %bb.0: 6590; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6591; SI-NEXT: s_mov_b32 s6, 0 6592; SI-NEXT: s_mov_b32 s7, 0xf000 6593; SI-NEXT: s_mov_b32 s4, s6 6594; SI-NEXT: s_mov_b32 s5, s6 6595; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 6596; SI-NEXT: s_mov_b64 s[8:9], 0 6597; SI-NEXT: .LBB113_1: ; %atomicrmw.start 6598; SI-NEXT: ; =>This Inner Loop Header: Depth=1 6599; SI-NEXT: s_waitcnt vmcnt(0) 6600; SI-NEXT: v_mov_b32_e32 v5, v3 6601; SI-NEXT: s_waitcnt expcnt(0) 6602; SI-NEXT: v_min_u32_e32 v4, v5, v2 6603; SI-NEXT: v_mov_b32_e32 v3, v4 6604; SI-NEXT: v_mov_b32_e32 v4, v5 6605; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc 6606; SI-NEXT: s_waitcnt vmcnt(0) 6607; 
SI-NEXT: buffer_wbinvl1 6608; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 6609; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 6610; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] 6611; SI-NEXT: s_cbranch_execnz .LBB113_1 6612; SI-NEXT: ; %bb.2: ; %atomicrmw.end 6613; SI-NEXT: s_or_b64 exec, exec, s[8:9] 6614; SI-NEXT: v_mov_b32_e32 v0, v3 6615; SI-NEXT: s_waitcnt expcnt(0) 6616; SI-NEXT: s_setpc_b64 s[30:31] 6617; 6618; VI-LABEL: global_atomic_umin_i32_ret_offset: 6619; VI: ; %bb.0: 6620; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6621; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0 6622; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 6623; VI-NEXT: flat_load_dword v0, v[3:4] 6624; VI-NEXT: s_mov_b64 s[4:5], 0 6625; VI-NEXT: .LBB113_1: ; %atomicrmw.start 6626; VI-NEXT: ; =>This Inner Loop Header: Depth=1 6627; VI-NEXT: s_waitcnt vmcnt(0) 6628; VI-NEXT: v_mov_b32_e32 v1, v0 6629; VI-NEXT: v_min_u32_e32 v0, v1, v2 6630; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 6631; VI-NEXT: s_waitcnt vmcnt(0) 6632; VI-NEXT: buffer_wbinvl1_vol 6633; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 6634; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6635; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] 6636; VI-NEXT: s_cbranch_execnz .LBB113_1 6637; VI-NEXT: ; %bb.2: ; %atomicrmw.end 6638; VI-NEXT: s_or_b64 exec, exec, s[4:5] 6639; VI-NEXT: s_setpc_b64 s[30:31] 6640; 6641; GFX9-LABEL: global_atomic_umin_i32_ret_offset: 6642; GFX9: ; %bb.0: 6643; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6644; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 6645; GFX9-NEXT: s_mov_b64 s[4:5], 0 6646; GFX9-NEXT: .LBB113_1: ; %atomicrmw.start 6647; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 6648; GFX9-NEXT: s_waitcnt vmcnt(0) 6649; GFX9-NEXT: v_mov_b32_e32 v4, v3 6650; GFX9-NEXT: v_min_u32_e32 v3, v4, v2 6651; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc 6652; GFX9-NEXT: s_waitcnt vmcnt(0) 6653; GFX9-NEXT: buffer_wbinvl1_vol 6654; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 6655; GFX9-NEXT: s_or_b64 
s[4:5], vcc, s[4:5] 6656; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 6657; GFX9-NEXT: s_cbranch_execnz .LBB113_1 6658; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 6659; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 6660; GFX9-NEXT: v_mov_b32_e32 v0, v3 6661; GFX9-NEXT: s_setpc_b64 s[30:31] 6662 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 6663 %result = atomicrmw umin ptr addrspace(1) %gep, i32 %in seq_cst 6664 ret i32 %result 6665} 6666 6667define amdgpu_gfx void @global_atomic_umin_i32_noret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) { 6668; SI-LABEL: global_atomic_umin_i32_noret_scalar: 6669; SI: ; %bb.0: 6670; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6671; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 6672; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill 6673; SI-NEXT: s_mov_b64 exec, s[34:35] 6674; SI-NEXT: s_waitcnt expcnt(0) 6675; SI-NEXT: v_writelane_b32 v4, s6, 0 6676; SI-NEXT: v_writelane_b32 v4, s7, 1 6677; SI-NEXT: s_mov_b32 s34, s6 6678; SI-NEXT: s_mov_b32 s7, 0xf000 6679; SI-NEXT: s_mov_b32 s6, -1 6680; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 6681; SI-NEXT: s_mov_b64 s[36:37], 0 6682; SI-NEXT: .LBB114_1: ; %atomicrmw.start 6683; SI-NEXT: ; =>This Inner Loop Header: Depth=1 6684; SI-NEXT: s_waitcnt vmcnt(0) 6685; SI-NEXT: v_min_u32_e32 v0, s34, v1 6686; SI-NEXT: s_waitcnt expcnt(0) 6687; SI-NEXT: v_mov_b32_e32 v3, v1 6688; SI-NEXT: v_mov_b32_e32 v2, v0 6689; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc 6690; SI-NEXT: s_waitcnt vmcnt(0) 6691; SI-NEXT: buffer_wbinvl1 6692; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 6693; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] 6694; SI-NEXT: v_mov_b32_e32 v1, v2 6695; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] 6696; SI-NEXT: s_cbranch_execnz .LBB114_1 6697; SI-NEXT: ; %bb.2: ; %atomicrmw.end 6698; SI-NEXT: s_or_b64 exec, exec, s[36:37] 6699; SI-NEXT: v_readlane_b32 s7, v4, 1 6700; SI-NEXT: v_readlane_b32 s6, v4, 0 6701; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 6702; SI-NEXT: 
buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload 6703; SI-NEXT: s_mov_b64 exec, s[34:35] 6704; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 6705; SI-NEXT: s_setpc_b64 s[30:31] 6706; 6707; VI-LABEL: global_atomic_umin_i32_noret_scalar: 6708; VI: ; %bb.0: 6709; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6710; VI-NEXT: v_mov_b32_e32 v0, s4 6711; VI-NEXT: v_mov_b32_e32 v1, s5 6712; VI-NEXT: flat_load_dword v3, v[0:1] 6713; VI-NEXT: s_mov_b64 s[34:35], 0 6714; VI-NEXT: .LBB114_1: ; %atomicrmw.start 6715; VI-NEXT: ; =>This Inner Loop Header: Depth=1 6716; VI-NEXT: s_waitcnt vmcnt(0) 6717; VI-NEXT: v_min_u32_e32 v2, s6, v3 6718; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 6719; VI-NEXT: s_waitcnt vmcnt(0) 6720; VI-NEXT: buffer_wbinvl1_vol 6721; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 6722; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 6723; VI-NEXT: v_mov_b32_e32 v3, v2 6724; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] 6725; VI-NEXT: s_cbranch_execnz .LBB114_1 6726; VI-NEXT: ; %bb.2: ; %atomicrmw.end 6727; VI-NEXT: s_or_b64 exec, exec, s[34:35] 6728; VI-NEXT: s_setpc_b64 s[30:31] 6729; 6730; GFX9-LABEL: global_atomic_umin_i32_noret_scalar: 6731; GFX9: ; %bb.0: 6732; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6733; GFX9-NEXT: v_mov_b32_e32 v2, 0 6734; GFX9-NEXT: global_load_dword v1, v2, s[4:5] 6735; GFX9-NEXT: s_mov_b64 s[34:35], 0 6736; GFX9-NEXT: .LBB114_1: ; %atomicrmw.start 6737; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 6738; GFX9-NEXT: s_waitcnt vmcnt(0) 6739; GFX9-NEXT: v_min_u32_e32 v0, s6, v1 6740; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] glc 6741; GFX9-NEXT: s_waitcnt vmcnt(0) 6742; GFX9-NEXT: buffer_wbinvl1_vol 6743; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 6744; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 6745; GFX9-NEXT: v_mov_b32_e32 v1, v0 6746; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] 6747; GFX9-NEXT: s_cbranch_execnz .LBB114_1 6748; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 6749; GFX9-NEXT: s_or_b64 exec, exec, 
s[34:35] 6750; GFX9-NEXT: s_setpc_b64 s[30:31] 6751 %tmp0 = atomicrmw umin ptr addrspace(1) %ptr, i32 %in seq_cst 6752 ret void 6753} 6754 6755define amdgpu_gfx void @global_atomic_umin_i32_noret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) { 6756; SI-LABEL: global_atomic_umin_i32_noret_offset_scalar: 6757; SI: ; %bb.0: 6758; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6759; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 6760; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill 6761; SI-NEXT: s_mov_b64 exec, s[34:35] 6762; SI-NEXT: s_waitcnt expcnt(0) 6763; SI-NEXT: v_writelane_b32 v4, s6, 0 6764; SI-NEXT: v_writelane_b32 v4, s7, 1 6765; SI-NEXT: s_mov_b32 s34, s6 6766; SI-NEXT: s_mov_b32 s7, 0xf000 6767; SI-NEXT: s_mov_b32 s6, -1 6768; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16 6769; SI-NEXT: s_mov_b64 s[36:37], 0 6770; SI-NEXT: .LBB115_1: ; %atomicrmw.start 6771; SI-NEXT: ; =>This Inner Loop Header: Depth=1 6772; SI-NEXT: s_waitcnt vmcnt(0) 6773; SI-NEXT: v_min_u32_e32 v0, s34, v1 6774; SI-NEXT: s_waitcnt expcnt(0) 6775; SI-NEXT: v_mov_b32_e32 v3, v1 6776; SI-NEXT: v_mov_b32_e32 v2, v0 6777; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc 6778; SI-NEXT: s_waitcnt vmcnt(0) 6779; SI-NEXT: buffer_wbinvl1 6780; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 6781; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] 6782; SI-NEXT: v_mov_b32_e32 v1, v2 6783; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] 6784; SI-NEXT: s_cbranch_execnz .LBB115_1 6785; SI-NEXT: ; %bb.2: ; %atomicrmw.end 6786; SI-NEXT: s_or_b64 exec, exec, s[36:37] 6787; SI-NEXT: v_readlane_b32 s7, v4, 1 6788; SI-NEXT: v_readlane_b32 s6, v4, 0 6789; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 6790; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload 6791; SI-NEXT: s_mov_b64 exec, s[34:35] 6792; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 6793; SI-NEXT: s_setpc_b64 s[30:31] 6794; 6795; VI-LABEL: global_atomic_umin_i32_noret_offset_scalar: 6796; VI: ; %bb.0: 
6797; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6798; VI-NEXT: s_add_u32 s34, s4, 16 6799; VI-NEXT: s_addc_u32 s35, s5, 0 6800; VI-NEXT: v_mov_b32_e32 v0, s34 6801; VI-NEXT: v_mov_b32_e32 v1, s35 6802; VI-NEXT: flat_load_dword v3, v[0:1] 6803; VI-NEXT: s_mov_b64 s[34:35], 0 6804; VI-NEXT: .LBB115_1: ; %atomicrmw.start 6805; VI-NEXT: ; =>This Inner Loop Header: Depth=1 6806; VI-NEXT: s_waitcnt vmcnt(0) 6807; VI-NEXT: v_min_u32_e32 v2, s6, v3 6808; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 6809; VI-NEXT: s_waitcnt vmcnt(0) 6810; VI-NEXT: buffer_wbinvl1_vol 6811; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 6812; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 6813; VI-NEXT: v_mov_b32_e32 v3, v2 6814; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] 6815; VI-NEXT: s_cbranch_execnz .LBB115_1 6816; VI-NEXT: ; %bb.2: ; %atomicrmw.end 6817; VI-NEXT: s_or_b64 exec, exec, s[34:35] 6818; VI-NEXT: s_setpc_b64 s[30:31] 6819; 6820; GFX9-LABEL: global_atomic_umin_i32_noret_offset_scalar: 6821; GFX9: ; %bb.0: 6822; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6823; GFX9-NEXT: v_mov_b32_e32 v2, 0 6824; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16 6825; GFX9-NEXT: s_mov_b64 s[34:35], 0 6826; GFX9-NEXT: .LBB115_1: ; %atomicrmw.start 6827; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 6828; GFX9-NEXT: s_waitcnt vmcnt(0) 6829; GFX9-NEXT: v_min_u32_e32 v0, s6, v1 6830; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 glc 6831; GFX9-NEXT: s_waitcnt vmcnt(0) 6832; GFX9-NEXT: buffer_wbinvl1_vol 6833; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 6834; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 6835; GFX9-NEXT: v_mov_b32_e32 v1, v0 6836; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] 6837; GFX9-NEXT: s_cbranch_execnz .LBB115_1 6838; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 6839; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] 6840; GFX9-NEXT: s_setpc_b64 s[30:31] 6841 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 6842 %tmp0 = atomicrmw umin ptr addrspace(1) %gep, i32 
%in seq_cst 6843 ret void 6844} 6845 6846define amdgpu_gfx i32 @global_atomic_umin_i32_ret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) { 6847; SI-LABEL: global_atomic_umin_i32_ret_scalar: 6848; SI: ; %bb.0: 6849; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6850; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 6851; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill 6852; SI-NEXT: s_mov_b64 exec, s[34:35] 6853; SI-NEXT: s_waitcnt expcnt(0) 6854; SI-NEXT: v_writelane_b32 v3, s6, 0 6855; SI-NEXT: v_writelane_b32 v3, s7, 1 6856; SI-NEXT: s_mov_b32 s34, s6 6857; SI-NEXT: s_mov_b32 s7, 0xf000 6858; SI-NEXT: s_mov_b32 s6, -1 6859; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 6860; SI-NEXT: s_mov_b64 s[36:37], 0 6861; SI-NEXT: .LBB116_1: ; %atomicrmw.start 6862; SI-NEXT: ; =>This Inner Loop Header: Depth=1 6863; SI-NEXT: s_waitcnt vmcnt(0) 6864; SI-NEXT: v_mov_b32_e32 v2, v0 6865; SI-NEXT: s_waitcnt expcnt(0) 6866; SI-NEXT: v_min_u32_e32 v1, s34, v2 6867; SI-NEXT: v_mov_b32_e32 v0, v1 6868; SI-NEXT: v_mov_b32_e32 v1, v2 6869; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc 6870; SI-NEXT: s_waitcnt vmcnt(0) 6871; SI-NEXT: buffer_wbinvl1 6872; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 6873; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] 6874; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] 6875; SI-NEXT: s_cbranch_execnz .LBB116_1 6876; SI-NEXT: ; %bb.2: ; %atomicrmw.end 6877; SI-NEXT: s_or_b64 exec, exec, s[36:37] 6878; SI-NEXT: v_readlane_b32 s7, v3, 1 6879; SI-NEXT: v_readlane_b32 s6, v3, 0 6880; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 6881; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload 6882; SI-NEXT: s_mov_b64 exec, s[34:35] 6883; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 6884; SI-NEXT: s_setpc_b64 s[30:31] 6885; 6886; VI-LABEL: global_atomic_umin_i32_ret_scalar: 6887; VI: ; %bb.0: 6888; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6889; VI-NEXT: v_mov_b32_e32 v0, s4 6890; VI-NEXT: v_mov_b32_e32 v1, s5 6891; VI-NEXT: 
flat_load_dword v0, v[0:1] 6892; VI-NEXT: v_mov_b32_e32 v1, s4 6893; VI-NEXT: s_mov_b64 s[34:35], 0 6894; VI-NEXT: v_mov_b32_e32 v2, s5 6895; VI-NEXT: .LBB116_1: ; %atomicrmw.start 6896; VI-NEXT: ; =>This Inner Loop Header: Depth=1 6897; VI-NEXT: s_waitcnt vmcnt(0) 6898; VI-NEXT: v_mov_b32_e32 v4, v0 6899; VI-NEXT: v_min_u32_e32 v3, s6, v4 6900; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc 6901; VI-NEXT: s_waitcnt vmcnt(0) 6902; VI-NEXT: buffer_wbinvl1_vol 6903; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 6904; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 6905; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] 6906; VI-NEXT: s_cbranch_execnz .LBB116_1 6907; VI-NEXT: ; %bb.2: ; %atomicrmw.end 6908; VI-NEXT: s_or_b64 exec, exec, s[34:35] 6909; VI-NEXT: s_setpc_b64 s[30:31] 6910; 6911; GFX9-LABEL: global_atomic_umin_i32_ret_scalar: 6912; GFX9: ; %bb.0: 6913; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6914; GFX9-NEXT: v_mov_b32_e32 v1, 0 6915; GFX9-NEXT: global_load_dword v0, v1, s[4:5] 6916; GFX9-NEXT: s_mov_b64 s[34:35], 0 6917; GFX9-NEXT: .LBB116_1: ; %atomicrmw.start 6918; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 6919; GFX9-NEXT: s_waitcnt vmcnt(0) 6920; GFX9-NEXT: v_mov_b32_e32 v3, v0 6921; GFX9-NEXT: v_min_u32_e32 v2, s6, v3 6922; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc 6923; GFX9-NEXT: s_waitcnt vmcnt(0) 6924; GFX9-NEXT: buffer_wbinvl1_vol 6925; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 6926; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 6927; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] 6928; GFX9-NEXT: s_cbranch_execnz .LBB116_1 6929; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 6930; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] 6931; GFX9-NEXT: s_setpc_b64 s[30:31] 6932 %result = atomicrmw umin ptr addrspace(1) %ptr, i32 %in seq_cst 6933 ret i32 %result 6934} 6935 6936define amdgpu_gfx i32 @global_atomic_umin_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) { 6937; SI-LABEL: global_atomic_umin_i32_ret_offset_scalar: 6938; SI: ; %bb.0: 
6939; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6940; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 6941; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill 6942; SI-NEXT: s_mov_b64 exec, s[34:35] 6943; SI-NEXT: s_waitcnt expcnt(0) 6944; SI-NEXT: v_writelane_b32 v3, s6, 0 6945; SI-NEXT: v_writelane_b32 v3, s7, 1 6946; SI-NEXT: s_mov_b32 s34, s6 6947; SI-NEXT: s_mov_b32 s7, 0xf000 6948; SI-NEXT: s_mov_b32 s6, -1 6949; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16 6950; SI-NEXT: s_mov_b64 s[36:37], 0 6951; SI-NEXT: .LBB117_1: ; %atomicrmw.start 6952; SI-NEXT: ; =>This Inner Loop Header: Depth=1 6953; SI-NEXT: s_waitcnt vmcnt(0) 6954; SI-NEXT: v_mov_b32_e32 v2, v0 6955; SI-NEXT: s_waitcnt expcnt(0) 6956; SI-NEXT: v_min_u32_e32 v1, s34, v2 6957; SI-NEXT: v_mov_b32_e32 v0, v1 6958; SI-NEXT: v_mov_b32_e32 v1, v2 6959; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc 6960; SI-NEXT: s_waitcnt vmcnt(0) 6961; SI-NEXT: buffer_wbinvl1 6962; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 6963; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] 6964; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] 6965; SI-NEXT: s_cbranch_execnz .LBB117_1 6966; SI-NEXT: ; %bb.2: ; %atomicrmw.end 6967; SI-NEXT: s_or_b64 exec, exec, s[36:37] 6968; SI-NEXT: v_readlane_b32 s7, v3, 1 6969; SI-NEXT: v_readlane_b32 s6, v3, 0 6970; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 6971; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload 6972; SI-NEXT: s_mov_b64 exec, s[34:35] 6973; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 6974; SI-NEXT: s_setpc_b64 s[30:31] 6975; 6976; VI-LABEL: global_atomic_umin_i32_ret_offset_scalar: 6977; VI: ; %bb.0: 6978; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6979; VI-NEXT: s_add_u32 s34, s4, 16 6980; VI-NEXT: s_addc_u32 s35, s5, 0 6981; VI-NEXT: v_mov_b32_e32 v1, s34 6982; VI-NEXT: v_mov_b32_e32 v2, s35 6983; VI-NEXT: flat_load_dword v0, v[1:2] 6984; VI-NEXT: s_mov_b64 s[34:35], 0 6985; VI-NEXT: .LBB117_1: ; %atomicrmw.start 6986; 
VI-NEXT: ; =>This Inner Loop Header: Depth=1 6987; VI-NEXT: s_waitcnt vmcnt(0) 6988; VI-NEXT: v_mov_b32_e32 v4, v0 6989; VI-NEXT: v_min_u32_e32 v3, s6, v4 6990; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc 6991; VI-NEXT: s_waitcnt vmcnt(0) 6992; VI-NEXT: buffer_wbinvl1_vol 6993; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 6994; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 6995; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] 6996; VI-NEXT: s_cbranch_execnz .LBB117_1 6997; VI-NEXT: ; %bb.2: ; %atomicrmw.end 6998; VI-NEXT: s_or_b64 exec, exec, s[34:35] 6999; VI-NEXT: s_setpc_b64 s[30:31] 7000; 7001; GFX9-LABEL: global_atomic_umin_i32_ret_offset_scalar: 7002; GFX9: ; %bb.0: 7003; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7004; GFX9-NEXT: v_mov_b32_e32 v1, 0 7005; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16 7006; GFX9-NEXT: s_mov_b64 s[34:35], 0 7007; GFX9-NEXT: .LBB117_1: ; %atomicrmw.start 7008; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 7009; GFX9-NEXT: s_waitcnt vmcnt(0) 7010; GFX9-NEXT: v_mov_b32_e32 v3, v0 7011; GFX9-NEXT: v_min_u32_e32 v2, s6, v3 7012; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] offset:16 glc 7013; GFX9-NEXT: s_waitcnt vmcnt(0) 7014; GFX9-NEXT: buffer_wbinvl1_vol 7015; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 7016; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 7017; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] 7018; GFX9-NEXT: s_cbranch_execnz .LBB117_1 7019; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 7020; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] 7021; GFX9-NEXT: s_setpc_b64 s[30:31] 7022 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 7023 %result = atomicrmw umin ptr addrspace(1) %gep, i32 %in seq_cst 7024 ret i32 %result 7025} 7026 7027define void @global_atomic_umin_i32_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) { 7028; SI-LABEL: global_atomic_umin_i32_noret_offset__amdgpu_no_remote_memory: 7029; SI: ; %bb.0: 7030; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7031; SI-NEXT: s_mov_b32 
s6, 0 7032; SI-NEXT: s_mov_b32 s7, 0xf000 7033; SI-NEXT: s_mov_b32 s4, s6 7034; SI-NEXT: s_mov_b32 s5, s6 7035; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 7036; SI-NEXT: s_mov_b64 s[8:9], 0 7037; SI-NEXT: .LBB118_1: ; %atomicrmw.start 7038; SI-NEXT: ; =>This Inner Loop Header: Depth=1 7039; SI-NEXT: s_waitcnt vmcnt(0) 7040; SI-NEXT: v_min_u32_e32 v3, v4, v2 7041; SI-NEXT: s_waitcnt expcnt(0) 7042; SI-NEXT: v_mov_b32_e32 v6, v4 7043; SI-NEXT: v_mov_b32_e32 v5, v3 7044; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc 7045; SI-NEXT: s_waitcnt vmcnt(0) 7046; SI-NEXT: buffer_wbinvl1 7047; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 7048; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 7049; SI-NEXT: v_mov_b32_e32 v4, v5 7050; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] 7051; SI-NEXT: s_cbranch_execnz .LBB118_1 7052; SI-NEXT: ; %bb.2: ; %atomicrmw.end 7053; SI-NEXT: s_or_b64 exec, exec, s[8:9] 7054; SI-NEXT: s_waitcnt expcnt(0) 7055; SI-NEXT: s_setpc_b64 s[30:31] 7056; 7057; VI-LABEL: global_atomic_umin_i32_noret_offset__amdgpu_no_remote_memory: 7058; VI: ; %bb.0: 7059; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7060; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 7061; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 7062; VI-NEXT: flat_load_dword v4, v[0:1] 7063; VI-NEXT: s_mov_b64 s[4:5], 0 7064; VI-NEXT: .LBB118_1: ; %atomicrmw.start 7065; VI-NEXT: ; =>This Inner Loop Header: Depth=1 7066; VI-NEXT: s_waitcnt vmcnt(0) 7067; VI-NEXT: v_min_u32_e32 v3, v4, v2 7068; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 7069; VI-NEXT: s_waitcnt vmcnt(0) 7070; VI-NEXT: buffer_wbinvl1_vol 7071; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 7072; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7073; VI-NEXT: v_mov_b32_e32 v4, v3 7074; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] 7075; VI-NEXT: s_cbranch_execnz .LBB118_1 7076; VI-NEXT: ; %bb.2: ; %atomicrmw.end 7077; VI-NEXT: s_or_b64 exec, exec, s[4:5] 7078; VI-NEXT: s_setpc_b64 s[30:31] 7079; 7080; GFX9-LABEL: 
global_atomic_umin_i32_noret_offset__amdgpu_no_remote_memory: 7081; GFX9: ; %bb.0: 7082; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7083; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 7084; GFX9-NEXT: s_mov_b64 s[4:5], 0 7085; GFX9-NEXT: .LBB118_1: ; %atomicrmw.start 7086; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 7087; GFX9-NEXT: s_waitcnt vmcnt(0) 7088; GFX9-NEXT: v_min_u32_e32 v3, v4, v2 7089; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc 7090; GFX9-NEXT: s_waitcnt vmcnt(0) 7091; GFX9-NEXT: buffer_wbinvl1_vol 7092; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 7093; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7094; GFX9-NEXT: v_mov_b32_e32 v4, v3 7095; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 7096; GFX9-NEXT: s_cbranch_execnz .LBB118_1 7097; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 7098; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 7099; GFX9-NEXT: s_setpc_b64 s[30:31] 7100 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 7101 %tmp0 = atomicrmw umin ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 7102 ret void 7103} 7104 7105define i32 @global_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) { 7106; SI-LABEL: global_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory: 7107; SI: ; %bb.0: 7108; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7109; SI-NEXT: s_mov_b32 s6, 0 7110; SI-NEXT: s_mov_b32 s7, 0xf000 7111; SI-NEXT: s_mov_b32 s4, s6 7112; SI-NEXT: s_mov_b32 s5, s6 7113; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 7114; SI-NEXT: s_mov_b64 s[8:9], 0 7115; SI-NEXT: .LBB119_1: ; %atomicrmw.start 7116; SI-NEXT: ; =>This Inner Loop Header: Depth=1 7117; SI-NEXT: s_waitcnt vmcnt(0) 7118; SI-NEXT: v_mov_b32_e32 v5, v3 7119; SI-NEXT: s_waitcnt expcnt(0) 7120; SI-NEXT: v_min_u32_e32 v4, v5, v2 7121; SI-NEXT: v_mov_b32_e32 v3, v4 7122; SI-NEXT: v_mov_b32_e32 v4, v5 7123; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc 7124; 
SI-NEXT: s_waitcnt vmcnt(0) 7125; SI-NEXT: buffer_wbinvl1 7126; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 7127; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 7128; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] 7129; SI-NEXT: s_cbranch_execnz .LBB119_1 7130; SI-NEXT: ; %bb.2: ; %atomicrmw.end 7131; SI-NEXT: s_or_b64 exec, exec, s[8:9] 7132; SI-NEXT: v_mov_b32_e32 v0, v3 7133; SI-NEXT: s_waitcnt expcnt(0) 7134; SI-NEXT: s_setpc_b64 s[30:31] 7135; 7136; VI-LABEL: global_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory: 7137; VI: ; %bb.0: 7138; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7139; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0 7140; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 7141; VI-NEXT: flat_load_dword v0, v[3:4] 7142; VI-NEXT: s_mov_b64 s[4:5], 0 7143; VI-NEXT: .LBB119_1: ; %atomicrmw.start 7144; VI-NEXT: ; =>This Inner Loop Header: Depth=1 7145; VI-NEXT: s_waitcnt vmcnt(0) 7146; VI-NEXT: v_mov_b32_e32 v1, v0 7147; VI-NEXT: v_min_u32_e32 v0, v1, v2 7148; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 7149; VI-NEXT: s_waitcnt vmcnt(0) 7150; VI-NEXT: buffer_wbinvl1_vol 7151; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 7152; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7153; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] 7154; VI-NEXT: s_cbranch_execnz .LBB119_1 7155; VI-NEXT: ; %bb.2: ; %atomicrmw.end 7156; VI-NEXT: s_or_b64 exec, exec, s[4:5] 7157; VI-NEXT: s_setpc_b64 s[30:31] 7158; 7159; GFX9-LABEL: global_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory: 7160; GFX9: ; %bb.0: 7161; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7162; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 7163; GFX9-NEXT: s_mov_b64 s[4:5], 0 7164; GFX9-NEXT: .LBB119_1: ; %atomicrmw.start 7165; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 7166; GFX9-NEXT: s_waitcnt vmcnt(0) 7167; GFX9-NEXT: v_mov_b32_e32 v4, v3 7168; GFX9-NEXT: v_min_u32_e32 v3, v4, v2 7169; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc 7170; GFX9-NEXT: s_waitcnt vmcnt(0) 7171; GFX9-NEXT: 
buffer_wbinvl1_vol 7172; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 7173; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7174; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 7175; GFX9-NEXT: s_cbranch_execnz .LBB119_1 7176; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 7177; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 7178; GFX9-NEXT: v_mov_b32_e32 v0, v3 7179; GFX9-NEXT: s_setpc_b64 s[30:31] 7180 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 7181 %result = atomicrmw umin ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 7182 ret i32 %result 7183} 7184 7185; --------------------------------------------------------------------- 7186; atomicrmw min 7187; --------------------------------------------------------------------- 7188 7189define void @global_atomic_min_i32_noret(ptr addrspace(1) %ptr, i32 %in) { 7190; SI-LABEL: global_atomic_min_i32_noret: 7191; SI: ; %bb.0: 7192; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7193; SI-NEXT: s_mov_b32 s6, 0 7194; SI-NEXT: s_mov_b32 s7, 0xf000 7195; SI-NEXT: s_mov_b32 s4, s6 7196; SI-NEXT: s_mov_b32 s5, s6 7197; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 7198; SI-NEXT: s_mov_b64 s[8:9], 0 7199; SI-NEXT: .LBB120_1: ; %atomicrmw.start 7200; SI-NEXT: ; =>This Inner Loop Header: Depth=1 7201; SI-NEXT: s_waitcnt vmcnt(0) 7202; SI-NEXT: v_min_i32_e32 v3, v4, v2 7203; SI-NEXT: s_waitcnt expcnt(0) 7204; SI-NEXT: v_mov_b32_e32 v6, v4 7205; SI-NEXT: v_mov_b32_e32 v5, v3 7206; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc 7207; SI-NEXT: s_waitcnt vmcnt(0) 7208; SI-NEXT: buffer_wbinvl1 7209; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 7210; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 7211; SI-NEXT: v_mov_b32_e32 v4, v5 7212; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] 7213; SI-NEXT: s_cbranch_execnz .LBB120_1 7214; SI-NEXT: ; %bb.2: ; %atomicrmw.end 7215; SI-NEXT: s_or_b64 exec, exec, s[8:9] 7216; SI-NEXT: s_waitcnt expcnt(0) 7217; SI-NEXT: s_setpc_b64 s[30:31] 7218; 7219; VI-LABEL: global_atomic_min_i32_noret: 7220; 
VI: ; %bb.0: 7221; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7222; VI-NEXT: flat_load_dword v4, v[0:1] 7223; VI-NEXT: s_mov_b64 s[4:5], 0 7224; VI-NEXT: .LBB120_1: ; %atomicrmw.start 7225; VI-NEXT: ; =>This Inner Loop Header: Depth=1 7226; VI-NEXT: s_waitcnt vmcnt(0) 7227; VI-NEXT: v_min_i32_e32 v3, v4, v2 7228; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 7229; VI-NEXT: s_waitcnt vmcnt(0) 7230; VI-NEXT: buffer_wbinvl1_vol 7231; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 7232; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7233; VI-NEXT: v_mov_b32_e32 v4, v3 7234; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] 7235; VI-NEXT: s_cbranch_execnz .LBB120_1 7236; VI-NEXT: ; %bb.2: ; %atomicrmw.end 7237; VI-NEXT: s_or_b64 exec, exec, s[4:5] 7238; VI-NEXT: s_setpc_b64 s[30:31] 7239; 7240; GFX9-LABEL: global_atomic_min_i32_noret: 7241; GFX9: ; %bb.0: 7242; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7243; GFX9-NEXT: global_load_dword v4, v[0:1], off 7244; GFX9-NEXT: s_mov_b64 s[4:5], 0 7245; GFX9-NEXT: .LBB120_1: ; %atomicrmw.start 7246; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 7247; GFX9-NEXT: s_waitcnt vmcnt(0) 7248; GFX9-NEXT: v_min_i32_e32 v3, v4, v2 7249; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc 7250; GFX9-NEXT: s_waitcnt vmcnt(0) 7251; GFX9-NEXT: buffer_wbinvl1_vol 7252; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 7253; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7254; GFX9-NEXT: v_mov_b32_e32 v4, v3 7255; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 7256; GFX9-NEXT: s_cbranch_execnz .LBB120_1 7257; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 7258; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 7259; GFX9-NEXT: s_setpc_b64 s[30:31] 7260 %tmp0 = atomicrmw min ptr addrspace(1) %ptr, i32 %in seq_cst 7261 ret void 7262} 7263 7264define void @global_atomic_min_i32_noret_offset(ptr addrspace(1) %out, i32 %in) { 7265; SI-LABEL: global_atomic_min_i32_noret_offset: 7266; SI: ; %bb.0: 7267; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7268; SI-NEXT: s_mov_b32 s6, 0 7269; 
SI-NEXT: s_mov_b32 s7, 0xf000 7270; SI-NEXT: s_mov_b32 s4, s6 7271; SI-NEXT: s_mov_b32 s5, s6 7272; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 7273; SI-NEXT: s_mov_b64 s[8:9], 0 7274; SI-NEXT: .LBB121_1: ; %atomicrmw.start 7275; SI-NEXT: ; =>This Inner Loop Header: Depth=1 7276; SI-NEXT: s_waitcnt vmcnt(0) 7277; SI-NEXT: v_min_i32_e32 v3, v4, v2 7278; SI-NEXT: s_waitcnt expcnt(0) 7279; SI-NEXT: v_mov_b32_e32 v6, v4 7280; SI-NEXT: v_mov_b32_e32 v5, v3 7281; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc 7282; SI-NEXT: s_waitcnt vmcnt(0) 7283; SI-NEXT: buffer_wbinvl1 7284; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 7285; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 7286; SI-NEXT: v_mov_b32_e32 v4, v5 7287; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] 7288; SI-NEXT: s_cbranch_execnz .LBB121_1 7289; SI-NEXT: ; %bb.2: ; %atomicrmw.end 7290; SI-NEXT: s_or_b64 exec, exec, s[8:9] 7291; SI-NEXT: s_waitcnt expcnt(0) 7292; SI-NEXT: s_setpc_b64 s[30:31] 7293; 7294; VI-LABEL: global_atomic_min_i32_noret_offset: 7295; VI: ; %bb.0: 7296; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7297; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 7298; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 7299; VI-NEXT: flat_load_dword v4, v[0:1] 7300; VI-NEXT: s_mov_b64 s[4:5], 0 7301; VI-NEXT: .LBB121_1: ; %atomicrmw.start 7302; VI-NEXT: ; =>This Inner Loop Header: Depth=1 7303; VI-NEXT: s_waitcnt vmcnt(0) 7304; VI-NEXT: v_min_i32_e32 v3, v4, v2 7305; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 7306; VI-NEXT: s_waitcnt vmcnt(0) 7307; VI-NEXT: buffer_wbinvl1_vol 7308; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 7309; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7310; VI-NEXT: v_mov_b32_e32 v4, v3 7311; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] 7312; VI-NEXT: s_cbranch_execnz .LBB121_1 7313; VI-NEXT: ; %bb.2: ; %atomicrmw.end 7314; VI-NEXT: s_or_b64 exec, exec, s[4:5] 7315; VI-NEXT: s_setpc_b64 s[30:31] 7316; 7317; GFX9-LABEL: global_atomic_min_i32_noret_offset: 7318; GFX9: ; 
%bb.0: 7319; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7320; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 7321; GFX9-NEXT: s_mov_b64 s[4:5], 0 7322; GFX9-NEXT: .LBB121_1: ; %atomicrmw.start 7323; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 7324; GFX9-NEXT: s_waitcnt vmcnt(0) 7325; GFX9-NEXT: v_min_i32_e32 v3, v4, v2 7326; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc 7327; GFX9-NEXT: s_waitcnt vmcnt(0) 7328; GFX9-NEXT: buffer_wbinvl1_vol 7329; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 7330; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7331; GFX9-NEXT: v_mov_b32_e32 v4, v3 7332; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 7333; GFX9-NEXT: s_cbranch_execnz .LBB121_1 7334; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 7335; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 7336; GFX9-NEXT: s_setpc_b64 s[30:31] 7337 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 7338 %tmp0 = atomicrmw min ptr addrspace(1) %gep, i32 %in seq_cst 7339 ret void 7340} 7341 7342define i32 @global_atomic_min_i32_ret(ptr addrspace(1) %ptr, i32 %in) { 7343; SI-LABEL: global_atomic_min_i32_ret: 7344; SI: ; %bb.0: 7345; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7346; SI-NEXT: s_mov_b32 s6, 0 7347; SI-NEXT: s_mov_b32 s7, 0xf000 7348; SI-NEXT: s_mov_b32 s4, s6 7349; SI-NEXT: s_mov_b32 s5, s6 7350; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 7351; SI-NEXT: s_mov_b64 s[8:9], 0 7352; SI-NEXT: .LBB122_1: ; %atomicrmw.start 7353; SI-NEXT: ; =>This Inner Loop Header: Depth=1 7354; SI-NEXT: s_waitcnt vmcnt(0) 7355; SI-NEXT: v_mov_b32_e32 v5, v3 7356; SI-NEXT: s_waitcnt expcnt(0) 7357; SI-NEXT: v_min_i32_e32 v4, v5, v2 7358; SI-NEXT: v_mov_b32_e32 v3, v4 7359; SI-NEXT: v_mov_b32_e32 v4, v5 7360; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc 7361; SI-NEXT: s_waitcnt vmcnt(0) 7362; SI-NEXT: buffer_wbinvl1 7363; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 7364; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 7365; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] 7366; 
SI-NEXT: s_cbranch_execnz .LBB122_1 7367; SI-NEXT: ; %bb.2: ; %atomicrmw.end 7368; SI-NEXT: s_or_b64 exec, exec, s[8:9] 7369; SI-NEXT: v_mov_b32_e32 v0, v3 7370; SI-NEXT: s_waitcnt expcnt(0) 7371; SI-NEXT: s_setpc_b64 s[30:31] 7372; 7373; VI-LABEL: global_atomic_min_i32_ret: 7374; VI: ; %bb.0: 7375; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7376; VI-NEXT: flat_load_dword v3, v[0:1] 7377; VI-NEXT: s_mov_b64 s[4:5], 0 7378; VI-NEXT: .LBB122_1: ; %atomicrmw.start 7379; VI-NEXT: ; =>This Inner Loop Header: Depth=1 7380; VI-NEXT: s_waitcnt vmcnt(0) 7381; VI-NEXT: v_mov_b32_e32 v4, v3 7382; VI-NEXT: v_min_i32_e32 v3, v4, v2 7383; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 7384; VI-NEXT: s_waitcnt vmcnt(0) 7385; VI-NEXT: buffer_wbinvl1_vol 7386; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 7387; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7388; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] 7389; VI-NEXT: s_cbranch_execnz .LBB122_1 7390; VI-NEXT: ; %bb.2: ; %atomicrmw.end 7391; VI-NEXT: s_or_b64 exec, exec, s[4:5] 7392; VI-NEXT: v_mov_b32_e32 v0, v3 7393; VI-NEXT: s_setpc_b64 s[30:31] 7394; 7395; GFX9-LABEL: global_atomic_min_i32_ret: 7396; GFX9: ; %bb.0: 7397; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7398; GFX9-NEXT: global_load_dword v3, v[0:1], off 7399; GFX9-NEXT: s_mov_b64 s[4:5], 0 7400; GFX9-NEXT: .LBB122_1: ; %atomicrmw.start 7401; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 7402; GFX9-NEXT: s_waitcnt vmcnt(0) 7403; GFX9-NEXT: v_mov_b32_e32 v4, v3 7404; GFX9-NEXT: v_min_i32_e32 v3, v4, v2 7405; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc 7406; GFX9-NEXT: s_waitcnt vmcnt(0) 7407; GFX9-NEXT: buffer_wbinvl1_vol 7408; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 7409; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7410; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 7411; GFX9-NEXT: s_cbranch_execnz .LBB122_1 7412; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 7413; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 7414; GFX9-NEXT: v_mov_b32_e32 v0, v3 7415; GFX9-NEXT: 
s_setpc_b64 s[30:31] 7416 %result = atomicrmw min ptr addrspace(1) %ptr, i32 %in seq_cst 7417 ret i32 %result 7418} 7419 7420define i32 @global_atomic_min_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { 7421; SI-LABEL: global_atomic_min_i32_ret_offset: 7422; SI: ; %bb.0: 7423; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7424; SI-NEXT: s_mov_b32 s6, 0 7425; SI-NEXT: s_mov_b32 s7, 0xf000 7426; SI-NEXT: s_mov_b32 s4, s6 7427; SI-NEXT: s_mov_b32 s5, s6 7428; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 7429; SI-NEXT: s_mov_b64 s[8:9], 0 7430; SI-NEXT: .LBB123_1: ; %atomicrmw.start 7431; SI-NEXT: ; =>This Inner Loop Header: Depth=1 7432; SI-NEXT: s_waitcnt vmcnt(0) 7433; SI-NEXT: v_mov_b32_e32 v5, v3 7434; SI-NEXT: s_waitcnt expcnt(0) 7435; SI-NEXT: v_min_i32_e32 v4, v5, v2 7436; SI-NEXT: v_mov_b32_e32 v3, v4 7437; SI-NEXT: v_mov_b32_e32 v4, v5 7438; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc 7439; SI-NEXT: s_waitcnt vmcnt(0) 7440; SI-NEXT: buffer_wbinvl1 7441; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 7442; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 7443; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] 7444; SI-NEXT: s_cbranch_execnz .LBB123_1 7445; SI-NEXT: ; %bb.2: ; %atomicrmw.end 7446; SI-NEXT: s_or_b64 exec, exec, s[8:9] 7447; SI-NEXT: v_mov_b32_e32 v0, v3 7448; SI-NEXT: s_waitcnt expcnt(0) 7449; SI-NEXT: s_setpc_b64 s[30:31] 7450; 7451; VI-LABEL: global_atomic_min_i32_ret_offset: 7452; VI: ; %bb.0: 7453; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7454; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0 7455; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 7456; VI-NEXT: flat_load_dword v0, v[3:4] 7457; VI-NEXT: s_mov_b64 s[4:5], 0 7458; VI-NEXT: .LBB123_1: ; %atomicrmw.start 7459; VI-NEXT: ; =>This Inner Loop Header: Depth=1 7460; VI-NEXT: s_waitcnt vmcnt(0) 7461; VI-NEXT: v_mov_b32_e32 v1, v0 7462; VI-NEXT: v_min_i32_e32 v0, v1, v2 7463; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 7464; VI-NEXT: s_waitcnt vmcnt(0) 7465; 
VI-NEXT: buffer_wbinvl1_vol 7466; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 7467; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7468; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] 7469; VI-NEXT: s_cbranch_execnz .LBB123_1 7470; VI-NEXT: ; %bb.2: ; %atomicrmw.end 7471; VI-NEXT: s_or_b64 exec, exec, s[4:5] 7472; VI-NEXT: s_setpc_b64 s[30:31] 7473; 7474; GFX9-LABEL: global_atomic_min_i32_ret_offset: 7475; GFX9: ; %bb.0: 7476; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7477; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 7478; GFX9-NEXT: s_mov_b64 s[4:5], 0 7479; GFX9-NEXT: .LBB123_1: ; %atomicrmw.start 7480; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 7481; GFX9-NEXT: s_waitcnt vmcnt(0) 7482; GFX9-NEXT: v_mov_b32_e32 v4, v3 7483; GFX9-NEXT: v_min_i32_e32 v3, v4, v2 7484; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc 7485; GFX9-NEXT: s_waitcnt vmcnt(0) 7486; GFX9-NEXT: buffer_wbinvl1_vol 7487; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 7488; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7489; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 7490; GFX9-NEXT: s_cbranch_execnz .LBB123_1 7491; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 7492; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 7493; GFX9-NEXT: v_mov_b32_e32 v0, v3 7494; GFX9-NEXT: s_setpc_b64 s[30:31] 7495 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 7496 %result = atomicrmw min ptr addrspace(1) %gep, i32 %in seq_cst 7497 ret i32 %result 7498} 7499 7500define amdgpu_gfx void @global_atomic_min_i32_noret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) { 7501; SI-LABEL: global_atomic_min_i32_noret_scalar: 7502; SI: ; %bb.0: 7503; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7504; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 7505; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill 7506; SI-NEXT: s_mov_b64 exec, s[34:35] 7507; SI-NEXT: s_waitcnt expcnt(0) 7508; SI-NEXT: v_writelane_b32 v4, s6, 0 7509; SI-NEXT: v_writelane_b32 v4, s7, 1 7510; SI-NEXT: s_mov_b32 s34, s6 7511; SI-NEXT: 
s_mov_b32 s7, 0xf000 7512; SI-NEXT: s_mov_b32 s6, -1 7513; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 7514; SI-NEXT: s_mov_b64 s[36:37], 0 7515; SI-NEXT: .LBB124_1: ; %atomicrmw.start 7516; SI-NEXT: ; =>This Inner Loop Header: Depth=1 7517; SI-NEXT: s_waitcnt vmcnt(0) 7518; SI-NEXT: v_min_i32_e32 v0, s34, v1 7519; SI-NEXT: s_waitcnt expcnt(0) 7520; SI-NEXT: v_mov_b32_e32 v3, v1 7521; SI-NEXT: v_mov_b32_e32 v2, v0 7522; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc 7523; SI-NEXT: s_waitcnt vmcnt(0) 7524; SI-NEXT: buffer_wbinvl1 7525; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 7526; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] 7527; SI-NEXT: v_mov_b32_e32 v1, v2 7528; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] 7529; SI-NEXT: s_cbranch_execnz .LBB124_1 7530; SI-NEXT: ; %bb.2: ; %atomicrmw.end 7531; SI-NEXT: s_or_b64 exec, exec, s[36:37] 7532; SI-NEXT: v_readlane_b32 s7, v4, 1 7533; SI-NEXT: v_readlane_b32 s6, v4, 0 7534; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 7535; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload 7536; SI-NEXT: s_mov_b64 exec, s[34:35] 7537; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 7538; SI-NEXT: s_setpc_b64 s[30:31] 7539; 7540; VI-LABEL: global_atomic_min_i32_noret_scalar: 7541; VI: ; %bb.0: 7542; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7543; VI-NEXT: v_mov_b32_e32 v0, s4 7544; VI-NEXT: v_mov_b32_e32 v1, s5 7545; VI-NEXT: flat_load_dword v3, v[0:1] 7546; VI-NEXT: s_mov_b64 s[34:35], 0 7547; VI-NEXT: .LBB124_1: ; %atomicrmw.start 7548; VI-NEXT: ; =>This Inner Loop Header: Depth=1 7549; VI-NEXT: s_waitcnt vmcnt(0) 7550; VI-NEXT: v_min_i32_e32 v2, s6, v3 7551; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7552; VI-NEXT: s_waitcnt vmcnt(0) 7553; VI-NEXT: buffer_wbinvl1_vol 7554; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 7555; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 7556; VI-NEXT: v_mov_b32_e32 v3, v2 7557; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] 7558; VI-NEXT: s_cbranch_execnz .LBB124_1 7559; VI-NEXT: ; %bb.2: ; 
%atomicrmw.end 7560; VI-NEXT: s_or_b64 exec, exec, s[34:35] 7561; VI-NEXT: s_setpc_b64 s[30:31] 7562; 7563; GFX9-LABEL: global_atomic_min_i32_noret_scalar: 7564; GFX9: ; %bb.0: 7565; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7566; GFX9-NEXT: v_mov_b32_e32 v2, 0 7567; GFX9-NEXT: global_load_dword v1, v2, s[4:5] 7568; GFX9-NEXT: s_mov_b64 s[34:35], 0 7569; GFX9-NEXT: .LBB124_1: ; %atomicrmw.start 7570; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 7571; GFX9-NEXT: s_waitcnt vmcnt(0) 7572; GFX9-NEXT: v_min_i32_e32 v0, s6, v1 7573; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] glc 7574; GFX9-NEXT: s_waitcnt vmcnt(0) 7575; GFX9-NEXT: buffer_wbinvl1_vol 7576; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 7577; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 7578; GFX9-NEXT: v_mov_b32_e32 v1, v0 7579; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] 7580; GFX9-NEXT: s_cbranch_execnz .LBB124_1 7581; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 7582; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] 7583; GFX9-NEXT: s_setpc_b64 s[30:31] 7584 %tmp0 = atomicrmw min ptr addrspace(1) %ptr, i32 %in seq_cst 7585 ret void 7586} 7587 7588define amdgpu_gfx void @global_atomic_min_i32_noret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) { 7589; SI-LABEL: global_atomic_min_i32_noret_offset_scalar: 7590; SI: ; %bb.0: 7591; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7592; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 7593; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill 7594; SI-NEXT: s_mov_b64 exec, s[34:35] 7595; SI-NEXT: s_waitcnt expcnt(0) 7596; SI-NEXT: v_writelane_b32 v4, s6, 0 7597; SI-NEXT: v_writelane_b32 v4, s7, 1 7598; SI-NEXT: s_mov_b32 s34, s6 7599; SI-NEXT: s_mov_b32 s7, 0xf000 7600; SI-NEXT: s_mov_b32 s6, -1 7601; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16 7602; SI-NEXT: s_mov_b64 s[36:37], 0 7603; SI-NEXT: .LBB125_1: ; %atomicrmw.start 7604; SI-NEXT: ; =>This Inner Loop Header: Depth=1 7605; SI-NEXT: s_waitcnt vmcnt(0) 7606; SI-NEXT: 
v_min_i32_e32 v0, s34, v1 7607; SI-NEXT: s_waitcnt expcnt(0) 7608; SI-NEXT: v_mov_b32_e32 v3, v1 7609; SI-NEXT: v_mov_b32_e32 v2, v0 7610; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc 7611; SI-NEXT: s_waitcnt vmcnt(0) 7612; SI-NEXT: buffer_wbinvl1 7613; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 7614; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] 7615; SI-NEXT: v_mov_b32_e32 v1, v2 7616; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] 7617; SI-NEXT: s_cbranch_execnz .LBB125_1 7618; SI-NEXT: ; %bb.2: ; %atomicrmw.end 7619; SI-NEXT: s_or_b64 exec, exec, s[36:37] 7620; SI-NEXT: v_readlane_b32 s7, v4, 1 7621; SI-NEXT: v_readlane_b32 s6, v4, 0 7622; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 7623; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload 7624; SI-NEXT: s_mov_b64 exec, s[34:35] 7625; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 7626; SI-NEXT: s_setpc_b64 s[30:31] 7627; 7628; VI-LABEL: global_atomic_min_i32_noret_offset_scalar: 7629; VI: ; %bb.0: 7630; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7631; VI-NEXT: s_add_u32 s34, s4, 16 7632; VI-NEXT: s_addc_u32 s35, s5, 0 7633; VI-NEXT: v_mov_b32_e32 v0, s34 7634; VI-NEXT: v_mov_b32_e32 v1, s35 7635; VI-NEXT: flat_load_dword v3, v[0:1] 7636; VI-NEXT: s_mov_b64 s[34:35], 0 7637; VI-NEXT: .LBB125_1: ; %atomicrmw.start 7638; VI-NEXT: ; =>This Inner Loop Header: Depth=1 7639; VI-NEXT: s_waitcnt vmcnt(0) 7640; VI-NEXT: v_min_i32_e32 v2, s6, v3 7641; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7642; VI-NEXT: s_waitcnt vmcnt(0) 7643; VI-NEXT: buffer_wbinvl1_vol 7644; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 7645; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 7646; VI-NEXT: v_mov_b32_e32 v3, v2 7647; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] 7648; VI-NEXT: s_cbranch_execnz .LBB125_1 7649; VI-NEXT: ; %bb.2: ; %atomicrmw.end 7650; VI-NEXT: s_or_b64 exec, exec, s[34:35] 7651; VI-NEXT: s_setpc_b64 s[30:31] 7652; 7653; GFX9-LABEL: global_atomic_min_i32_noret_offset_scalar: 7654; GFX9: ; %bb.0: 7655; 
GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7656; GFX9-NEXT: v_mov_b32_e32 v2, 0 7657; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16 7658; GFX9-NEXT: s_mov_b64 s[34:35], 0 7659; GFX9-NEXT: .LBB125_1: ; %atomicrmw.start 7660; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 7661; GFX9-NEXT: s_waitcnt vmcnt(0) 7662; GFX9-NEXT: v_min_i32_e32 v0, s6, v1 7663; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 glc 7664; GFX9-NEXT: s_waitcnt vmcnt(0) 7665; GFX9-NEXT: buffer_wbinvl1_vol 7666; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 7667; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 7668; GFX9-NEXT: v_mov_b32_e32 v1, v0 7669; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] 7670; GFX9-NEXT: s_cbranch_execnz .LBB125_1 7671; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 7672; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] 7673; GFX9-NEXT: s_setpc_b64 s[30:31] 7674 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 7675 %tmp0 = atomicrmw min ptr addrspace(1) %gep, i32 %in seq_cst 7676 ret void 7677} 7678 7679define amdgpu_gfx i32 @global_atomic_min_i32_ret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) { 7680; SI-LABEL: global_atomic_min_i32_ret_scalar: 7681; SI: ; %bb.0: 7682; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7683; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 7684; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill 7685; SI-NEXT: s_mov_b64 exec, s[34:35] 7686; SI-NEXT: s_waitcnt expcnt(0) 7687; SI-NEXT: v_writelane_b32 v3, s6, 0 7688; SI-NEXT: v_writelane_b32 v3, s7, 1 7689; SI-NEXT: s_mov_b32 s34, s6 7690; SI-NEXT: s_mov_b32 s7, 0xf000 7691; SI-NEXT: s_mov_b32 s6, -1 7692; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 7693; SI-NEXT: s_mov_b64 s[36:37], 0 7694; SI-NEXT: .LBB126_1: ; %atomicrmw.start 7695; SI-NEXT: ; =>This Inner Loop Header: Depth=1 7696; SI-NEXT: s_waitcnt vmcnt(0) 7697; SI-NEXT: v_mov_b32_e32 v2, v0 7698; SI-NEXT: s_waitcnt expcnt(0) 7699; SI-NEXT: v_min_i32_e32 v1, s34, v2 7700; SI-NEXT: v_mov_b32_e32 v0, v1 
7701; SI-NEXT: v_mov_b32_e32 v1, v2 7702; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc 7703; SI-NEXT: s_waitcnt vmcnt(0) 7704; SI-NEXT: buffer_wbinvl1 7705; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 7706; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] 7707; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] 7708; SI-NEXT: s_cbranch_execnz .LBB126_1 7709; SI-NEXT: ; %bb.2: ; %atomicrmw.end 7710; SI-NEXT: s_or_b64 exec, exec, s[36:37] 7711; SI-NEXT: v_readlane_b32 s7, v3, 1 7712; SI-NEXT: v_readlane_b32 s6, v3, 0 7713; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 7714; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload 7715; SI-NEXT: s_mov_b64 exec, s[34:35] 7716; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 7717; SI-NEXT: s_setpc_b64 s[30:31] 7718; 7719; VI-LABEL: global_atomic_min_i32_ret_scalar: 7720; VI: ; %bb.0: 7721; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7722; VI-NEXT: v_mov_b32_e32 v0, s4 7723; VI-NEXT: v_mov_b32_e32 v1, s5 7724; VI-NEXT: flat_load_dword v0, v[0:1] 7725; VI-NEXT: v_mov_b32_e32 v1, s4 7726; VI-NEXT: s_mov_b64 s[34:35], 0 7727; VI-NEXT: v_mov_b32_e32 v2, s5 7728; VI-NEXT: .LBB126_1: ; %atomicrmw.start 7729; VI-NEXT: ; =>This Inner Loop Header: Depth=1 7730; VI-NEXT: s_waitcnt vmcnt(0) 7731; VI-NEXT: v_mov_b32_e32 v4, v0 7732; VI-NEXT: v_min_i32_e32 v3, s6, v4 7733; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc 7734; VI-NEXT: s_waitcnt vmcnt(0) 7735; VI-NEXT: buffer_wbinvl1_vol 7736; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 7737; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 7738; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] 7739; VI-NEXT: s_cbranch_execnz .LBB126_1 7740; VI-NEXT: ; %bb.2: ; %atomicrmw.end 7741; VI-NEXT: s_or_b64 exec, exec, s[34:35] 7742; VI-NEXT: s_setpc_b64 s[30:31] 7743; 7744; GFX9-LABEL: global_atomic_min_i32_ret_scalar: 7745; GFX9: ; %bb.0: 7746; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7747; GFX9-NEXT: v_mov_b32_e32 v1, 0 7748; GFX9-NEXT: global_load_dword v0, v1, s[4:5] 7749; GFX9-NEXT: s_mov_b64 
s[34:35], 0 7750; GFX9-NEXT: .LBB126_1: ; %atomicrmw.start 7751; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 7752; GFX9-NEXT: s_waitcnt vmcnt(0) 7753; GFX9-NEXT: v_mov_b32_e32 v3, v0 7754; GFX9-NEXT: v_min_i32_e32 v2, s6, v3 7755; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc 7756; GFX9-NEXT: s_waitcnt vmcnt(0) 7757; GFX9-NEXT: buffer_wbinvl1_vol 7758; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 7759; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 7760; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] 7761; GFX9-NEXT: s_cbranch_execnz .LBB126_1 7762; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 7763; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] 7764; GFX9-NEXT: s_setpc_b64 s[30:31] 7765 %result = atomicrmw min ptr addrspace(1) %ptr, i32 %in seq_cst 7766 ret i32 %result 7767} 7768 7769define amdgpu_gfx i32 @global_atomic_min_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) { 7770; SI-LABEL: global_atomic_min_i32_ret_offset_scalar: 7771; SI: ; %bb.0: 7772; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7773; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 7774; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill 7775; SI-NEXT: s_mov_b64 exec, s[34:35] 7776; SI-NEXT: s_waitcnt expcnt(0) 7777; SI-NEXT: v_writelane_b32 v3, s6, 0 7778; SI-NEXT: v_writelane_b32 v3, s7, 1 7779; SI-NEXT: s_mov_b32 s34, s6 7780; SI-NEXT: s_mov_b32 s7, 0xf000 7781; SI-NEXT: s_mov_b32 s6, -1 7782; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16 7783; SI-NEXT: s_mov_b64 s[36:37], 0 7784; SI-NEXT: .LBB127_1: ; %atomicrmw.start 7785; SI-NEXT: ; =>This Inner Loop Header: Depth=1 7786; SI-NEXT: s_waitcnt vmcnt(0) 7787; SI-NEXT: v_mov_b32_e32 v2, v0 7788; SI-NEXT: s_waitcnt expcnt(0) 7789; SI-NEXT: v_min_i32_e32 v1, s34, v2 7790; SI-NEXT: v_mov_b32_e32 v0, v1 7791; SI-NEXT: v_mov_b32_e32 v1, v2 7792; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc 7793; SI-NEXT: s_waitcnt vmcnt(0) 7794; SI-NEXT: buffer_wbinvl1 7795; SI-NEXT: v_cmp_eq_u32_e32 vcc, 
v0, v2 7796; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] 7797; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] 7798; SI-NEXT: s_cbranch_execnz .LBB127_1 7799; SI-NEXT: ; %bb.2: ; %atomicrmw.end 7800; SI-NEXT: s_or_b64 exec, exec, s[36:37] 7801; SI-NEXT: v_readlane_b32 s7, v3, 1 7802; SI-NEXT: v_readlane_b32 s6, v3, 0 7803; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 7804; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload 7805; SI-NEXT: s_mov_b64 exec, s[34:35] 7806; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 7807; SI-NEXT: s_setpc_b64 s[30:31] 7808; 7809; VI-LABEL: global_atomic_min_i32_ret_offset_scalar: 7810; VI: ; %bb.0: 7811; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7812; VI-NEXT: s_add_u32 s34, s4, 16 7813; VI-NEXT: s_addc_u32 s35, s5, 0 7814; VI-NEXT: v_mov_b32_e32 v1, s34 7815; VI-NEXT: v_mov_b32_e32 v2, s35 7816; VI-NEXT: flat_load_dword v0, v[1:2] 7817; VI-NEXT: s_mov_b64 s[34:35], 0 7818; VI-NEXT: .LBB127_1: ; %atomicrmw.start 7819; VI-NEXT: ; =>This Inner Loop Header: Depth=1 7820; VI-NEXT: s_waitcnt vmcnt(0) 7821; VI-NEXT: v_mov_b32_e32 v4, v0 7822; VI-NEXT: v_min_i32_e32 v3, s6, v4 7823; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc 7824; VI-NEXT: s_waitcnt vmcnt(0) 7825; VI-NEXT: buffer_wbinvl1_vol 7826; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 7827; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 7828; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] 7829; VI-NEXT: s_cbranch_execnz .LBB127_1 7830; VI-NEXT: ; %bb.2: ; %atomicrmw.end 7831; VI-NEXT: s_or_b64 exec, exec, s[34:35] 7832; VI-NEXT: s_setpc_b64 s[30:31] 7833; 7834; GFX9-LABEL: global_atomic_min_i32_ret_offset_scalar: 7835; GFX9: ; %bb.0: 7836; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7837; GFX9-NEXT: v_mov_b32_e32 v1, 0 7838; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16 7839; GFX9-NEXT: s_mov_b64 s[34:35], 0 7840; GFX9-NEXT: .LBB127_1: ; %atomicrmw.start 7841; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 7842; GFX9-NEXT: s_waitcnt vmcnt(0) 7843; GFX9-NEXT: 
v_mov_b32_e32 v3, v0 7844; GFX9-NEXT: v_min_i32_e32 v2, s6, v3 7845; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] offset:16 glc 7846; GFX9-NEXT: s_waitcnt vmcnt(0) 7847; GFX9-NEXT: buffer_wbinvl1_vol 7848; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 7849; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 7850; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] 7851; GFX9-NEXT: s_cbranch_execnz .LBB127_1 7852; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 7853; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] 7854; GFX9-NEXT: s_setpc_b64 s[30:31] 7855 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 7856 %result = atomicrmw min ptr addrspace(1) %gep, i32 %in seq_cst 7857 ret i32 %result 7858} 7859 7860define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i32 %index) { 7861; SI-LABEL: atomic_min_i32_addr64_offset: 7862; SI: ; %bb.0: ; %entry 7863; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 7864; SI-NEXT: s_waitcnt lgkmcnt(0) 7865; SI-NEXT: s_ashr_i32 s5, s3, 31 7866; SI-NEXT: s_mov_b32 s4, s3 7867; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 7868; SI-NEXT: s_add_u32 s4, s0, s4 7869; SI-NEXT: s_addc_u32 s5, s1, s5 7870; SI-NEXT: s_load_dword s3, s[4:5], 0x4 7871; SI-NEXT: s_mov_b64 s[0:1], 0 7872; SI-NEXT: s_mov_b32 s7, 0xf000 7873; SI-NEXT: s_waitcnt lgkmcnt(0) 7874; SI-NEXT: v_mov_b32_e32 v1, s3 7875; SI-NEXT: s_mov_b32 s6, -1 7876; SI-NEXT: .LBB128_1: ; %atomicrmw.start 7877; SI-NEXT: ; =>This Inner Loop Header: Depth=1 7878; SI-NEXT: v_min_i32_e32 v0, s2, v1 7879; SI-NEXT: s_waitcnt expcnt(0) 7880; SI-NEXT: v_mov_b32_e32 v3, v1 7881; SI-NEXT: v_mov_b32_e32 v2, v0 7882; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc 7883; SI-NEXT: s_waitcnt vmcnt(0) 7884; SI-NEXT: buffer_wbinvl1 7885; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 7886; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 7887; SI-NEXT: v_mov_b32_e32 v1, v2 7888; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] 7889; SI-NEXT: s_cbranch_execnz .LBB128_1 7890; SI-NEXT: ; %bb.2: ; %atomicrmw.end 7891; 
SI-NEXT: s_endpgm 7892; 7893; VI-LABEL: atomic_min_i32_addr64_offset: 7894; VI: ; %bb.0: ; %entry 7895; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 7896; VI-NEXT: s_waitcnt lgkmcnt(0) 7897; VI-NEXT: s_ashr_i32 s5, s3, 31 7898; VI-NEXT: s_mov_b32 s4, s3 7899; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 7900; VI-NEXT: s_add_u32 s4, s0, s4 7901; VI-NEXT: s_addc_u32 s5, s1, s5 7902; VI-NEXT: s_load_dword s3, s[4:5], 0x10 7903; VI-NEXT: s_add_u32 s4, s4, 16 7904; VI-NEXT: s_addc_u32 s5, s5, 0 7905; VI-NEXT: v_mov_b32_e32 v0, s4 7906; VI-NEXT: s_mov_b64 s[0:1], 0 7907; VI-NEXT: s_waitcnt lgkmcnt(0) 7908; VI-NEXT: v_mov_b32_e32 v3, s3 7909; VI-NEXT: v_mov_b32_e32 v1, s5 7910; VI-NEXT: .LBB128_1: ; %atomicrmw.start 7911; VI-NEXT: ; =>This Inner Loop Header: Depth=1 7912; VI-NEXT: v_min_i32_e32 v2, s2, v3 7913; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7914; VI-NEXT: s_waitcnt vmcnt(0) 7915; VI-NEXT: buffer_wbinvl1_vol 7916; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 7917; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 7918; VI-NEXT: v_mov_b32_e32 v3, v2 7919; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] 7920; VI-NEXT: s_cbranch_execnz .LBB128_1 7921; VI-NEXT: ; %bb.2: ; %atomicrmw.end 7922; VI-NEXT: s_endpgm 7923; 7924; GFX9-LABEL: atomic_min_i32_addr64_offset: 7925; GFX9: ; %bb.0: ; %entry 7926; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 7927; GFX9-NEXT: v_mov_b32_e32 v2, 0 7928; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7929; GFX9-NEXT: s_ashr_i32 s5, s3, 31 7930; GFX9-NEXT: s_mov_b32 s4, s3 7931; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 7932; GFX9-NEXT: s_add_u32 s0, s0, s4 7933; GFX9-NEXT: s_addc_u32 s1, s1, s5 7934; GFX9-NEXT: s_load_dword s3, s[0:1], 0x10 7935; GFX9-NEXT: s_mov_b64 s[4:5], 0 7936; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7937; GFX9-NEXT: v_mov_b32_e32 v1, s3 7938; GFX9-NEXT: .LBB128_1: ; %atomicrmw.start 7939; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 7940; GFX9-NEXT: v_min_i32_e32 v0, s2, v1 7941; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc 7942; 
GFX9-NEXT: s_waitcnt vmcnt(0) 7943; GFX9-NEXT: buffer_wbinvl1_vol 7944; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 7945; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7946; GFX9-NEXT: v_mov_b32_e32 v1, v0 7947; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 7948; GFX9-NEXT: s_cbranch_execnz .LBB128_1 7949; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 7950; GFX9-NEXT: s_endpgm 7951entry: 7952 %ptr = getelementptr i32, ptr addrspace(1) %out, i32 %index 7953 %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4 7954 %tmp0 = atomicrmw min ptr addrspace(1) %gep, i32 %in seq_cst 7955 ret void 7956} 7957 7958define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %index) { 7959; SI-LABEL: atomic_min_i32_ret_addr64_offset: 7960; SI: ; %bb.0: ; %entry 7961; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd 7962; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 7963; SI-NEXT: s_waitcnt lgkmcnt(0) 7964; SI-NEXT: s_ashr_i32 s5, s9, 31 7965; SI-NEXT: s_mov_b32 s4, s9 7966; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 7967; SI-NEXT: s_add_u32 s4, s0, s4 7968; SI-NEXT: s_addc_u32 s5, s1, s5 7969; SI-NEXT: s_load_dword s6, s[4:5], 0x4 7970; SI-NEXT: s_mov_b64 s[0:1], 0 7971; SI-NEXT: s_mov_b32 s7, 0xf000 7972; SI-NEXT: s_waitcnt lgkmcnt(0) 7973; SI-NEXT: v_mov_b32_e32 v1, s6 7974; SI-NEXT: s_mov_b32 s6, -1 7975; SI-NEXT: .LBB129_1: ; %atomicrmw.start 7976; SI-NEXT: ; =>This Inner Loop Header: Depth=1 7977; SI-NEXT: v_min_i32_e32 v0, s8, v1 7978; SI-NEXT: s_waitcnt expcnt(0) 7979; SI-NEXT: v_mov_b32_e32 v3, v1 7980; SI-NEXT: v_mov_b32_e32 v2, v0 7981; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc 7982; SI-NEXT: s_waitcnt vmcnt(0) 7983; SI-NEXT: buffer_wbinvl1 7984; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 7985; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 7986; SI-NEXT: v_mov_b32_e32 v1, v2 7987; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] 7988; SI-NEXT: s_cbranch_execnz .LBB129_1 7989; SI-NEXT: ; %bb.2: ; %atomicrmw.end 7990; SI-NEXT: s_or_b64 
exec, exec, s[0:1] 7991; SI-NEXT: s_mov_b32 s7, 0xf000 7992; SI-NEXT: s_mov_b32 s6, -1 7993; SI-NEXT: s_mov_b32 s4, s2 7994; SI-NEXT: s_mov_b32 s5, s3 7995; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 7996; SI-NEXT: s_endpgm 7997; 7998; VI-LABEL: atomic_min_i32_ret_addr64_offset: 7999; VI: ; %bb.0: ; %entry 8000; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 8001; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 8002; VI-NEXT: s_waitcnt lgkmcnt(0) 8003; VI-NEXT: s_ashr_i32 s5, s7, 31 8004; VI-NEXT: s_mov_b32 s4, s7 8005; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 8006; VI-NEXT: s_add_u32 s4, s0, s4 8007; VI-NEXT: s_addc_u32 s5, s1, s5 8008; VI-NEXT: s_load_dword s7, s[4:5], 0x10 8009; VI-NEXT: s_add_u32 s4, s4, 16 8010; VI-NEXT: s_addc_u32 s5, s5, 0 8011; VI-NEXT: v_mov_b32_e32 v0, s4 8012; VI-NEXT: s_mov_b64 s[0:1], 0 8013; VI-NEXT: s_waitcnt lgkmcnt(0) 8014; VI-NEXT: v_mov_b32_e32 v2, s7 8015; VI-NEXT: v_mov_b32_e32 v1, s5 8016; VI-NEXT: .LBB129_1: ; %atomicrmw.start 8017; VI-NEXT: ; =>This Inner Loop Header: Depth=1 8018; VI-NEXT: v_mov_b32_e32 v3, v2 8019; VI-NEXT: v_min_i32_e32 v2, s6, v3 8020; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8021; VI-NEXT: s_waitcnt vmcnt(0) 8022; VI-NEXT: buffer_wbinvl1_vol 8023; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 8024; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 8025; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] 8026; VI-NEXT: s_cbranch_execnz .LBB129_1 8027; VI-NEXT: ; %bb.2: ; %atomicrmw.end 8028; VI-NEXT: s_or_b64 exec, exec, s[0:1] 8029; VI-NEXT: v_mov_b32_e32 v0, s2 8030; VI-NEXT: v_mov_b32_e32 v1, s3 8031; VI-NEXT: flat_store_dword v[0:1], v2 8032; VI-NEXT: s_endpgm 8033; 8034; GFX9-LABEL: atomic_min_i32_ret_addr64_offset: 8035; GFX9: ; %bb.0: ; %entry 8036; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 8037; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 8038; GFX9-NEXT: v_mov_b32_e32 v1, 0 8039; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8040; GFX9-NEXT: s_ashr_i32 s5, s7, 31 8041; GFX9-NEXT: s_mov_b32 s4, s7 8042; GFX9-NEXT: s_lshl_b64 
s[4:5], s[4:5], 2 8043; GFX9-NEXT: s_add_u32 s0, s0, s4 8044; GFX9-NEXT: s_addc_u32 s1, s1, s5 8045; GFX9-NEXT: s_load_dword s7, s[0:1], 0x10 8046; GFX9-NEXT: s_mov_b64 s[4:5], 0 8047; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8048; GFX9-NEXT: v_mov_b32_e32 v0, s7 8049; GFX9-NEXT: .LBB129_1: ; %atomicrmw.start 8050; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 8051; GFX9-NEXT: v_mov_b32_e32 v3, v0 8052; GFX9-NEXT: v_min_i32_e32 v2, s6, v3 8053; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] offset:16 glc 8054; GFX9-NEXT: s_waitcnt vmcnt(0) 8055; GFX9-NEXT: buffer_wbinvl1_vol 8056; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 8057; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 8058; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 8059; GFX9-NEXT: s_cbranch_execnz .LBB129_1 8060; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 8061; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 8062; GFX9-NEXT: v_mov_b32_e32 v1, 0 8063; GFX9-NEXT: global_store_dword v1, v0, s[2:3] 8064; GFX9-NEXT: s_endpgm 8065entry: 8066 %ptr = getelementptr i32, ptr addrspace(1) %out, i32 %index 8067 %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4 8068 %tmp0 = atomicrmw min ptr addrspace(1) %gep, i32 %in seq_cst 8069 store i32 %tmp0, ptr addrspace(1) %out2 8070 ret void 8071} 8072 8073define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) { 8074; SI-LABEL: atomic_min_i32: 8075; SI: ; %bb.0: ; %entry 8076; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 8077; SI-NEXT: s_load_dword s6, s[4:5], 0xb 8078; SI-NEXT: s_waitcnt lgkmcnt(0) 8079; SI-NEXT: s_load_dword s2, s[0:1], 0x0 8080; SI-NEXT: s_mov_b64 s[4:5], 0 8081; SI-NEXT: s_mov_b32 s3, 0xf000 8082; SI-NEXT: s_waitcnt lgkmcnt(0) 8083; SI-NEXT: v_mov_b32_e32 v1, s2 8084; SI-NEXT: s_mov_b32 s2, -1 8085; SI-NEXT: .LBB130_1: ; %atomicrmw.start 8086; SI-NEXT: ; =>This Inner Loop Header: Depth=1 8087; SI-NEXT: v_min_i32_e32 v0, s6, v1 8088; SI-NEXT: s_waitcnt expcnt(0) 8089; SI-NEXT: v_mov_b32_e32 v3, v1 8090; SI-NEXT: v_mov_b32_e32 v2, v0 8091; SI-NEXT: 
buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc 8092; SI-NEXT: s_waitcnt vmcnt(0) 8093; SI-NEXT: buffer_wbinvl1 8094; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 8095; SI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 8096; SI-NEXT: v_mov_b32_e32 v1, v2 8097; SI-NEXT: s_andn2_b64 exec, exec, s[4:5] 8098; SI-NEXT: s_cbranch_execnz .LBB130_1 8099; SI-NEXT: ; %bb.2: ; %atomicrmw.end 8100; SI-NEXT: s_endpgm 8101; 8102; VI-LABEL: atomic_min_i32: 8103; VI: ; %bb.0: ; %entry 8104; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 8105; VI-NEXT: s_load_dword s2, s[4:5], 0x2c 8106; VI-NEXT: s_mov_b64 s[0:1], 0 8107; VI-NEXT: s_waitcnt lgkmcnt(0) 8108; VI-NEXT: s_load_dword s3, s[6:7], 0x0 8109; VI-NEXT: v_mov_b32_e32 v0, s6 8110; VI-NEXT: v_mov_b32_e32 v1, s7 8111; VI-NEXT: s_waitcnt lgkmcnt(0) 8112; VI-NEXT: v_mov_b32_e32 v3, s3 8113; VI-NEXT: .LBB130_1: ; %atomicrmw.start 8114; VI-NEXT: ; =>This Inner Loop Header: Depth=1 8115; VI-NEXT: v_min_i32_e32 v2, s2, v3 8116; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8117; VI-NEXT: s_waitcnt vmcnt(0) 8118; VI-NEXT: buffer_wbinvl1_vol 8119; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 8120; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 8121; VI-NEXT: v_mov_b32_e32 v3, v2 8122; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] 8123; VI-NEXT: s_cbranch_execnz .LBB130_1 8124; VI-NEXT: ; %bb.2: ; %atomicrmw.end 8125; VI-NEXT: s_endpgm 8126; 8127; GFX9-LABEL: atomic_min_i32: 8128; GFX9: ; %bb.0: ; %entry 8129; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 8130; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c 8131; GFX9-NEXT: s_mov_b64 s[2:3], 0 8132; GFX9-NEXT: v_mov_b32_e32 v2, 0 8133; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8134; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 8135; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8136; GFX9-NEXT: v_mov_b32_e32 v1, s4 8137; GFX9-NEXT: .LBB130_1: ; %atomicrmw.start 8138; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 8139; GFX9-NEXT: v_min_i32_e32 v0, s6, v1 8140; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc 8141; GFX9-NEXT: s_waitcnt vmcnt(0) 
8142; GFX9-NEXT: buffer_wbinvl1_vol 8143; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 8144; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] 8145; GFX9-NEXT: v_mov_b32_e32 v1, v0 8146; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] 8147; GFX9-NEXT: s_cbranch_execnz .LBB130_1 8148; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 8149; GFX9-NEXT: s_endpgm 8150entry: 8151 %tmp0 = atomicrmw min ptr addrspace(1) %out, i32 %in seq_cst 8152 ret void 8153} 8154 8155define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %index) { 8156; SI-LABEL: atomic_min_i32_ret_addr64: 8157; SI: ; %bb.0: ; %entry 8158; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd 8159; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 8160; SI-NEXT: s_waitcnt lgkmcnt(0) 8161; SI-NEXT: s_ashr_i32 s5, s9, 31 8162; SI-NEXT: s_mov_b32 s4, s9 8163; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 8164; SI-NEXT: s_add_u32 s4, s0, s4 8165; SI-NEXT: s_addc_u32 s5, s1, s5 8166; SI-NEXT: s_load_dword s6, s[4:5], 0x0 8167; SI-NEXT: s_mov_b64 s[0:1], 0 8168; SI-NEXT: s_mov_b32 s7, 0xf000 8169; SI-NEXT: s_waitcnt lgkmcnt(0) 8170; SI-NEXT: v_mov_b32_e32 v1, s6 8171; SI-NEXT: s_mov_b32 s6, -1 8172; SI-NEXT: .LBB131_1: ; %atomicrmw.start 8173; SI-NEXT: ; =>This Inner Loop Header: Depth=1 8174; SI-NEXT: v_min_i32_e32 v0, s8, v1 8175; SI-NEXT: s_waitcnt expcnt(0) 8176; SI-NEXT: v_mov_b32_e32 v3, v1 8177; SI-NEXT: v_mov_b32_e32 v2, v0 8178; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc 8179; SI-NEXT: s_waitcnt vmcnt(0) 8180; SI-NEXT: buffer_wbinvl1 8181; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 8182; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 8183; SI-NEXT: v_mov_b32_e32 v1, v2 8184; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] 8185; SI-NEXT: s_cbranch_execnz .LBB131_1 8186; SI-NEXT: ; %bb.2: ; %atomicrmw.end 8187; SI-NEXT: s_or_b64 exec, exec, s[0:1] 8188; SI-NEXT: s_mov_b32 s7, 0xf000 8189; SI-NEXT: s_mov_b32 s6, -1 8190; SI-NEXT: s_mov_b32 s4, s2 8191; SI-NEXT: s_mov_b32 s5, s3 8192; SI-NEXT: 
buffer_store_dword v2, off, s[4:7], 0 8193; SI-NEXT: s_endpgm 8194; 8195; VI-LABEL: atomic_min_i32_ret_addr64: 8196; VI: ; %bb.0: ; %entry 8197; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 8198; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 8199; VI-NEXT: s_waitcnt lgkmcnt(0) 8200; VI-NEXT: s_ashr_i32 s5, s7, 31 8201; VI-NEXT: s_mov_b32 s4, s7 8202; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 8203; VI-NEXT: s_add_u32 s4, s0, s4 8204; VI-NEXT: s_addc_u32 s5, s1, s5 8205; VI-NEXT: s_load_dword s7, s[4:5], 0x0 8206; VI-NEXT: v_mov_b32_e32 v0, s4 8207; VI-NEXT: s_mov_b64 s[0:1], 0 8208; VI-NEXT: v_mov_b32_e32 v1, s5 8209; VI-NEXT: s_waitcnt lgkmcnt(0) 8210; VI-NEXT: v_mov_b32_e32 v2, s7 8211; VI-NEXT: .LBB131_1: ; %atomicrmw.start 8212; VI-NEXT: ; =>This Inner Loop Header: Depth=1 8213; VI-NEXT: v_mov_b32_e32 v3, v2 8214; VI-NEXT: v_min_i32_e32 v2, s6, v3 8215; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8216; VI-NEXT: s_waitcnt vmcnt(0) 8217; VI-NEXT: buffer_wbinvl1_vol 8218; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 8219; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 8220; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] 8221; VI-NEXT: s_cbranch_execnz .LBB131_1 8222; VI-NEXT: ; %bb.2: ; %atomicrmw.end 8223; VI-NEXT: s_or_b64 exec, exec, s[0:1] 8224; VI-NEXT: v_mov_b32_e32 v0, s2 8225; VI-NEXT: v_mov_b32_e32 v1, s3 8226; VI-NEXT: flat_store_dword v[0:1], v2 8227; VI-NEXT: s_endpgm 8228; 8229; GFX9-LABEL: atomic_min_i32_ret_addr64: 8230; GFX9: ; %bb.0: ; %entry 8231; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 8232; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 8233; GFX9-NEXT: v_mov_b32_e32 v1, 0 8234; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8235; GFX9-NEXT: s_ashr_i32 s5, s7, 31 8236; GFX9-NEXT: s_mov_b32 s4, s7 8237; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 8238; GFX9-NEXT: s_add_u32 s0, s0, s4 8239; GFX9-NEXT: s_addc_u32 s1, s1, s5 8240; GFX9-NEXT: s_load_dword s7, s[0:1], 0x0 8241; GFX9-NEXT: s_mov_b64 s[4:5], 0 8242; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8243; GFX9-NEXT: v_mov_b32_e32 v0, s7 
8244; GFX9-NEXT: .LBB131_1: ; %atomicrmw.start 8245; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 8246; GFX9-NEXT: v_mov_b32_e32 v3, v0 8247; GFX9-NEXT: v_min_i32_e32 v2, s6, v3 8248; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc 8249; GFX9-NEXT: s_waitcnt vmcnt(0) 8250; GFX9-NEXT: buffer_wbinvl1_vol 8251; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 8252; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 8253; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 8254; GFX9-NEXT: s_cbranch_execnz .LBB131_1 8255; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 8256; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 8257; GFX9-NEXT: v_mov_b32_e32 v1, 0 8258; GFX9-NEXT: global_store_dword v1, v0, s[2:3] 8259; GFX9-NEXT: s_endpgm 8260entry: 8261 %ptr = getelementptr i32, ptr addrspace(1) %out, i32 %index 8262 %tmp0 = atomicrmw min ptr addrspace(1) %ptr, i32 %in seq_cst 8263 store i32 %tmp0, ptr addrspace(1) %out2 8264 ret void 8265} 8266 8267define void @global_atomic_min_i32_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) { 8268; SI-LABEL: global_atomic_min_i32_noret_offset__amdgpu_no_remote_memory: 8269; SI: ; %bb.0: 8270; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8271; SI-NEXT: s_mov_b32 s6, 0 8272; SI-NEXT: s_mov_b32 s7, 0xf000 8273; SI-NEXT: s_mov_b32 s4, s6 8274; SI-NEXT: s_mov_b32 s5, s6 8275; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 8276; SI-NEXT: s_mov_b64 s[8:9], 0 8277; SI-NEXT: .LBB132_1: ; %atomicrmw.start 8278; SI-NEXT: ; =>This Inner Loop Header: Depth=1 8279; SI-NEXT: s_waitcnt vmcnt(0) 8280; SI-NEXT: v_min_i32_e32 v3, v4, v2 8281; SI-NEXT: s_waitcnt expcnt(0) 8282; SI-NEXT: v_mov_b32_e32 v6, v4 8283; SI-NEXT: v_mov_b32_e32 v5, v3 8284; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc 8285; SI-NEXT: s_waitcnt vmcnt(0) 8286; SI-NEXT: buffer_wbinvl1 8287; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 8288; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 8289; SI-NEXT: v_mov_b32_e32 v4, v5 8290; SI-NEXT: s_andn2_b64 exec, 
exec, s[8:9] 8291; SI-NEXT: s_cbranch_execnz .LBB132_1 8292; SI-NEXT: ; %bb.2: ; %atomicrmw.end 8293; SI-NEXT: s_or_b64 exec, exec, s[8:9] 8294; SI-NEXT: s_waitcnt expcnt(0) 8295; SI-NEXT: s_setpc_b64 s[30:31] 8296; 8297; VI-LABEL: global_atomic_min_i32_noret_offset__amdgpu_no_remote_memory: 8298; VI: ; %bb.0: 8299; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8300; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 8301; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 8302; VI-NEXT: flat_load_dword v4, v[0:1] 8303; VI-NEXT: s_mov_b64 s[4:5], 0 8304; VI-NEXT: .LBB132_1: ; %atomicrmw.start 8305; VI-NEXT: ; =>This Inner Loop Header: Depth=1 8306; VI-NEXT: s_waitcnt vmcnt(0) 8307; VI-NEXT: v_min_i32_e32 v3, v4, v2 8308; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 8309; VI-NEXT: s_waitcnt vmcnt(0) 8310; VI-NEXT: buffer_wbinvl1_vol 8311; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 8312; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 8313; VI-NEXT: v_mov_b32_e32 v4, v3 8314; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] 8315; VI-NEXT: s_cbranch_execnz .LBB132_1 8316; VI-NEXT: ; %bb.2: ; %atomicrmw.end 8317; VI-NEXT: s_or_b64 exec, exec, s[4:5] 8318; VI-NEXT: s_setpc_b64 s[30:31] 8319; 8320; GFX9-LABEL: global_atomic_min_i32_noret_offset__amdgpu_no_remote_memory: 8321; GFX9: ; %bb.0: 8322; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8323; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 8324; GFX9-NEXT: s_mov_b64 s[4:5], 0 8325; GFX9-NEXT: .LBB132_1: ; %atomicrmw.start 8326; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 8327; GFX9-NEXT: s_waitcnt vmcnt(0) 8328; GFX9-NEXT: v_min_i32_e32 v3, v4, v2 8329; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc 8330; GFX9-NEXT: s_waitcnt vmcnt(0) 8331; GFX9-NEXT: buffer_wbinvl1_vol 8332; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 8333; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 8334; GFX9-NEXT: v_mov_b32_e32 v4, v3 8335; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 8336; GFX9-NEXT: s_cbranch_execnz .LBB132_1 8337; GFX9-NEXT: ; 
%bb.2: ; %atomicrmw.end 8338; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 8339; GFX9-NEXT: s_setpc_b64 s[30:31] 8340 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 8341 %tmp0 = atomicrmw min ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 8342 ret void 8343} 8344 8345define i32 @global_atomic_min_i32_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) { 8346; SI-LABEL: global_atomic_min_i32_ret_offset__amdgpu_no_remote_memory: 8347; SI: ; %bb.0: 8348; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8349; SI-NEXT: s_mov_b32 s6, 0 8350; SI-NEXT: s_mov_b32 s7, 0xf000 8351; SI-NEXT: s_mov_b32 s4, s6 8352; SI-NEXT: s_mov_b32 s5, s6 8353; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 8354; SI-NEXT: s_mov_b64 s[8:9], 0 8355; SI-NEXT: .LBB133_1: ; %atomicrmw.start 8356; SI-NEXT: ; =>This Inner Loop Header: Depth=1 8357; SI-NEXT: s_waitcnt vmcnt(0) 8358; SI-NEXT: v_mov_b32_e32 v5, v3 8359; SI-NEXT: s_waitcnt expcnt(0) 8360; SI-NEXT: v_min_i32_e32 v4, v5, v2 8361; SI-NEXT: v_mov_b32_e32 v3, v4 8362; SI-NEXT: v_mov_b32_e32 v4, v5 8363; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc 8364; SI-NEXT: s_waitcnt vmcnt(0) 8365; SI-NEXT: buffer_wbinvl1 8366; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 8367; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 8368; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] 8369; SI-NEXT: s_cbranch_execnz .LBB133_1 8370; SI-NEXT: ; %bb.2: ; %atomicrmw.end 8371; SI-NEXT: s_or_b64 exec, exec, s[8:9] 8372; SI-NEXT: v_mov_b32_e32 v0, v3 8373; SI-NEXT: s_waitcnt expcnt(0) 8374; SI-NEXT: s_setpc_b64 s[30:31] 8375; 8376; VI-LABEL: global_atomic_min_i32_ret_offset__amdgpu_no_remote_memory: 8377; VI: ; %bb.0: 8378; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8379; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0 8380; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 8381; VI-NEXT: flat_load_dword v0, v[3:4] 8382; VI-NEXT: s_mov_b64 s[4:5], 0 8383; VI-NEXT: .LBB133_1: ; %atomicrmw.start 8384; VI-NEXT: ; 
=>This Inner Loop Header: Depth=1 8385; VI-NEXT: s_waitcnt vmcnt(0) 8386; VI-NEXT: v_mov_b32_e32 v1, v0 8387; VI-NEXT: v_min_i32_e32 v0, v1, v2 8388; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 8389; VI-NEXT: s_waitcnt vmcnt(0) 8390; VI-NEXT: buffer_wbinvl1_vol 8391; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 8392; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 8393; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] 8394; VI-NEXT: s_cbranch_execnz .LBB133_1 8395; VI-NEXT: ; %bb.2: ; %atomicrmw.end 8396; VI-NEXT: s_or_b64 exec, exec, s[4:5] 8397; VI-NEXT: s_setpc_b64 s[30:31] 8398; 8399; GFX9-LABEL: global_atomic_min_i32_ret_offset__amdgpu_no_remote_memory: 8400; GFX9: ; %bb.0: 8401; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8402; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 8403; GFX9-NEXT: s_mov_b64 s[4:5], 0 8404; GFX9-NEXT: .LBB133_1: ; %atomicrmw.start 8405; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 8406; GFX9-NEXT: s_waitcnt vmcnt(0) 8407; GFX9-NEXT: v_mov_b32_e32 v4, v3 8408; GFX9-NEXT: v_min_i32_e32 v3, v4, v2 8409; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc 8410; GFX9-NEXT: s_waitcnt vmcnt(0) 8411; GFX9-NEXT: buffer_wbinvl1_vol 8412; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 8413; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 8414; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 8415; GFX9-NEXT: s_cbranch_execnz .LBB133_1 8416; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 8417; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 8418; GFX9-NEXT: v_mov_b32_e32 v0, v3 8419; GFX9-NEXT: s_setpc_b64 s[30:31] 8420 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 8421 %result = atomicrmw min ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 8422 ret i32 %result 8423} 8424 8425; --------------------------------------------------------------------- 8426; atomicrmw uinc_wrap 8427; --------------------------------------------------------------------- 8428 8429define void @global_atomic_uinc_wrap_i32_noret(ptr addrspace(1) %ptr, i32 %in) { 
8430; SI-LABEL: global_atomic_uinc_wrap_i32_noret: 8431; SI: ; %bb.0: 8432; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8433; SI-NEXT: s_mov_b32 s6, 0 8434; SI-NEXT: s_mov_b32 s7, 0xf000 8435; SI-NEXT: s_mov_b32 s4, s6 8436; SI-NEXT: s_mov_b32 s5, s6 8437; SI-NEXT: buffer_atomic_inc v2, v[0:1], s[4:7], 0 addr64 8438; SI-NEXT: s_waitcnt vmcnt(0) 8439; SI-NEXT: buffer_wbinvl1 8440; SI-NEXT: s_waitcnt expcnt(0) 8441; SI-NEXT: s_setpc_b64 s[30:31] 8442; 8443; VI-LABEL: global_atomic_uinc_wrap_i32_noret: 8444; VI: ; %bb.0: 8445; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8446; VI-NEXT: flat_atomic_inc v[0:1], v2 8447; VI-NEXT: s_waitcnt vmcnt(0) 8448; VI-NEXT: buffer_wbinvl1_vol 8449; VI-NEXT: s_setpc_b64 s[30:31] 8450; 8451; GFX9-LABEL: global_atomic_uinc_wrap_i32_noret: 8452; GFX9: ; %bb.0: 8453; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8454; GFX9-NEXT: global_atomic_inc v[0:1], v2, off 8455; GFX9-NEXT: s_waitcnt vmcnt(0) 8456; GFX9-NEXT: buffer_wbinvl1_vol 8457; GFX9-NEXT: s_setpc_b64 s[30:31] 8458 %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 %in seq_cst 8459 ret void 8460} 8461 8462define void @global_atomic_uinc_wrap_i32_noret_offset(ptr addrspace(1) %out, i32 %in) { 8463; SI-LABEL: global_atomic_uinc_wrap_i32_noret_offset: 8464; SI: ; %bb.0: 8465; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8466; SI-NEXT: s_mov_b32 s6, 0 8467; SI-NEXT: s_mov_b32 s7, 0xf000 8468; SI-NEXT: s_mov_b32 s4, s6 8469; SI-NEXT: s_mov_b32 s5, s6 8470; SI-NEXT: buffer_atomic_inc v2, v[0:1], s[4:7], 0 addr64 offset:16 8471; SI-NEXT: s_waitcnt vmcnt(0) 8472; SI-NEXT: buffer_wbinvl1 8473; SI-NEXT: s_waitcnt expcnt(0) 8474; SI-NEXT: s_setpc_b64 s[30:31] 8475; 8476; VI-LABEL: global_atomic_uinc_wrap_i32_noret_offset: 8477; VI: ; %bb.0: 8478; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8479; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 8480; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 8481; VI-NEXT: flat_atomic_inc v[0:1], v2 8482; VI-NEXT: s_waitcnt vmcnt(0) 
8483; VI-NEXT: buffer_wbinvl1_vol 8484; VI-NEXT: s_setpc_b64 s[30:31] 8485; 8486; GFX9-LABEL: global_atomic_uinc_wrap_i32_noret_offset: 8487; GFX9: ; %bb.0: 8488; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8489; GFX9-NEXT: global_atomic_inc v[0:1], v2, off offset:16 8490; GFX9-NEXT: s_waitcnt vmcnt(0) 8491; GFX9-NEXT: buffer_wbinvl1_vol 8492; GFX9-NEXT: s_setpc_b64 s[30:31] 8493 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 8494 %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 %in seq_cst 8495 ret void 8496} 8497 8498define i32 @global_atomic_uinc_wrap_i32_ret(ptr addrspace(1) %ptr, i32 %in) { 8499; SI-LABEL: global_atomic_uinc_wrap_i32_ret: 8500; SI: ; %bb.0: 8501; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8502; SI-NEXT: s_mov_b32 s6, 0 8503; SI-NEXT: s_mov_b32 s7, 0xf000 8504; SI-NEXT: s_mov_b32 s4, s6 8505; SI-NEXT: s_mov_b32 s5, s6 8506; SI-NEXT: buffer_atomic_inc v2, v[0:1], s[4:7], 0 addr64 glc 8507; SI-NEXT: s_waitcnt vmcnt(0) 8508; SI-NEXT: buffer_wbinvl1 8509; SI-NEXT: v_mov_b32_e32 v0, v2 8510; SI-NEXT: s_waitcnt expcnt(0) 8511; SI-NEXT: s_setpc_b64 s[30:31] 8512; 8513; VI-LABEL: global_atomic_uinc_wrap_i32_ret: 8514; VI: ; %bb.0: 8515; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8516; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc 8517; VI-NEXT: s_waitcnt vmcnt(0) 8518; VI-NEXT: buffer_wbinvl1_vol 8519; VI-NEXT: s_setpc_b64 s[30:31] 8520; 8521; GFX9-LABEL: global_atomic_uinc_wrap_i32_ret: 8522; GFX9: ; %bb.0: 8523; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8524; GFX9-NEXT: global_atomic_inc v0, v[0:1], v2, off glc 8525; GFX9-NEXT: s_waitcnt vmcnt(0) 8526; GFX9-NEXT: buffer_wbinvl1_vol 8527; GFX9-NEXT: s_setpc_b64 s[30:31] 8528 %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 %in seq_cst 8529 ret i32 %result 8530} 8531 8532define i32 @global_atomic_uinc_wrap_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { 8533; SI-LABEL: global_atomic_uinc_wrap_i32_ret_offset: 8534; SI: ; %bb.0: 8535; SI-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) 8536; SI-NEXT: s_mov_b32 s6, 0 8537; SI-NEXT: s_mov_b32 s7, 0xf000 8538; SI-NEXT: s_mov_b32 s4, s6 8539; SI-NEXT: s_mov_b32 s5, s6 8540; SI-NEXT: buffer_atomic_inc v2, v[0:1], s[4:7], 0 addr64 offset:16 glc 8541; SI-NEXT: s_waitcnt vmcnt(0) 8542; SI-NEXT: buffer_wbinvl1 8543; SI-NEXT: v_mov_b32_e32 v0, v2 8544; SI-NEXT: s_waitcnt expcnt(0) 8545; SI-NEXT: s_setpc_b64 s[30:31] 8546; 8547; VI-LABEL: global_atomic_uinc_wrap_i32_ret_offset: 8548; VI: ; %bb.0: 8549; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8550; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 8551; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 8552; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc 8553; VI-NEXT: s_waitcnt vmcnt(0) 8554; VI-NEXT: buffer_wbinvl1_vol 8555; VI-NEXT: s_setpc_b64 s[30:31] 8556; 8557; GFX9-LABEL: global_atomic_uinc_wrap_i32_ret_offset: 8558; GFX9: ; %bb.0: 8559; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8560; GFX9-NEXT: global_atomic_inc v0, v[0:1], v2, off offset:16 glc 8561; GFX9-NEXT: s_waitcnt vmcnt(0) 8562; GFX9-NEXT: buffer_wbinvl1_vol 8563; GFX9-NEXT: s_setpc_b64 s[30:31] 8564 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 8565 %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 %in seq_cst 8566 ret i32 %result 8567} 8568 8569define amdgpu_gfx void @global_atomic_uinc_wrap_i32_noret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) { 8570; SI-LABEL: global_atomic_uinc_wrap_i32_noret_scalar: 8571; SI: ; %bb.0: 8572; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8573; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 8574; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill 8575; SI-NEXT: s_mov_b64 exec, s[34:35] 8576; SI-NEXT: s_waitcnt expcnt(0) 8577; SI-NEXT: v_writelane_b32 v1, s6, 0 8578; SI-NEXT: v_writelane_b32 v1, s7, 1 8579; SI-NEXT: s_mov_b32 s34, s6 8580; SI-NEXT: s_mov_b32 s7, 0xf000 8581; SI-NEXT: s_mov_b32 s6, -1 8582; SI-NEXT: v_mov_b32_e32 v0, s34 8583; SI-NEXT: s_waitcnt vmcnt(0) 8584; SI-NEXT: 
buffer_atomic_inc v0, off, s[4:7], 0 8585; SI-NEXT: s_waitcnt vmcnt(0) 8586; SI-NEXT: buffer_wbinvl1 8587; SI-NEXT: v_readlane_b32 s7, v1, 1 8588; SI-NEXT: v_readlane_b32 s6, v1, 0 8589; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 8590; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload 8591; SI-NEXT: s_mov_b64 exec, s[34:35] 8592; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 8593; SI-NEXT: s_setpc_b64 s[30:31] 8594; 8595; VI-LABEL: global_atomic_uinc_wrap_i32_noret_scalar: 8596; VI: ; %bb.0: 8597; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8598; VI-NEXT: v_mov_b32_e32 v0, s4 8599; VI-NEXT: v_mov_b32_e32 v1, s5 8600; VI-NEXT: v_mov_b32_e32 v2, s6 8601; VI-NEXT: flat_atomic_inc v[0:1], v2 8602; VI-NEXT: s_waitcnt vmcnt(0) 8603; VI-NEXT: buffer_wbinvl1_vol 8604; VI-NEXT: s_setpc_b64 s[30:31] 8605; 8606; GFX9-LABEL: global_atomic_uinc_wrap_i32_noret_scalar: 8607; GFX9: ; %bb.0: 8608; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8609; GFX9-NEXT: v_mov_b32_e32 v0, 0 8610; GFX9-NEXT: v_mov_b32_e32 v1, s6 8611; GFX9-NEXT: global_atomic_inc v0, v1, s[4:5] 8612; GFX9-NEXT: s_waitcnt vmcnt(0) 8613; GFX9-NEXT: buffer_wbinvl1_vol 8614; GFX9-NEXT: s_setpc_b64 s[30:31] 8615 %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 %in seq_cst 8616 ret void 8617} 8618 8619define amdgpu_gfx void @global_atomic_uinc_wrap_i32_noret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) { 8620; SI-LABEL: global_atomic_uinc_wrap_i32_noret_offset_scalar: 8621; SI: ; %bb.0: 8622; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8623; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 8624; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill 8625; SI-NEXT: s_mov_b64 exec, s[34:35] 8626; SI-NEXT: s_waitcnt expcnt(0) 8627; SI-NEXT: v_writelane_b32 v1, s6, 0 8628; SI-NEXT: v_writelane_b32 v1, s7, 1 8629; SI-NEXT: s_mov_b32 s34, s6 8630; SI-NEXT: s_mov_b32 s7, 0xf000 8631; SI-NEXT: s_mov_b32 s6, -1 8632; SI-NEXT: v_mov_b32_e32 v0, s34 8633; SI-NEXT: s_waitcnt 
vmcnt(0) 8634; SI-NEXT: buffer_atomic_inc v0, off, s[4:7], 0 offset:16 8635; SI-NEXT: s_waitcnt vmcnt(0) 8636; SI-NEXT: buffer_wbinvl1 8637; SI-NEXT: v_readlane_b32 s7, v1, 1 8638; SI-NEXT: v_readlane_b32 s6, v1, 0 8639; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 8640; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload 8641; SI-NEXT: s_mov_b64 exec, s[34:35] 8642; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 8643; SI-NEXT: s_setpc_b64 s[30:31] 8644; 8645; VI-LABEL: global_atomic_uinc_wrap_i32_noret_offset_scalar: 8646; VI: ; %bb.0: 8647; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8648; VI-NEXT: s_add_u32 s34, s4, 16 8649; VI-NEXT: s_addc_u32 s35, s5, 0 8650; VI-NEXT: v_mov_b32_e32 v0, s34 8651; VI-NEXT: v_mov_b32_e32 v1, s35 8652; VI-NEXT: v_mov_b32_e32 v2, s6 8653; VI-NEXT: flat_atomic_inc v[0:1], v2 8654; VI-NEXT: s_waitcnt vmcnt(0) 8655; VI-NEXT: buffer_wbinvl1_vol 8656; VI-NEXT: s_setpc_b64 s[30:31] 8657; 8658; GFX9-LABEL: global_atomic_uinc_wrap_i32_noret_offset_scalar: 8659; GFX9: ; %bb.0: 8660; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8661; GFX9-NEXT: v_mov_b32_e32 v0, 0 8662; GFX9-NEXT: v_mov_b32_e32 v1, s6 8663; GFX9-NEXT: global_atomic_inc v0, v1, s[4:5] offset:16 8664; GFX9-NEXT: s_waitcnt vmcnt(0) 8665; GFX9-NEXT: buffer_wbinvl1_vol 8666; GFX9-NEXT: s_setpc_b64 s[30:31] 8667 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 8668 %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 %in seq_cst 8669 ret void 8670} 8671 8672define amdgpu_gfx i32 @global_atomic_uinc_wrap_i32_ret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) { 8673; SI-LABEL: global_atomic_uinc_wrap_i32_ret_scalar: 8674; SI: ; %bb.0: 8675; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8676; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 8677; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill 8678; SI-NEXT: s_mov_b64 exec, s[34:35] 8679; SI-NEXT: s_waitcnt expcnt(0) 8680; SI-NEXT: v_writelane_b32 v1, s6, 0 8681; SI-NEXT: v_writelane_b32 
v1, s7, 1 8682; SI-NEXT: s_mov_b32 s34, s6 8683; SI-NEXT: s_mov_b32 s7, 0xf000 8684; SI-NEXT: s_mov_b32 s6, -1 8685; SI-NEXT: v_mov_b32_e32 v0, s34 8686; SI-NEXT: s_waitcnt vmcnt(0) 8687; SI-NEXT: buffer_atomic_inc v0, off, s[4:7], 0 glc 8688; SI-NEXT: s_waitcnt vmcnt(0) 8689; SI-NEXT: buffer_wbinvl1 8690; SI-NEXT: v_readlane_b32 s7, v1, 1 8691; SI-NEXT: v_readlane_b32 s6, v1, 0 8692; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 8693; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload 8694; SI-NEXT: s_mov_b64 exec, s[34:35] 8695; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 8696; SI-NEXT: s_setpc_b64 s[30:31] 8697; 8698; VI-LABEL: global_atomic_uinc_wrap_i32_ret_scalar: 8699; VI: ; %bb.0: 8700; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8701; VI-NEXT: v_mov_b32_e32 v0, s4 8702; VI-NEXT: v_mov_b32_e32 v1, s5 8703; VI-NEXT: v_mov_b32_e32 v2, s6 8704; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc 8705; VI-NEXT: s_waitcnt vmcnt(0) 8706; VI-NEXT: buffer_wbinvl1_vol 8707; VI-NEXT: s_setpc_b64 s[30:31] 8708; 8709; GFX9-LABEL: global_atomic_uinc_wrap_i32_ret_scalar: 8710; GFX9: ; %bb.0: 8711; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8712; GFX9-NEXT: v_mov_b32_e32 v0, 0 8713; GFX9-NEXT: v_mov_b32_e32 v1, s6 8714; GFX9-NEXT: global_atomic_inc v0, v0, v1, s[4:5] glc 8715; GFX9-NEXT: s_waitcnt vmcnt(0) 8716; GFX9-NEXT: buffer_wbinvl1_vol 8717; GFX9-NEXT: s_setpc_b64 s[30:31] 8718 %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 %in seq_cst 8719 ret i32 %result 8720} 8721 8722define amdgpu_gfx i32 @global_atomic_uinc_wrap_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) { 8723; SI-LABEL: global_atomic_uinc_wrap_i32_ret_offset_scalar: 8724; SI: ; %bb.0: 8725; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8726; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 8727; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill 8728; SI-NEXT: s_mov_b64 exec, s[34:35] 8729; SI-NEXT: s_waitcnt expcnt(0) 8730; SI-NEXT: 
v_writelane_b32 v1, s6, 0 8731; SI-NEXT: v_writelane_b32 v1, s7, 1 8732; SI-NEXT: s_mov_b32 s34, s6 8733; SI-NEXT: s_mov_b32 s7, 0xf000 8734; SI-NEXT: s_mov_b32 s6, -1 8735; SI-NEXT: v_mov_b32_e32 v0, s34 8736; SI-NEXT: s_waitcnt vmcnt(0) 8737; SI-NEXT: buffer_atomic_inc v0, off, s[4:7], 0 offset:16 glc 8738; SI-NEXT: s_waitcnt vmcnt(0) 8739; SI-NEXT: buffer_wbinvl1 8740; SI-NEXT: v_readlane_b32 s7, v1, 1 8741; SI-NEXT: v_readlane_b32 s6, v1, 0 8742; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 8743; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload 8744; SI-NEXT: s_mov_b64 exec, s[34:35] 8745; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 8746; SI-NEXT: s_setpc_b64 s[30:31] 8747; 8748; VI-LABEL: global_atomic_uinc_wrap_i32_ret_offset_scalar: 8749; VI: ; %bb.0: 8750; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8751; VI-NEXT: s_add_u32 s34, s4, 16 8752; VI-NEXT: s_addc_u32 s35, s5, 0 8753; VI-NEXT: v_mov_b32_e32 v0, s34 8754; VI-NEXT: v_mov_b32_e32 v1, s35 8755; VI-NEXT: v_mov_b32_e32 v2, s6 8756; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc 8757; VI-NEXT: s_waitcnt vmcnt(0) 8758; VI-NEXT: buffer_wbinvl1_vol 8759; VI-NEXT: s_setpc_b64 s[30:31] 8760; 8761; GFX9-LABEL: global_atomic_uinc_wrap_i32_ret_offset_scalar: 8762; GFX9: ; %bb.0: 8763; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8764; GFX9-NEXT: v_mov_b32_e32 v0, 0 8765; GFX9-NEXT: v_mov_b32_e32 v1, s6 8766; GFX9-NEXT: global_atomic_inc v0, v0, v1, s[4:5] offset:16 glc 8767; GFX9-NEXT: s_waitcnt vmcnt(0) 8768; GFX9-NEXT: buffer_wbinvl1_vol 8769; GFX9-NEXT: s_setpc_b64 s[30:31] 8770 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 8771 %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 %in seq_cst 8772 ret i32 %result 8773} 8774 8775define void @global_atomic_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) { 8776; SI-LABEL: global_atomic_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory: 8777; SI: ; %bb.0: 8778; SI-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) 8779; SI-NEXT: s_mov_b32 s6, 0 8780; SI-NEXT: s_mov_b32 s7, 0xf000 8781; SI-NEXT: s_mov_b32 s4, s6 8782; SI-NEXT: s_mov_b32 s5, s6 8783; SI-NEXT: buffer_atomic_inc v2, v[0:1], s[4:7], 0 addr64 offset:16 8784; SI-NEXT: s_waitcnt vmcnt(0) 8785; SI-NEXT: buffer_wbinvl1 8786; SI-NEXT: s_waitcnt expcnt(0) 8787; SI-NEXT: s_setpc_b64 s[30:31] 8788; 8789; VI-LABEL: global_atomic_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory: 8790; VI: ; %bb.0: 8791; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8792; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 8793; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 8794; VI-NEXT: flat_atomic_inc v[0:1], v2 8795; VI-NEXT: s_waitcnt vmcnt(0) 8796; VI-NEXT: buffer_wbinvl1_vol 8797; VI-NEXT: s_setpc_b64 s[30:31] 8798; 8799; GFX9-LABEL: global_atomic_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory: 8800; GFX9: ; %bb.0: 8801; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8802; GFX9-NEXT: global_atomic_inc v[0:1], v2, off offset:16 8803; GFX9-NEXT: s_waitcnt vmcnt(0) 8804; GFX9-NEXT: buffer_wbinvl1_vol 8805; GFX9-NEXT: s_setpc_b64 s[30:31] 8806 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 8807 %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 8808 ret void 8809} 8810 8811define i32 @global_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) { 8812; SI-LABEL: global_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory: 8813; SI: ; %bb.0: 8814; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8815; SI-NEXT: s_mov_b32 s6, 0 8816; SI-NEXT: s_mov_b32 s7, 0xf000 8817; SI-NEXT: s_mov_b32 s4, s6 8818; SI-NEXT: s_mov_b32 s5, s6 8819; SI-NEXT: buffer_atomic_inc v2, v[0:1], s[4:7], 0 addr64 offset:16 glc 8820; SI-NEXT: s_waitcnt vmcnt(0) 8821; SI-NEXT: buffer_wbinvl1 8822; SI-NEXT: v_mov_b32_e32 v0, v2 8823; SI-NEXT: s_waitcnt expcnt(0) 8824; SI-NEXT: s_setpc_b64 s[30:31] 8825; 8826; VI-LABEL: 
global_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory: 8827; VI: ; %bb.0: 8828; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8829; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 8830; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 8831; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc 8832; VI-NEXT: s_waitcnt vmcnt(0) 8833; VI-NEXT: buffer_wbinvl1_vol 8834; VI-NEXT: s_setpc_b64 s[30:31] 8835; 8836; GFX9-LABEL: global_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory: 8837; GFX9: ; %bb.0: 8838; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8839; GFX9-NEXT: global_atomic_inc v0, v[0:1], v2, off offset:16 glc 8840; GFX9-NEXT: s_waitcnt vmcnt(0) 8841; GFX9-NEXT: buffer_wbinvl1_vol 8842; GFX9-NEXT: s_setpc_b64 s[30:31] 8843 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 8844 %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 8845 ret i32 %result 8846} 8847 8848; --------------------------------------------------------------------- 8849; atomicrmw udec_wrap 8850; --------------------------------------------------------------------- 8851 8852define void @global_atomic_udec_wrap_i32_noret(ptr addrspace(1) %ptr, i32 %in) { 8853; SI-LABEL: global_atomic_udec_wrap_i32_noret: 8854; SI: ; %bb.0: 8855; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8856; SI-NEXT: s_mov_b32 s6, 0 8857; SI-NEXT: s_mov_b32 s7, 0xf000 8858; SI-NEXT: s_mov_b32 s4, s6 8859; SI-NEXT: s_mov_b32 s5, s6 8860; SI-NEXT: buffer_atomic_dec v2, v[0:1], s[4:7], 0 addr64 8861; SI-NEXT: s_waitcnt vmcnt(0) 8862; SI-NEXT: buffer_wbinvl1 8863; SI-NEXT: s_waitcnt expcnt(0) 8864; SI-NEXT: s_setpc_b64 s[30:31] 8865; 8866; VI-LABEL: global_atomic_udec_wrap_i32_noret: 8867; VI: ; %bb.0: 8868; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8869; VI-NEXT: flat_atomic_dec v[0:1], v2 8870; VI-NEXT: s_waitcnt vmcnt(0) 8871; VI-NEXT: buffer_wbinvl1_vol 8872; VI-NEXT: s_setpc_b64 s[30:31] 8873; 8874; GFX9-LABEL: global_atomic_udec_wrap_i32_noret: 8875; GFX9: ; 
%bb.0: 8876; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8877; GFX9-NEXT: global_atomic_dec v[0:1], v2, off 8878; GFX9-NEXT: s_waitcnt vmcnt(0) 8879; GFX9-NEXT: buffer_wbinvl1_vol 8880; GFX9-NEXT: s_setpc_b64 s[30:31] 8881 %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 %in seq_cst 8882 ret void 8883} 8884 8885define void @global_atomic_udec_wrap_i32_noret_offset(ptr addrspace(1) %out, i32 %in) { 8886; SI-LABEL: global_atomic_udec_wrap_i32_noret_offset: 8887; SI: ; %bb.0: 8888; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8889; SI-NEXT: s_mov_b32 s6, 0 8890; SI-NEXT: s_mov_b32 s7, 0xf000 8891; SI-NEXT: s_mov_b32 s4, s6 8892; SI-NEXT: s_mov_b32 s5, s6 8893; SI-NEXT: buffer_atomic_dec v2, v[0:1], s[4:7], 0 addr64 offset:16 8894; SI-NEXT: s_waitcnt vmcnt(0) 8895; SI-NEXT: buffer_wbinvl1 8896; SI-NEXT: s_waitcnt expcnt(0) 8897; SI-NEXT: s_setpc_b64 s[30:31] 8898; 8899; VI-LABEL: global_atomic_udec_wrap_i32_noret_offset: 8900; VI: ; %bb.0: 8901; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8902; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 8903; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 8904; VI-NEXT: flat_atomic_dec v[0:1], v2 8905; VI-NEXT: s_waitcnt vmcnt(0) 8906; VI-NEXT: buffer_wbinvl1_vol 8907; VI-NEXT: s_setpc_b64 s[30:31] 8908; 8909; GFX9-LABEL: global_atomic_udec_wrap_i32_noret_offset: 8910; GFX9: ; %bb.0: 8911; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8912; GFX9-NEXT: global_atomic_dec v[0:1], v2, off offset:16 8913; GFX9-NEXT: s_waitcnt vmcnt(0) 8914; GFX9-NEXT: buffer_wbinvl1_vol 8915; GFX9-NEXT: s_setpc_b64 s[30:31] 8916 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 8917 %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 %in seq_cst 8918 ret void 8919} 8920 8921define i32 @global_atomic_udec_wrap_i32_ret(ptr addrspace(1) %ptr, i32 %in) { 8922; SI-LABEL: global_atomic_udec_wrap_i32_ret: 8923; SI: ; %bb.0: 8924; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8925; SI-NEXT: s_mov_b32 s6, 0 8926; SI-NEXT: s_mov_b32 s7, 
0xf000 8927; SI-NEXT: s_mov_b32 s4, s6 8928; SI-NEXT: s_mov_b32 s5, s6 8929; SI-NEXT: buffer_atomic_dec v2, v[0:1], s[4:7], 0 addr64 glc 8930; SI-NEXT: s_waitcnt vmcnt(0) 8931; SI-NEXT: buffer_wbinvl1 8932; SI-NEXT: v_mov_b32_e32 v0, v2 8933; SI-NEXT: s_waitcnt expcnt(0) 8934; SI-NEXT: s_setpc_b64 s[30:31] 8935; 8936; VI-LABEL: global_atomic_udec_wrap_i32_ret: 8937; VI: ; %bb.0: 8938; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8939; VI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc 8940; VI-NEXT: s_waitcnt vmcnt(0) 8941; VI-NEXT: buffer_wbinvl1_vol 8942; VI-NEXT: s_setpc_b64 s[30:31] 8943; 8944; GFX9-LABEL: global_atomic_udec_wrap_i32_ret: 8945; GFX9: ; %bb.0: 8946; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8947; GFX9-NEXT: global_atomic_dec v0, v[0:1], v2, off glc 8948; GFX9-NEXT: s_waitcnt vmcnt(0) 8949; GFX9-NEXT: buffer_wbinvl1_vol 8950; GFX9-NEXT: s_setpc_b64 s[30:31] 8951 %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 %in seq_cst 8952 ret i32 %result 8953} 8954 8955define i32 @global_atomic_udec_wrap_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { 8956; SI-LABEL: global_atomic_udec_wrap_i32_ret_offset: 8957; SI: ; %bb.0: 8958; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8959; SI-NEXT: s_mov_b32 s6, 0 8960; SI-NEXT: s_mov_b32 s7, 0xf000 8961; SI-NEXT: s_mov_b32 s4, s6 8962; SI-NEXT: s_mov_b32 s5, s6 8963; SI-NEXT: buffer_atomic_dec v2, v[0:1], s[4:7], 0 addr64 offset:16 glc 8964; SI-NEXT: s_waitcnt vmcnt(0) 8965; SI-NEXT: buffer_wbinvl1 8966; SI-NEXT: v_mov_b32_e32 v0, v2 8967; SI-NEXT: s_waitcnt expcnt(0) 8968; SI-NEXT: s_setpc_b64 s[30:31] 8969; 8970; VI-LABEL: global_atomic_udec_wrap_i32_ret_offset: 8971; VI: ; %bb.0: 8972; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8973; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 8974; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 8975; VI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc 8976; VI-NEXT: s_waitcnt vmcnt(0) 8977; VI-NEXT: buffer_wbinvl1_vol 8978; VI-NEXT: s_setpc_b64 s[30:31] 8979; 8980; 
GFX9-LABEL: global_atomic_udec_wrap_i32_ret_offset: 8981; GFX9: ; %bb.0: 8982; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8983; GFX9-NEXT: global_atomic_dec v0, v[0:1], v2, off offset:16 glc 8984; GFX9-NEXT: s_waitcnt vmcnt(0) 8985; GFX9-NEXT: buffer_wbinvl1_vol 8986; GFX9-NEXT: s_setpc_b64 s[30:31] 8987 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 8988 %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 %in seq_cst 8989 ret i32 %result 8990} 8991 8992define amdgpu_gfx void @global_atomic_udec_wrap_i32_noret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) { 8993; SI-LABEL: global_atomic_udec_wrap_i32_noret_scalar: 8994; SI: ; %bb.0: 8995; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8996; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 8997; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill 8998; SI-NEXT: s_mov_b64 exec, s[34:35] 8999; SI-NEXT: s_waitcnt expcnt(0) 9000; SI-NEXT: v_writelane_b32 v1, s6, 0 9001; SI-NEXT: v_writelane_b32 v1, s7, 1 9002; SI-NEXT: s_mov_b32 s34, s6 9003; SI-NEXT: s_mov_b32 s7, 0xf000 9004; SI-NEXT: s_mov_b32 s6, -1 9005; SI-NEXT: v_mov_b32_e32 v0, s34 9006; SI-NEXT: s_waitcnt vmcnt(0) 9007; SI-NEXT: buffer_atomic_dec v0, off, s[4:7], 0 9008; SI-NEXT: s_waitcnt vmcnt(0) 9009; SI-NEXT: buffer_wbinvl1 9010; SI-NEXT: v_readlane_b32 s7, v1, 1 9011; SI-NEXT: v_readlane_b32 s6, v1, 0 9012; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 9013; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload 9014; SI-NEXT: s_mov_b64 exec, s[34:35] 9015; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 9016; SI-NEXT: s_setpc_b64 s[30:31] 9017; 9018; VI-LABEL: global_atomic_udec_wrap_i32_noret_scalar: 9019; VI: ; %bb.0: 9020; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9021; VI-NEXT: v_mov_b32_e32 v0, s4 9022; VI-NEXT: v_mov_b32_e32 v1, s5 9023; VI-NEXT: v_mov_b32_e32 v2, s6 9024; VI-NEXT: flat_atomic_dec v[0:1], v2 9025; VI-NEXT: s_waitcnt vmcnt(0) 9026; VI-NEXT: buffer_wbinvl1_vol 9027; VI-NEXT: s_setpc_b64 
s[30:31] 9028; 9029; GFX9-LABEL: global_atomic_udec_wrap_i32_noret_scalar: 9030; GFX9: ; %bb.0: 9031; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9032; GFX9-NEXT: v_mov_b32_e32 v0, 0 9033; GFX9-NEXT: v_mov_b32_e32 v1, s6 9034; GFX9-NEXT: global_atomic_dec v0, v1, s[4:5] 9035; GFX9-NEXT: s_waitcnt vmcnt(0) 9036; GFX9-NEXT: buffer_wbinvl1_vol 9037; GFX9-NEXT: s_setpc_b64 s[30:31] 9038 %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 %in seq_cst 9039 ret void 9040} 9041 9042define amdgpu_gfx void @global_atomic_udec_wrap_i32_noret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) { 9043; SI-LABEL: global_atomic_udec_wrap_i32_noret_offset_scalar: 9044; SI: ; %bb.0: 9045; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9046; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 9047; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill 9048; SI-NEXT: s_mov_b64 exec, s[34:35] 9049; SI-NEXT: s_waitcnt expcnt(0) 9050; SI-NEXT: v_writelane_b32 v1, s6, 0 9051; SI-NEXT: v_writelane_b32 v1, s7, 1 9052; SI-NEXT: s_mov_b32 s34, s6 9053; SI-NEXT: s_mov_b32 s7, 0xf000 9054; SI-NEXT: s_mov_b32 s6, -1 9055; SI-NEXT: v_mov_b32_e32 v0, s34 9056; SI-NEXT: s_waitcnt vmcnt(0) 9057; SI-NEXT: buffer_atomic_dec v0, off, s[4:7], 0 offset:16 9058; SI-NEXT: s_waitcnt vmcnt(0) 9059; SI-NEXT: buffer_wbinvl1 9060; SI-NEXT: v_readlane_b32 s7, v1, 1 9061; SI-NEXT: v_readlane_b32 s6, v1, 0 9062; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 9063; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload 9064; SI-NEXT: s_mov_b64 exec, s[34:35] 9065; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 9066; SI-NEXT: s_setpc_b64 s[30:31] 9067; 9068; VI-LABEL: global_atomic_udec_wrap_i32_noret_offset_scalar: 9069; VI: ; %bb.0: 9070; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9071; VI-NEXT: s_add_u32 s34, s4, 16 9072; VI-NEXT: s_addc_u32 s35, s5, 0 9073; VI-NEXT: v_mov_b32_e32 v0, s34 9074; VI-NEXT: v_mov_b32_e32 v1, s35 9075; VI-NEXT: v_mov_b32_e32 v2, s6 9076; VI-NEXT: 
flat_atomic_dec v[0:1], v2 9077; VI-NEXT: s_waitcnt vmcnt(0) 9078; VI-NEXT: buffer_wbinvl1_vol 9079; VI-NEXT: s_setpc_b64 s[30:31] 9080; 9081; GFX9-LABEL: global_atomic_udec_wrap_i32_noret_offset_scalar: 9082; GFX9: ; %bb.0: 9083; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9084; GFX9-NEXT: v_mov_b32_e32 v0, 0 9085; GFX9-NEXT: v_mov_b32_e32 v1, s6 9086; GFX9-NEXT: global_atomic_dec v0, v1, s[4:5] offset:16 9087; GFX9-NEXT: s_waitcnt vmcnt(0) 9088; GFX9-NEXT: buffer_wbinvl1_vol 9089; GFX9-NEXT: s_setpc_b64 s[30:31] 9090 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 9091 %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 %in seq_cst 9092 ret void 9093} 9094 9095define amdgpu_gfx i32 @global_atomic_udec_wrap_i32_ret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) { 9096; SI-LABEL: global_atomic_udec_wrap_i32_ret_scalar: 9097; SI: ; %bb.0: 9098; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9099; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 9100; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill 9101; SI-NEXT: s_mov_b64 exec, s[34:35] 9102; SI-NEXT: s_waitcnt expcnt(0) 9103; SI-NEXT: v_writelane_b32 v1, s6, 0 9104; SI-NEXT: v_writelane_b32 v1, s7, 1 9105; SI-NEXT: s_mov_b32 s34, s6 9106; SI-NEXT: s_mov_b32 s7, 0xf000 9107; SI-NEXT: s_mov_b32 s6, -1 9108; SI-NEXT: v_mov_b32_e32 v0, s34 9109; SI-NEXT: s_waitcnt vmcnt(0) 9110; SI-NEXT: buffer_atomic_dec v0, off, s[4:7], 0 glc 9111; SI-NEXT: s_waitcnt vmcnt(0) 9112; SI-NEXT: buffer_wbinvl1 9113; SI-NEXT: v_readlane_b32 s7, v1, 1 9114; SI-NEXT: v_readlane_b32 s6, v1, 0 9115; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 9116; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload 9117; SI-NEXT: s_mov_b64 exec, s[34:35] 9118; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 9119; SI-NEXT: s_setpc_b64 s[30:31] 9120; 9121; VI-LABEL: global_atomic_udec_wrap_i32_ret_scalar: 9122; VI: ; %bb.0: 9123; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9124; VI-NEXT: v_mov_b32_e32 v0, s4 
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_udec_wrap_i32_ret_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, s6
; GFX9-NEXT: global_atomic_dec v0, v0, v1, s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 %in seq_cst
  ret i32 %result
}

; Returning seq_cst udec_wrap on an addrspace(1) pointer advanced by four i32
; elements, using the amdgpu_gfx calling convention with both the pointer and
; the operand marked inreg. The previous value is returned to the caller.
; Expected code per run line:
;   - the SI run folds the displacement into buffer_atomic_dec as offset:16;
;     s6/s7 are overwritten to form the s[4:7] operand, so they are saved to and
;     restored from lanes of v1 (which is itself spilled and reloaded),
;   - the VI run adds 16 to the address with s_add_u32/s_addc_u32 and then uses
;     flat_atomic_dec,
;   - the GFX9 run uses global_atomic_dec with an s[4:5] base and offset:16.
; The check lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
; Review note - the getelementptr index here is typed i32 while the
; no_remote_memory tests further down use i64; presumably intentional, confirm.
define amdgpu_gfx i32 @global_atomic_udec_wrap_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
; SI-LABEL: global_atomic_udec_wrap_i32_ret_offset_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_writelane_b32 v1, s6, 0
; SI-NEXT: v_writelane_b32 v1, s7, 1
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_mov_b32_e32 v0, s34
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_atomic_dec v0, off, s[4:7], 0 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
; SI-NEXT: v_readlane_b32 s7, v1, 1
; SI-NEXT: v_readlane_b32 s6, v1, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_udec_wrap_i32_ret_offset_scalar:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_add_u32 s34, s4, 16
; VI-NEXT: s_addc_u32 s35, s5, 0
; VI-NEXT: v_mov_b32_e32 v0, s34
; VI-NEXT: v_mov_b32_e32 v1, s35
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_udec_wrap_i32_ret_offset_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, s6
; GFX9-NEXT: global_atomic_dec v0, v0, v1, s[4:5] offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
  %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 %in seq_cst
  ret i32 %result
}

; No-return seq_cst udec_wrap on an addrspace(1) pointer advanced by four i32
; elements, with the atomicrmw annotated !amdgpu.no.remote.memory. The result
; (%tmp0) is unused and the function returns void; the expected atomic
; instructions on all three run lines carry no glc modifier. The SI and GFX9
; runs encode the displacement as offset:16, while the VI run adds 16 to the
; address with v_add_u32/v_addc_u32 before flat_atomic_dec.
define void @global_atomic_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: global_atomic_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
; SI-NEXT: buffer_atomic_dec v2, v[0:1], s[4:7], 0 addr64 offset:16
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_atomic_dec v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_atomic_dec v[0:1], v2, off offset:16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
  %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
  ret void
}

; Returning counterpart of the no-return !amdgpu.no.remote.memory test: the
; same getelementptr (four i32 elements) and seq_cst udec_wrap, but the previous
; value is returned. The expected atomic instructions carry glc on all three
; run lines, and the SI run additionally copies the result from v2 into v0.
define i32 @global_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: global_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
; SI-NEXT: buffer_atomic_dec v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
; SI-NEXT: v_mov_b32_e32 v0, v2
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_atomic_dec v0, v[0:1], v2, off offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
  %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
  ret i32 %result
}

; Empty metadata node referenced as the operand of the !amdgpu.no.remote.memory
; annotations on the atomicrmw instructions above.
!0 = !{}
