1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s 3; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s 4; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s 5 6; --------------------------------------------------------------------- 7; atomicrmw xchg 8; --------------------------------------------------------------------- 9 10define void @global_atomic_xchg_i64_noret(ptr addrspace(1) %ptr, i64 %in) { 11; SI-LABEL: global_atomic_xchg_i64_noret: 12; SI: ; %bb.0: 13; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14; SI-NEXT: s_mov_b32 s6, 0 15; SI-NEXT: s_mov_b32 s7, 0xf000 16; SI-NEXT: s_mov_b32 s4, s6 17; SI-NEXT: s_mov_b32 s5, s6 18; SI-NEXT: buffer_atomic_swap_x2 v[2:3], v[0:1], s[4:7], 0 addr64 19; SI-NEXT: s_waitcnt vmcnt(0) 20; SI-NEXT: buffer_wbinvl1 21; SI-NEXT: s_waitcnt expcnt(0) 22; SI-NEXT: s_setpc_b64 s[30:31] 23; 24; VI-LABEL: global_atomic_xchg_i64_noret: 25; VI: ; %bb.0: 26; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27; VI-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] 28; VI-NEXT: s_waitcnt vmcnt(0) 29; VI-NEXT: buffer_wbinvl1_vol 30; VI-NEXT: s_setpc_b64 s[30:31] 31; 32; GFX9-LABEL: global_atomic_xchg_i64_noret: 33; GFX9: ; %bb.0: 34; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 35; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v[2:3], off 36; GFX9-NEXT: s_waitcnt vmcnt(0) 37; GFX9-NEXT: buffer_wbinvl1_vol 38; GFX9-NEXT: s_setpc_b64 s[30:31] 39 %tmp0 = atomicrmw xchg ptr addrspace(1) %ptr, i64 %in seq_cst 40 ret void 41} 42 43define void @global_atomic_xchg_i64_noret_offset(ptr addrspace(1) %out, i64 %in) { 44; SI-LABEL: global_atomic_xchg_i64_noret_offset: 45; SI: ; %bb.0: 46; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 47; SI-NEXT: s_mov_b32 s6, 0 48; SI-NEXT: s_mov_b32 s7, 0xf000 49; SI-NEXT: s_mov_b32 s4, s6 50; SI-NEXT: s_mov_b32 s5, s6 51; SI-NEXT: buffer_atomic_swap_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 52; SI-NEXT: s_waitcnt vmcnt(0) 53; SI-NEXT: buffer_wbinvl1 54; SI-NEXT: s_waitcnt expcnt(0) 55; SI-NEXT: s_setpc_b64 s[30:31] 56; 57; VI-LABEL: global_atomic_xchg_i64_noret_offset: 58; VI: ; %bb.0: 59; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 60; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 61; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 62; VI-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] 63; VI-NEXT: s_waitcnt vmcnt(0) 64; VI-NEXT: buffer_wbinvl1_vol 65; VI-NEXT: s_setpc_b64 s[30:31] 66; 67; GFX9-LABEL: global_atomic_xchg_i64_noret_offset: 68; GFX9: ; %bb.0: 69; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 70; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v[2:3], off offset:32 71; GFX9-NEXT: s_waitcnt vmcnt(0) 72; GFX9-NEXT: buffer_wbinvl1_vol 73; GFX9-NEXT: s_setpc_b64 s[30:31] 74 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 75 %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, i64 %in seq_cst 76 ret void 77} 78 79define i64 @global_atomic_xchg_i64_ret(ptr addrspace(1) %ptr, i64 %in) { 80; SI-LABEL: global_atomic_xchg_i64_ret: 81; SI: ; %bb.0: 82; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 83; SI-NEXT: s_mov_b32 s6, 0 84; SI-NEXT: s_mov_b32 s7, 0xf000 85; SI-NEXT: s_mov_b32 s4, s6 86; SI-NEXT: s_mov_b32 s5, s6 87; SI-NEXT: buffer_atomic_swap_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc 88; SI-NEXT: s_waitcnt vmcnt(0) 89; SI-NEXT: buffer_wbinvl1 90; SI-NEXT: v_mov_b32_e32 v0, v2 91; SI-NEXT: v_mov_b32_e32 v1, v3 92; SI-NEXT: s_waitcnt expcnt(0) 93; SI-NEXT: s_setpc_b64 s[30:31] 94; 95; VI-LABEL: global_atomic_xchg_i64_ret: 96; VI: ; %bb.0: 97; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 98; VI-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc 99; VI-NEXT: s_waitcnt vmcnt(0) 100; VI-NEXT: buffer_wbinvl1_vol 101; VI-NEXT: s_setpc_b64 s[30:31] 102; 103; GFX9-LABEL: global_atomic_xchg_i64_ret: 104; GFX9: ; %bb.0: 105; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 106; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off glc 107; GFX9-NEXT: s_waitcnt vmcnt(0) 108; GFX9-NEXT: buffer_wbinvl1_vol 109; GFX9-NEXT: s_setpc_b64 s[30:31] 110 %result = atomicrmw xchg ptr addrspace(1) %ptr, i64 %in seq_cst 111 ret i64 %result 112} 113 114define i64 @global_atomic_xchg_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { 115; SI-LABEL: global_atomic_xchg_i64_ret_offset: 116; SI: ; %bb.0: 117; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 118; SI-NEXT: s_mov_b32 s6, 0 119; SI-NEXT: s_mov_b32 s7, 0xf000 120; SI-NEXT: s_mov_b32 s4, s6 121; SI-NEXT: s_mov_b32 s5, s6 122; SI-NEXT: buffer_atomic_swap_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc 123; SI-NEXT: s_waitcnt vmcnt(0) 124; SI-NEXT: buffer_wbinvl1 125; SI-NEXT: v_mov_b32_e32 v0, v2 126; SI-NEXT: v_mov_b32_e32 v1, v3 127; SI-NEXT: s_waitcnt expcnt(0) 128; SI-NEXT: s_setpc_b64 s[30:31] 129; 130; VI-LABEL: global_atomic_xchg_i64_ret_offset: 131; VI: ; %bb.0: 132; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 133; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 134; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 135; VI-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc 136; VI-NEXT: s_waitcnt vmcnt(0) 137; VI-NEXT: buffer_wbinvl1_vol 138; VI-NEXT: s_setpc_b64 s[30:31] 139; 140; GFX9-LABEL: global_atomic_xchg_i64_ret_offset: 141; GFX9: ; %bb.0: 142; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 143; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc 144; GFX9-NEXT: s_waitcnt vmcnt(0) 145; GFX9-NEXT: buffer_wbinvl1_vol 146; GFX9-NEXT: s_setpc_b64 s[30:31] 147 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 148 %result = atomicrmw xchg ptr addrspace(1) %gep, i64 %in seq_cst 149 ret i64 %result 150} 151 152define amdgpu_gfx void @global_atomic_xchg_i64_noret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) { 153; SI-LABEL: global_atomic_xchg_i64_noret_scalar: 154; SI: ; %bb.0: 155; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 156; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 157; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill 158; SI-NEXT: s_mov_b64 exec, s[34:35] 159; SI-NEXT: s_waitcnt expcnt(0) 160; SI-NEXT: v_writelane_b32 v2, s6, 0 161; SI-NEXT: v_writelane_b32 v2, s7, 1 162; SI-NEXT: s_mov_b32 s34, s7 163; SI-NEXT: s_mov_b32 s35, s6 164; SI-NEXT: s_mov_b32 s7, 0xf000 165; SI-NEXT: s_mov_b32 s6, -1 166; SI-NEXT: v_mov_b32_e32 v0, s35 167; SI-NEXT: v_mov_b32_e32 v1, s34 168; SI-NEXT: s_waitcnt vmcnt(0) 169; SI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[4:7], 0 170; SI-NEXT: s_waitcnt vmcnt(0) 171; SI-NEXT: buffer_wbinvl1 172; SI-NEXT: v_readlane_b32 s7, v2, 1 173; SI-NEXT: v_readlane_b32 s6, v2, 0 174; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 175; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload 176; SI-NEXT: s_mov_b64 exec, s[34:35] 177; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 178; SI-NEXT: s_setpc_b64 s[30:31] 179; 180; VI-LABEL: global_atomic_xchg_i64_noret_scalar: 181; VI: ; %bb.0: 182; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 183; VI-NEXT: v_mov_b32_e32 v0, s6 184; VI-NEXT: v_mov_b32_e32 v1, s7 185; VI-NEXT: v_mov_b32_e32 v2, s4 186; VI-NEXT: v_mov_b32_e32 v3, s5 187; VI-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] 188; VI-NEXT: s_waitcnt vmcnt(0) 189; VI-NEXT: buffer_wbinvl1_vol 190; VI-NEXT: s_setpc_b64 s[30:31] 191; 192; GFX9-LABEL: global_atomic_xchg_i64_noret_scalar: 193; GFX9: ; %bb.0: 194; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 195; GFX9-NEXT: v_mov_b32_e32 v0, s6 196; GFX9-NEXT: v_mov_b32_e32 v1, s7 197; GFX9-NEXT: v_mov_b32_e32 v2, 0 198; GFX9-NEXT: global_atomic_swap_x2 v2, v[0:1], s[4:5] 199; GFX9-NEXT: s_waitcnt vmcnt(0) 200; GFX9-NEXT: buffer_wbinvl1_vol 201; GFX9-NEXT: s_setpc_b64 s[30:31] 202 %tmp0 = atomicrmw xchg ptr addrspace(1) %ptr, i64 %in seq_cst 203 ret void 204} 205 206define amdgpu_gfx void @global_atomic_xchg_i64_noret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) { 207; SI-LABEL: global_atomic_xchg_i64_noret_offset_scalar: 208; SI: ; %bb.0: 209; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 210; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 211; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill 212; SI-NEXT: s_mov_b64 exec, s[34:35] 213; SI-NEXT: s_waitcnt expcnt(0) 214; SI-NEXT: v_writelane_b32 v2, s6, 0 215; SI-NEXT: v_writelane_b32 v2, s7, 1 216; SI-NEXT: v_mov_b32_e32 v0, s6 217; SI-NEXT: v_mov_b32_e32 v1, s7 218; SI-NEXT: s_mov_b32 s7, 0xf000 219; SI-NEXT: s_mov_b32 s6, -1 220; SI-NEXT: s_waitcnt vmcnt(0) 221; SI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[4:7], 0 offset:32 222; SI-NEXT: s_waitcnt vmcnt(0) 223; SI-NEXT: buffer_wbinvl1 224; SI-NEXT: v_readlane_b32 s7, v2, 1 225; SI-NEXT: v_readlane_b32 s6, v2, 0 226; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 227; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload 228; SI-NEXT: s_mov_b64 exec, s[34:35] 229; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 230; SI-NEXT: s_setpc_b64 s[30:31] 231; 232; VI-LABEL: global_atomic_xchg_i64_noret_offset_scalar: 233; VI: ; %bb.0: 234; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 235; VI-NEXT: s_add_u32 s34, s4, 32 236; VI-NEXT: s_addc_u32 s35, s5, 0 237; VI-NEXT: v_mov_b32_e32 v2, s34 238; VI-NEXT: v_mov_b32_e32 v0, s6 239; VI-NEXT: v_mov_b32_e32 v1, s7 240; VI-NEXT: v_mov_b32_e32 v3, s35 241; VI-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] 242; VI-NEXT: s_waitcnt vmcnt(0) 243; VI-NEXT: buffer_wbinvl1_vol 244; VI-NEXT: s_setpc_b64 s[30:31] 245; 246; GFX9-LABEL: global_atomic_xchg_i64_noret_offset_scalar: 247; GFX9: ; %bb.0: 248; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 249; GFX9-NEXT: v_mov_b32_e32 v0, s6 250; GFX9-NEXT: v_mov_b32_e32 v1, s7 251; GFX9-NEXT: v_mov_b32_e32 v2, 0 252; GFX9-NEXT: global_atomic_swap_x2 v2, v[0:1], s[4:5] offset:32 253; GFX9-NEXT: s_waitcnt vmcnt(0) 254; GFX9-NEXT: buffer_wbinvl1_vol 255; GFX9-NEXT: s_setpc_b64 s[30:31] 256 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 257 %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, i64 %in seq_cst 258 ret void 259} 260 261define amdgpu_gfx i64 @global_atomic_xchg_i64_ret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) { 262; SI-LABEL: global_atomic_xchg_i64_ret_scalar: 263; SI: ; %bb.0: 264; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 265; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 266; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill 267; SI-NEXT: s_mov_b64 exec, s[34:35] 268; SI-NEXT: s_waitcnt expcnt(0) 269; SI-NEXT: v_writelane_b32 v2, s6, 0 270; SI-NEXT: v_writelane_b32 v2, s7, 1 271; SI-NEXT: s_mov_b32 s34, s7 272; SI-NEXT: s_mov_b32 s35, s6 273; SI-NEXT: s_mov_b32 s7, 0xf000 274; SI-NEXT: s_mov_b32 s6, -1 275; SI-NEXT: v_mov_b32_e32 v0, s35 276; SI-NEXT: v_mov_b32_e32 v1, s34 277; SI-NEXT: s_waitcnt vmcnt(0) 278; SI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[4:7], 0 glc 279; SI-NEXT: s_waitcnt vmcnt(0) 280; SI-NEXT: buffer_wbinvl1 281; SI-NEXT: v_readlane_b32 s7, v2, 1 282; SI-NEXT: v_readlane_b32 s6, v2, 0 283; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 284; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload 285; SI-NEXT: s_mov_b64 exec, s[34:35] 286; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 287; SI-NEXT: s_setpc_b64 s[30:31] 288; 289; VI-LABEL: global_atomic_xchg_i64_ret_scalar: 290; VI: ; %bb.0: 291; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 292; VI-NEXT: v_mov_b32_e32 v0, s6 293; VI-NEXT: v_mov_b32_e32 v1, s7 294; VI-NEXT: v_mov_b32_e32 v2, s4 295; VI-NEXT: v_mov_b32_e32 v3, s5 296; VI-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc 297; VI-NEXT: s_waitcnt vmcnt(0) 298; VI-NEXT: buffer_wbinvl1_vol 299; VI-NEXT: s_setpc_b64 s[30:31] 300; 301; GFX9-LABEL: global_atomic_xchg_i64_ret_scalar: 302; GFX9: ; %bb.0: 303; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 304; GFX9-NEXT: v_mov_b32_e32 v0, s6 305; GFX9-NEXT: v_mov_b32_e32 v1, s7 306; GFX9-NEXT: v_mov_b32_e32 v2, 0 307; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v2, v[0:1], s[4:5] glc 308; GFX9-NEXT: s_waitcnt vmcnt(0) 309; GFX9-NEXT: buffer_wbinvl1_vol 310; GFX9-NEXT: s_setpc_b64 s[30:31] 311 %result = atomicrmw xchg ptr addrspace(1) %ptr, i64 %in seq_cst 312 ret i64 %result 313} 314 315define amdgpu_gfx i64 @global_atomic_xchg_i64_ret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) { 316; SI-LABEL: global_atomic_xchg_i64_ret_offset_scalar: 317; SI: ; %bb.0: 318; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 319; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 320; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill 321; SI-NEXT: s_mov_b64 exec, s[34:35] 322; SI-NEXT: s_waitcnt expcnt(0) 323; SI-NEXT: v_writelane_b32 v2, s6, 0 324; SI-NEXT: v_writelane_b32 v2, s7, 1 325; SI-NEXT: v_mov_b32_e32 v0, s6 326; SI-NEXT: v_mov_b32_e32 v1, s7 327; SI-NEXT: s_mov_b32 s7, 0xf000 328; SI-NEXT: s_mov_b32 s6, -1 329; SI-NEXT: s_waitcnt vmcnt(0) 330; SI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[4:7], 0 offset:32 glc 331; SI-NEXT: s_waitcnt vmcnt(0) 332; SI-NEXT: buffer_wbinvl1 333; SI-NEXT: v_readlane_b32 s7, v2, 1 334; SI-NEXT: v_readlane_b32 s6, v2, 0 335; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 336; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload 337; SI-NEXT: s_mov_b64 exec, s[34:35] 338; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 339; SI-NEXT: s_setpc_b64 s[30:31] 340; 341; VI-LABEL: global_atomic_xchg_i64_ret_offset_scalar: 342; VI: ; %bb.0: 343; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 344; VI-NEXT: s_add_u32 s34, s4, 32 345; VI-NEXT: s_addc_u32 s35, s5, 0 346; VI-NEXT: v_mov_b32_e32 v2, s34 347; VI-NEXT: v_mov_b32_e32 v0, s6 348; VI-NEXT: v_mov_b32_e32 v1, s7 349; VI-NEXT: v_mov_b32_e32 v3, s35 350; VI-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc 351; VI-NEXT: s_waitcnt vmcnt(0) 352; VI-NEXT: buffer_wbinvl1_vol 353; VI-NEXT: s_setpc_b64 s[30:31] 354; 355; GFX9-LABEL: global_atomic_xchg_i64_ret_offset_scalar: 356; GFX9: ; %bb.0: 357; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 358; GFX9-NEXT: v_mov_b32_e32 v0, s6 359; GFX9-NEXT: v_mov_b32_e32 v1, s7 360; GFX9-NEXT: v_mov_b32_e32 v2, 0 361; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc 362; GFX9-NEXT: s_waitcnt vmcnt(0) 363; GFX9-NEXT: buffer_wbinvl1_vol 364; GFX9-NEXT: s_setpc_b64 s[30:31] 365 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 366 %result = atomicrmw xchg ptr addrspace(1) %gep, i64 %in seq_cst 367 ret i64 %result 368} 369 370define void @global_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) { 371; SI-LABEL: global_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory: 372; SI: ; %bb.0: 373; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 374; SI-NEXT: s_mov_b32 s6, 0 375; SI-NEXT: s_mov_b32 s7, 0xf000 376; SI-NEXT: s_mov_b32 s4, s6 377; SI-NEXT: s_mov_b32 s5, s6 378; SI-NEXT: buffer_atomic_swap_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 379; SI-NEXT: s_waitcnt vmcnt(0) 380; SI-NEXT: buffer_wbinvl1 381; SI-NEXT: s_waitcnt expcnt(0) 382; SI-NEXT: s_setpc_b64 s[30:31] 383; 384; VI-LABEL: global_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory: 385; VI: ; %bb.0: 386; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 387; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 388; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 389; VI-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] 390; VI-NEXT: s_waitcnt vmcnt(0) 391; VI-NEXT: buffer_wbinvl1_vol 392; VI-NEXT: s_setpc_b64 s[30:31] 393; 394; GFX9-LABEL: global_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory: 395; GFX9: ; %bb.0: 396; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 397; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v[2:3], off offset:32 398; GFX9-NEXT: s_waitcnt vmcnt(0) 399; GFX9-NEXT: buffer_wbinvl1_vol 400; GFX9-NEXT: s_setpc_b64 s[30:31] 401 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 402 %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 403 ret void 404} 405 406define i64 @global_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) { 407; SI-LABEL: global_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory: 408; SI: ; %bb.0: 409; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 410; SI-NEXT: s_mov_b32 s6, 0 411; SI-NEXT: s_mov_b32 s7, 0xf000 412; SI-NEXT: s_mov_b32 s4, s6 413; SI-NEXT: s_mov_b32 s5, s6 414; SI-NEXT: buffer_atomic_swap_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc 415; SI-NEXT: s_waitcnt vmcnt(0) 416; SI-NEXT: buffer_wbinvl1 417; SI-NEXT: v_mov_b32_e32 v0, v2 418; SI-NEXT: v_mov_b32_e32 v1, v3 419; SI-NEXT: s_waitcnt expcnt(0) 420; SI-NEXT: s_setpc_b64 s[30:31] 421; 422; VI-LABEL: global_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory: 423; VI: ; %bb.0: 424; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 425; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 426; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 427; VI-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc 428; VI-NEXT: s_waitcnt vmcnt(0) 429; VI-NEXT: buffer_wbinvl1_vol 430; VI-NEXT: s_setpc_b64 s[30:31] 431; 432; GFX9-LABEL: global_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory: 433; GFX9: ; %bb.0: 434; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 435; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc 436; GFX9-NEXT: s_waitcnt vmcnt(0) 437; GFX9-NEXT: buffer_wbinvl1_vol 438; GFX9-NEXT: s_setpc_b64 s[30:31] 439 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 440 %result = atomicrmw xchg ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 441 ret i64 %result 442} 443 444; --------------------------------------------------------------------- 445; atomicrmw xchg f64 446; --------------------------------------------------------------------- 447 448define void @global_atomic_xchg_f64_noret(ptr addrspace(1) %ptr, double %in) { 449; SI-LABEL: global_atomic_xchg_f64_noret: 450; SI: ; %bb.0: 451; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 452; SI-NEXT: s_mov_b32 s6, 0 453; SI-NEXT: s_mov_b32 s7, 0xf000 454; SI-NEXT: s_mov_b32 s4, s6 455; SI-NEXT: s_mov_b32 s5, s6 456; SI-NEXT: buffer_atomic_swap_x2 v[2:3], v[0:1], s[4:7], 0 addr64 457; SI-NEXT: s_waitcnt vmcnt(0) 458; SI-NEXT: buffer_wbinvl1 459; SI-NEXT: s_waitcnt expcnt(0) 460; SI-NEXT: s_setpc_b64 s[30:31] 461; 462; VI-LABEL: global_atomic_xchg_f64_noret: 463; VI: ; %bb.0: 464; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 465; VI-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] 466; VI-NEXT: s_waitcnt vmcnt(0) 467; VI-NEXT: buffer_wbinvl1_vol 468; VI-NEXT: s_setpc_b64 s[30:31] 469; 470; GFX9-LABEL: global_atomic_xchg_f64_noret: 471; GFX9: ; %bb.0: 472; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 473; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v[2:3], off 474; GFX9-NEXT: s_waitcnt vmcnt(0) 475; GFX9-NEXT: buffer_wbinvl1_vol 476; GFX9-NEXT: s_setpc_b64 s[30:31] 477 %tmp0 = atomicrmw xchg ptr addrspace(1) %ptr, double %in seq_cst 478 ret void 479} 480 481define void @global_atomic_xchg_f64_noret_offset(ptr addrspace(1) %out, double %in) { 482; SI-LABEL: global_atomic_xchg_f64_noret_offset: 483; SI: ; %bb.0: 484; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 485; SI-NEXT: s_mov_b32 s6, 0 486; SI-NEXT: s_mov_b32 s7, 0xf000 487; SI-NEXT: s_mov_b32 s4, s6 488; SI-NEXT: s_mov_b32 s5, s6 489; SI-NEXT: buffer_atomic_swap_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 490; SI-NEXT: s_waitcnt vmcnt(0) 491; SI-NEXT: buffer_wbinvl1 492; SI-NEXT: s_waitcnt expcnt(0) 493; SI-NEXT: s_setpc_b64 s[30:31] 494; 495; VI-LABEL: global_atomic_xchg_f64_noret_offset: 496; VI: ; %bb.0: 497; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 498; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 499; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 500; VI-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] 501; VI-NEXT: s_waitcnt vmcnt(0) 502; VI-NEXT: buffer_wbinvl1_vol 503; VI-NEXT: s_setpc_b64 s[30:31] 504; 505; GFX9-LABEL: global_atomic_xchg_f64_noret_offset: 506; GFX9: ; %bb.0: 507; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 508; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v[2:3], off offset:32 509; GFX9-NEXT: s_waitcnt vmcnt(0) 510; GFX9-NEXT: buffer_wbinvl1_vol 511; GFX9-NEXT: s_setpc_b64 s[30:31] 512 %gep = getelementptr double, ptr addrspace(1) %out, i32 4 513 %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, double %in seq_cst 514 ret void 515} 516 517define double @global_atomic_xchg_f64_ret(ptr addrspace(1) %ptr, double %in) { 518; SI-LABEL: global_atomic_xchg_f64_ret: 519; SI: ; %bb.0: 520; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 521; SI-NEXT: s_mov_b32 s6, 0 522; SI-NEXT: s_mov_b32 s7, 0xf000 523; SI-NEXT: s_mov_b32 s4, s6 524; SI-NEXT: s_mov_b32 s5, s6 525; SI-NEXT: buffer_atomic_swap_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc 526; SI-NEXT: s_waitcnt vmcnt(0) 527; SI-NEXT: buffer_wbinvl1 528; SI-NEXT: v_mov_b32_e32 v0, v2 529; SI-NEXT: v_mov_b32_e32 v1, v3 530; SI-NEXT: s_waitcnt expcnt(0) 531; SI-NEXT: s_setpc_b64 s[30:31] 532; 533; VI-LABEL: global_atomic_xchg_f64_ret: 534; VI: ; %bb.0: 535; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 536; VI-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc 537; VI-NEXT: s_waitcnt vmcnt(0) 538; VI-NEXT: buffer_wbinvl1_vol 539; VI-NEXT: s_setpc_b64 s[30:31] 540; 541; GFX9-LABEL: global_atomic_xchg_f64_ret: 542; GFX9: ; %bb.0: 543; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 544; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off glc 545; GFX9-NEXT: s_waitcnt vmcnt(0) 546; GFX9-NEXT: buffer_wbinvl1_vol 547; GFX9-NEXT: s_setpc_b64 s[30:31] 548 %result = atomicrmw xchg ptr addrspace(1) %ptr, double %in seq_cst 549 ret double %result 550} 551 552define double @global_atomic_xchg_f64_ret_offset(ptr addrspace(1) %out, double %in) { 553; SI-LABEL: global_atomic_xchg_f64_ret_offset: 554; SI: ; %bb.0: 555; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 556; SI-NEXT: s_mov_b32 s6, 0 557; SI-NEXT: s_mov_b32 s7, 0xf000 558; SI-NEXT: s_mov_b32 s4, s6 559; SI-NEXT: s_mov_b32 s5, s6 560; SI-NEXT: buffer_atomic_swap_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc 561; SI-NEXT: s_waitcnt vmcnt(0) 562; SI-NEXT: buffer_wbinvl1 563; SI-NEXT: v_mov_b32_e32 v0, v2 564; SI-NEXT: v_mov_b32_e32 v1, v3 565; SI-NEXT: s_waitcnt expcnt(0) 566; SI-NEXT: s_setpc_b64 s[30:31] 567; 568; VI-LABEL: global_atomic_xchg_f64_ret_offset: 569; VI: ; %bb.0: 570; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 571; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 572; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 573; VI-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc 574; VI-NEXT: s_waitcnt vmcnt(0) 575; VI-NEXT: buffer_wbinvl1_vol 576; VI-NEXT: s_setpc_b64 s[30:31] 577; 578; GFX9-LABEL: global_atomic_xchg_f64_ret_offset: 579; GFX9: ; %bb.0: 580; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 581; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc 582; GFX9-NEXT: s_waitcnt vmcnt(0) 583; GFX9-NEXT: buffer_wbinvl1_vol 584; GFX9-NEXT: s_setpc_b64 s[30:31] 585 %gep = getelementptr double, ptr addrspace(1) %out, i32 4 586 %result = atomicrmw xchg ptr addrspace(1) %gep, double %in seq_cst 587 ret double %result 588} 589 590define amdgpu_gfx void @global_atomic_xchg_f64_noret_scalar(ptr addrspace(1) inreg %ptr, double inreg %in) { 591; SI-LABEL: global_atomic_xchg_f64_noret_scalar: 592; SI: ; %bb.0: 593; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 594; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 595; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill 596; SI-NEXT: s_mov_b64 exec, s[34:35] 597; SI-NEXT: s_waitcnt expcnt(0) 598; SI-NEXT: v_writelane_b32 v2, s6, 0 599; SI-NEXT: v_writelane_b32 v2, s7, 1 600; SI-NEXT: s_mov_b32 s34, s7 601; SI-NEXT: s_mov_b32 s35, s6 602; SI-NEXT: s_mov_b32 s7, 0xf000 603; SI-NEXT: s_mov_b32 s6, -1 604; SI-NEXT: v_mov_b32_e32 v0, s35 605; SI-NEXT: v_mov_b32_e32 v1, s34 606; SI-NEXT: s_waitcnt vmcnt(0) 607; SI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[4:7], 0 608; SI-NEXT: s_waitcnt vmcnt(0) 609; SI-NEXT: buffer_wbinvl1 610; SI-NEXT: v_readlane_b32 s7, v2, 1 611; SI-NEXT: v_readlane_b32 s6, v2, 0 612; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 613; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload 614; SI-NEXT: s_mov_b64 exec, s[34:35] 615; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 616; SI-NEXT: s_setpc_b64 s[30:31] 617; 618; VI-LABEL: global_atomic_xchg_f64_noret_scalar: 619; VI: ; %bb.0: 620; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 621; VI-NEXT: v_mov_b32_e32 v0, s6 622; VI-NEXT: v_mov_b32_e32 v1, s7 623; VI-NEXT: v_mov_b32_e32 v2, s4 624; VI-NEXT: v_mov_b32_e32 v3, s5 625; VI-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] 626; VI-NEXT: s_waitcnt vmcnt(0) 627; VI-NEXT: buffer_wbinvl1_vol 628; VI-NEXT: s_setpc_b64 s[30:31] 629; 630; GFX9-LABEL: global_atomic_xchg_f64_noret_scalar: 631; GFX9: ; %bb.0: 632; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 633; GFX9-NEXT: v_mov_b32_e32 v0, s6 634; GFX9-NEXT: v_mov_b32_e32 v1, s7 635; GFX9-NEXT: v_mov_b32_e32 v2, 0 636; GFX9-NEXT: global_atomic_swap_x2 v2, v[0:1], s[4:5] 637; GFX9-NEXT: s_waitcnt vmcnt(0) 638; GFX9-NEXT: buffer_wbinvl1_vol 639; GFX9-NEXT: s_setpc_b64 s[30:31] 640 %tmp0 = atomicrmw xchg ptr addrspace(1) %ptr, double %in seq_cst 641 ret void 642} 643 644define amdgpu_gfx void @global_atomic_xchg_f64_noret_offset_scalar(ptr addrspace(1) inreg %out, double inreg %in) { 645; SI-LABEL: global_atomic_xchg_f64_noret_offset_scalar: 646; SI: ; %bb.0: 647; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 648; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 649; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill 650; SI-NEXT: s_mov_b64 exec, s[34:35] 651; SI-NEXT: s_waitcnt expcnt(0) 652; SI-NEXT: v_writelane_b32 v2, s6, 0 653; SI-NEXT: v_writelane_b32 v2, s7, 1 654; SI-NEXT: v_mov_b32_e32 v0, s6 655; SI-NEXT: v_mov_b32_e32 v1, s7 656; SI-NEXT: s_mov_b32 s7, 0xf000 657; SI-NEXT: s_mov_b32 s6, -1 658; SI-NEXT: s_waitcnt vmcnt(0) 659; SI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[4:7], 0 offset:32 660; SI-NEXT: s_waitcnt vmcnt(0) 661; SI-NEXT: buffer_wbinvl1 662; SI-NEXT: v_readlane_b32 s7, v2, 1 663; SI-NEXT: v_readlane_b32 s6, v2, 0 664; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 665; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload 666; SI-NEXT: s_mov_b64 exec, s[34:35] 667; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 668; SI-NEXT: s_setpc_b64 s[30:31] 669; 670; VI-LABEL: global_atomic_xchg_f64_noret_offset_scalar: 671; VI: ; %bb.0: 672; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 673; VI-NEXT: s_add_u32 s34, s4, 32 674; VI-NEXT: s_addc_u32 s35, s5, 0 675; VI-NEXT: v_mov_b32_e32 v2, s34 676; VI-NEXT: v_mov_b32_e32 v0, s6 677; VI-NEXT: v_mov_b32_e32 v1, s7 678; VI-NEXT: v_mov_b32_e32 v3, s35 679; VI-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] 680; VI-NEXT: s_waitcnt vmcnt(0) 681; VI-NEXT: buffer_wbinvl1_vol 682; VI-NEXT: s_setpc_b64 s[30:31] 683; 684; GFX9-LABEL: global_atomic_xchg_f64_noret_offset_scalar: 685; GFX9: ; %bb.0: 686; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 687; GFX9-NEXT: v_mov_b32_e32 v0, s6 688; GFX9-NEXT: v_mov_b32_e32 v1, s7 689; GFX9-NEXT: v_mov_b32_e32 v2, 0 690; GFX9-NEXT: global_atomic_swap_x2 v2, v[0:1], s[4:5] offset:32 691; GFX9-NEXT: s_waitcnt vmcnt(0) 692; GFX9-NEXT: buffer_wbinvl1_vol 693; GFX9-NEXT: s_setpc_b64 s[30:31] 694 %gep = getelementptr double, ptr addrspace(1) %out, i32 4 695 %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, double %in seq_cst 696 ret void 697} 698 699define amdgpu_gfx double @global_atomic_xchg_f64_ret_scalar(ptr addrspace(1) inreg %ptr, double inreg %in) { 700; SI-LABEL: global_atomic_xchg_f64_ret_scalar: 701; SI: ; %bb.0: 702; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 703; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 704; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill 705; SI-NEXT: s_mov_b64 exec, s[34:35] 706; SI-NEXT: s_waitcnt expcnt(0) 707; SI-NEXT: v_writelane_b32 v2, s6, 0 708; SI-NEXT: v_writelane_b32 v2, s7, 1 709; SI-NEXT: s_mov_b32 s34, s7 710; SI-NEXT: s_mov_b32 s35, s6 711; SI-NEXT: s_mov_b32 s7, 0xf000 712; SI-NEXT: s_mov_b32 s6, -1 713; SI-NEXT: v_mov_b32_e32 v0, s35 714; SI-NEXT: v_mov_b32_e32 v1, s34 715; SI-NEXT: s_waitcnt vmcnt(0) 716; SI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[4:7], 0 glc 717; SI-NEXT: s_waitcnt vmcnt(0) 718; SI-NEXT: buffer_wbinvl1 719; SI-NEXT: v_readlane_b32 s7, v2, 1 720; SI-NEXT: v_readlane_b32 s6, v2, 0 721; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 722; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload 723; SI-NEXT: s_mov_b64 exec, s[34:35] 724; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 725; SI-NEXT: s_setpc_b64 s[30:31] 726; 727; VI-LABEL: global_atomic_xchg_f64_ret_scalar: 728; VI: ; %bb.0: 729; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 730; VI-NEXT: v_mov_b32_e32 v0, s6 731; VI-NEXT: v_mov_b32_e32 v1, s7 732; VI-NEXT: v_mov_b32_e32 v2, s4 733; VI-NEXT: v_mov_b32_e32 v3, s5 734; VI-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc 735; VI-NEXT: s_waitcnt vmcnt(0) 736; VI-NEXT: buffer_wbinvl1_vol 737; VI-NEXT: s_setpc_b64 s[30:31] 738; 739; GFX9-LABEL: global_atomic_xchg_f64_ret_scalar: 740; GFX9: ; %bb.0: 741; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 742; GFX9-NEXT: v_mov_b32_e32 v0, s6 743; GFX9-NEXT: v_mov_b32_e32 v1, s7 744; GFX9-NEXT: v_mov_b32_e32 v2, 0 745; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v2, v[0:1], s[4:5] glc 746; GFX9-NEXT: s_waitcnt vmcnt(0) 747; GFX9-NEXT: buffer_wbinvl1_vol 748; GFX9-NEXT: s_setpc_b64 s[30:31] 749 %result = atomicrmw xchg ptr addrspace(1) %ptr, double %in seq_cst 750 ret double %result 751} 752 753define amdgpu_gfx double @global_atomic_xchg_f64_ret_offset_scalar(ptr addrspace(1) inreg %out, double inreg %in) { 754; SI-LABEL: global_atomic_xchg_f64_ret_offset_scalar: 755; SI: ; %bb.0: 756; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 757; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 758; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill 759; SI-NEXT: s_mov_b64 exec, s[34:35] 760; SI-NEXT: s_waitcnt expcnt(0) 761; SI-NEXT: v_writelane_b32 v2, s6, 0 762; SI-NEXT: v_writelane_b32 v2, s7, 1 763; SI-NEXT: v_mov_b32_e32 v0, s6 764; SI-NEXT: v_mov_b32_e32 v1, s7 765; SI-NEXT: s_mov_b32 s7, 0xf000 766; SI-NEXT: s_mov_b32 s6, -1 767; SI-NEXT: s_waitcnt vmcnt(0) 768; SI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[4:7], 0 offset:32 glc 769; SI-NEXT: s_waitcnt vmcnt(0) 770; SI-NEXT: buffer_wbinvl1 771; SI-NEXT: v_readlane_b32 s7, v2, 1 772; SI-NEXT: v_readlane_b32 s6, v2, 0 773; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 774; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload 775; SI-NEXT: s_mov_b64 exec, s[34:35] 776; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 777; SI-NEXT: s_setpc_b64 s[30:31] 778; 779; VI-LABEL: global_atomic_xchg_f64_ret_offset_scalar: 780; VI: ; %bb.0: 781; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 782; VI-NEXT: s_add_u32 s34, s4, 32 783; VI-NEXT: s_addc_u32 s35, s5, 0 784; VI-NEXT: v_mov_b32_e32 v2, s34 785; VI-NEXT: v_mov_b32_e32 v0, s6 786; VI-NEXT: v_mov_b32_e32 v1, s7 787; VI-NEXT: v_mov_b32_e32 v3, s35 788; VI-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc 789; VI-NEXT: s_waitcnt vmcnt(0) 790; VI-NEXT: buffer_wbinvl1_vol 791; VI-NEXT: s_setpc_b64 s[30:31] 792; 793; GFX9-LABEL: global_atomic_xchg_f64_ret_offset_scalar: 794; GFX9: ; %bb.0: 795; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 796; GFX9-NEXT: v_mov_b32_e32 v0, s6 797; GFX9-NEXT: v_mov_b32_e32 v1, s7 798; GFX9-NEXT: v_mov_b32_e32 v2, 0 799; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc 800; GFX9-NEXT: s_waitcnt vmcnt(0) 801; GFX9-NEXT: buffer_wbinvl1_vol 802; GFX9-NEXT: s_setpc_b64 s[30:31] 803 %gep = getelementptr double, ptr addrspace(1) %out, i32 4 804 %result = atomicrmw xchg ptr addrspace(1) %gep, double %in seq_cst 805 ret double %result 806} 807 808define void @global_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, double %in) { 809; SI-LABEL: global_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory: 810; SI: ; %bb.0: 811; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 812; SI-NEXT: s_mov_b32 s6, 0 813; SI-NEXT: s_mov_b32 s7, 0xf000 814; SI-NEXT: s_mov_b32 s4, s6 815; SI-NEXT: s_mov_b32 s5, s6 816; SI-NEXT: buffer_atomic_swap_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:16 817; SI-NEXT: s_waitcnt vmcnt(0) 818; SI-NEXT: buffer_wbinvl1 819; SI-NEXT: s_waitcnt expcnt(0) 820; SI-NEXT: s_setpc_b64 s[30:31] 821; 822; VI-LABEL: global_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory: 823; VI: ; %bb.0: 824; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 825; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 826; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 827; VI-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] 828; VI-NEXT: s_waitcnt vmcnt(0) 829; VI-NEXT: buffer_wbinvl1_vol 830; VI-NEXT: s_setpc_b64 s[30:31] 831; 832; GFX9-LABEL: global_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory: 833; GFX9: ; %bb.0: 834; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 835; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v[2:3], off offset:16 836; GFX9-NEXT: s_waitcnt vmcnt(0) 837; GFX9-NEXT: buffer_wbinvl1_vol 838; GFX9-NEXT: s_setpc_b64 s[30:31] 839 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 840 %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, double %in seq_cst, !amdgpu.no.remote.memory !0 841 ret void 842} 843 844define double @global_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, double %in) { 845; SI-LABEL: global_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory: 846; SI: ; %bb.0: 847; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 848; SI-NEXT: s_mov_b32 s6, 0 849; SI-NEXT: s_mov_b32 s7, 0xf000 850; SI-NEXT: s_mov_b32 s4, s6 851; SI-NEXT: s_mov_b32 s5, s6 852; SI-NEXT: buffer_atomic_swap_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:16 glc 853; SI-NEXT: s_waitcnt vmcnt(0) 854; SI-NEXT: buffer_wbinvl1 855; SI-NEXT: v_mov_b32_e32 v0, v2 856; SI-NEXT: v_mov_b32_e32 v1, v3 857; SI-NEXT: s_waitcnt expcnt(0) 858; SI-NEXT: s_setpc_b64 s[30:31] 859; 860; VI-LABEL: global_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory: 861; VI: ; %bb.0: 862; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 863; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 864; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 865; VI-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc 866; VI-NEXT: s_waitcnt vmcnt(0) 867; VI-NEXT: buffer_wbinvl1_vol 868; VI-NEXT: s_setpc_b64 s[30:31] 869; 870; GFX9-LABEL: global_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory: 871; GFX9: ; %bb.0: 872; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 873; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off offset:16 glc 874; GFX9-NEXT: s_waitcnt vmcnt(0) 875; GFX9-NEXT: buffer_wbinvl1_vol 876; GFX9-NEXT: s_setpc_b64 s[30:31] 877 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 878 %result = atomicrmw xchg ptr addrspace(1) %gep, double %in seq_cst, !amdgpu.no.remote.memory !0 879 ret double %result 880} 881 882; --------------------------------------------------------------------- 883; atomicrmw add 884; --------------------------------------------------------------------- 885 886define void @global_atomic_add_i64_noret(ptr addrspace(1) %ptr, i64 %in) { 887; SI-LABEL: global_atomic_add_i64_noret: 888; SI: ; %bb.0: 889; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 890; SI-NEXT: s_mov_b32 s6, 0 891; SI-NEXT: s_mov_b32 s7, 0xf000 892; SI-NEXT: s_mov_b32 s4, s6 893; SI-NEXT: s_mov_b32 s5, s6 894; SI-NEXT: buffer_atomic_add_x2 v[2:3], v[0:1], s[4:7], 0 addr64 895; SI-NEXT: s_waitcnt vmcnt(0) 896; SI-NEXT: buffer_wbinvl1 897; SI-NEXT: s_waitcnt expcnt(0) 898; SI-NEXT: s_setpc_b64 s[30:31] 899; 900; VI-LABEL: global_atomic_add_i64_noret: 901; VI: ; %bb.0: 902; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 903; VI-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] 904; VI-NEXT: s_waitcnt vmcnt(0) 905; VI-NEXT: buffer_wbinvl1_vol 906; VI-NEXT: s_setpc_b64 s[30:31] 907; 908; GFX9-LABEL: global_atomic_add_i64_noret: 909; GFX9: ; %bb.0: 910; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 911; GFX9-NEXT: global_atomic_add_x2 v[0:1], v[2:3], off 912; GFX9-NEXT: s_waitcnt vmcnt(0) 913; GFX9-NEXT: buffer_wbinvl1_vol 914; GFX9-NEXT: s_setpc_b64 s[30:31] 915 %tmp0 = atomicrmw add ptr addrspace(1) %ptr, i64 %in seq_cst 916 ret void 917} 918 919define void @global_atomic_add_i64_noret_offset(ptr addrspace(1) %out, i64 %in) { 920; SI-LABEL: global_atomic_add_i64_noret_offset: 921; SI: ; %bb.0: 922; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 923; SI-NEXT: s_mov_b32 s6, 0 924; SI-NEXT: s_mov_b32 s7, 0xf000 925; SI-NEXT: s_mov_b32 s4, s6 926; SI-NEXT: s_mov_b32 s5, s6 927; SI-NEXT: buffer_atomic_add_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 928; SI-NEXT: s_waitcnt vmcnt(0) 929; SI-NEXT: buffer_wbinvl1 930; SI-NEXT: s_waitcnt expcnt(0) 931; SI-NEXT: s_setpc_b64 s[30:31] 932; 933; VI-LABEL: global_atomic_add_i64_noret_offset: 934; VI: ; %bb.0: 935; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 936; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 937; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 938; VI-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] 939; VI-NEXT: s_waitcnt vmcnt(0) 940; VI-NEXT: buffer_wbinvl1_vol 941; VI-NEXT: s_setpc_b64 s[30:31] 942; 943; GFX9-LABEL: global_atomic_add_i64_noret_offset: 944; GFX9: ; %bb.0: 945; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 946; GFX9-NEXT: global_atomic_add_x2 v[0:1], v[2:3], off offset:32 947; GFX9-NEXT: s_waitcnt vmcnt(0) 948; GFX9-NEXT: buffer_wbinvl1_vol 949; GFX9-NEXT: s_setpc_b64 s[30:31] 950 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 951 %tmp0 = atomicrmw add ptr addrspace(1) %gep, i64 %in seq_cst 952 ret void 953} 954 955define i64 @global_atomic_add_i64_ret(ptr addrspace(1) %ptr, i64 %in) { 956; SI-LABEL: global_atomic_add_i64_ret: 957; SI: ; %bb.0: 958; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 959; SI-NEXT: s_mov_b32 s6, 0 960; SI-NEXT: s_mov_b32 s7, 0xf000 961; SI-NEXT: s_mov_b32 s4, s6 962; SI-NEXT: s_mov_b32 s5, s6 963; SI-NEXT: buffer_atomic_add_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc 964; SI-NEXT: s_waitcnt vmcnt(0) 965; SI-NEXT: buffer_wbinvl1 966; SI-NEXT: v_mov_b32_e32 v0, v2 967; SI-NEXT: v_mov_b32_e32 v1, v3 968; SI-NEXT: s_waitcnt expcnt(0) 969; SI-NEXT: s_setpc_b64 s[30:31] 970; 971; VI-LABEL: global_atomic_add_i64_ret: 972; VI: ; %bb.0: 973; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 974; VI-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc 975; VI-NEXT: s_waitcnt vmcnt(0) 976; VI-NEXT: buffer_wbinvl1_vol 977; VI-NEXT: s_setpc_b64 s[30:31] 978; 979; GFX9-LABEL: global_atomic_add_i64_ret: 980; GFX9: ; %bb.0: 981; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 982; GFX9-NEXT: global_atomic_add_x2 v[0:1], v[0:1], v[2:3], off glc 983; GFX9-NEXT: s_waitcnt vmcnt(0) 984; GFX9-NEXT: buffer_wbinvl1_vol 985; GFX9-NEXT: s_setpc_b64 s[30:31] 986 %result = atomicrmw add ptr addrspace(1) %ptr, i64 %in seq_cst 987 ret i64 %result 988} 989 990define i64 @global_atomic_add_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { 991; SI-LABEL: global_atomic_add_i64_ret_offset: 992; SI: ; %bb.0: 993; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 994; SI-NEXT: s_mov_b32 s6, 0 995; SI-NEXT: s_mov_b32 s7, 0xf000 996; SI-NEXT: s_mov_b32 s4, s6 997; SI-NEXT: s_mov_b32 s5, s6 998; SI-NEXT: buffer_atomic_add_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc 999; SI-NEXT: s_waitcnt vmcnt(0) 1000; SI-NEXT: buffer_wbinvl1 1001; SI-NEXT: v_mov_b32_e32 v0, v2 1002; SI-NEXT: v_mov_b32_e32 v1, v3 1003; SI-NEXT: s_waitcnt expcnt(0) 1004; SI-NEXT: s_setpc_b64 s[30:31] 1005; 1006; VI-LABEL: global_atomic_add_i64_ret_offset: 1007; VI: ; %bb.0: 1008; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1009; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 1010; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1011; VI-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc 1012; VI-NEXT: s_waitcnt vmcnt(0) 1013; VI-NEXT: buffer_wbinvl1_vol 1014; VI-NEXT: s_setpc_b64 s[30:31] 1015; 1016; GFX9-LABEL: global_atomic_add_i64_ret_offset: 1017; GFX9: ; %bb.0: 1018; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1019; GFX9-NEXT: global_atomic_add_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc 1020; GFX9-NEXT: s_waitcnt vmcnt(0) 1021; GFX9-NEXT: buffer_wbinvl1_vol 1022; GFX9-NEXT: s_setpc_b64 s[30:31] 1023 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 1024 %result = atomicrmw add ptr addrspace(1) %gep, i64 %in seq_cst 1025 ret i64 %result 1026} 1027 1028define amdgpu_gfx void @global_atomic_add_i64_noret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) { 1029; SI-LABEL: global_atomic_add_i64_noret_scalar: 1030; SI: ; %bb.0: 1031; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1032; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 1033; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill 1034; SI-NEXT: s_mov_b64 exec, s[34:35] 1035; SI-NEXT: s_waitcnt expcnt(0) 1036; SI-NEXT: v_writelane_b32 v2, s6, 0 1037; SI-NEXT: v_writelane_b32 v2, s7, 1 1038; SI-NEXT: s_mov_b32 s34, s7 1039; SI-NEXT: s_mov_b32 s35, s6 1040; SI-NEXT: s_mov_b32 s7, 0xf000 1041; SI-NEXT: s_mov_b32 s6, -1 1042; SI-NEXT: v_mov_b32_e32 v0, s35 1043; SI-NEXT: v_mov_b32_e32 v1, s34 1044; SI-NEXT: s_waitcnt vmcnt(0) 1045; SI-NEXT: buffer_atomic_add_x2 v[0:1], off, s[4:7], 0 1046; SI-NEXT: s_waitcnt vmcnt(0) 1047; SI-NEXT: buffer_wbinvl1 1048; SI-NEXT: v_readlane_b32 s7, v2, 1 1049; SI-NEXT: v_readlane_b32 s6, v2, 0 1050; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 1051; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload 1052; SI-NEXT: s_mov_b64 exec, s[34:35] 1053; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 1054; SI-NEXT: s_setpc_b64 s[30:31] 1055; 1056; VI-LABEL: global_atomic_add_i64_noret_scalar: 1057; VI: ; %bb.0: 1058; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1059; VI-NEXT: v_mov_b32_e32 v0, s6 1060; VI-NEXT: v_mov_b32_e32 v1, s7 1061; VI-NEXT: v_mov_b32_e32 v2, s4 1062; VI-NEXT: v_mov_b32_e32 v3, s5 1063; VI-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] 1064; VI-NEXT: s_waitcnt vmcnt(0) 1065; VI-NEXT: buffer_wbinvl1_vol 1066; VI-NEXT: s_setpc_b64 s[30:31] 1067; 1068; GFX9-LABEL: global_atomic_add_i64_noret_scalar: 1069; GFX9: ; %bb.0: 1070; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1071; GFX9-NEXT: v_mov_b32_e32 v0, s6 1072; GFX9-NEXT: v_mov_b32_e32 v1, s7 1073; GFX9-NEXT: v_mov_b32_e32 v2, 0 1074; GFX9-NEXT: global_atomic_add_x2 v2, v[0:1], s[4:5] 1075; GFX9-NEXT: s_waitcnt vmcnt(0) 1076; GFX9-NEXT: buffer_wbinvl1_vol 1077; GFX9-NEXT: s_setpc_b64 s[30:31] 1078 %tmp0 = atomicrmw add ptr addrspace(1) %ptr, i64 %in seq_cst 1079 ret void 1080} 1081 1082define amdgpu_gfx void @global_atomic_add_i64_noret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) { 1083; SI-LABEL: global_atomic_add_i64_noret_offset_scalar: 1084; SI: ; %bb.0: 1085; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1086; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 1087; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill 1088; SI-NEXT: s_mov_b64 exec, s[34:35] 1089; SI-NEXT: s_waitcnt expcnt(0) 1090; SI-NEXT: v_writelane_b32 v2, s6, 0 1091; SI-NEXT: v_writelane_b32 v2, s7, 1 1092; SI-NEXT: v_mov_b32_e32 v0, s6 1093; SI-NEXT: v_mov_b32_e32 v1, s7 1094; SI-NEXT: s_mov_b32 s7, 0xf000 1095; SI-NEXT: s_mov_b32 s6, -1 1096; SI-NEXT: s_waitcnt vmcnt(0) 1097; SI-NEXT: buffer_atomic_add_x2 v[0:1], off, s[4:7], 0 offset:32 1098; SI-NEXT: s_waitcnt vmcnt(0) 1099; SI-NEXT: buffer_wbinvl1 1100; SI-NEXT: v_readlane_b32 s7, v2, 1 1101; SI-NEXT: v_readlane_b32 s6, v2, 0 1102; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 1103; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload 1104; SI-NEXT: s_mov_b64 exec, s[34:35] 1105; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 1106; SI-NEXT: s_setpc_b64 s[30:31] 1107; 1108; VI-LABEL: global_atomic_add_i64_noret_offset_scalar: 1109; VI: ; %bb.0: 1110; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1111; VI-NEXT: s_add_u32 s34, s4, 32 1112; VI-NEXT: s_addc_u32 s35, s5, 0 1113; VI-NEXT: v_mov_b32_e32 v2, s34 1114; VI-NEXT: v_mov_b32_e32 v0, s6 1115; VI-NEXT: v_mov_b32_e32 v1, s7 1116; VI-NEXT: v_mov_b32_e32 v3, s35 1117; VI-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] 1118; VI-NEXT: s_waitcnt vmcnt(0) 1119; VI-NEXT: buffer_wbinvl1_vol 1120; VI-NEXT: s_setpc_b64 s[30:31] 1121; 1122; GFX9-LABEL: global_atomic_add_i64_noret_offset_scalar: 1123; GFX9: ; %bb.0: 1124; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1125; GFX9-NEXT: v_mov_b32_e32 v0, s6 1126; GFX9-NEXT: v_mov_b32_e32 v1, s7 1127; GFX9-NEXT: v_mov_b32_e32 v2, 0 1128; GFX9-NEXT: global_atomic_add_x2 v2, v[0:1], s[4:5] offset:32 1129; GFX9-NEXT: s_waitcnt vmcnt(0) 1130; GFX9-NEXT: buffer_wbinvl1_vol 1131; GFX9-NEXT: s_setpc_b64 s[30:31] 1132 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 1133 %tmp0 = atomicrmw add ptr addrspace(1) %gep, i64 %in seq_cst 1134 ret void 1135} 1136 1137define amdgpu_gfx i64 @global_atomic_add_i64_ret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) { 1138; SI-LABEL: global_atomic_add_i64_ret_scalar: 1139; SI: ; %bb.0: 1140; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1141; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 1142; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill 1143; SI-NEXT: s_mov_b64 exec, s[34:35] 1144; SI-NEXT: s_waitcnt expcnt(0) 1145; SI-NEXT: v_writelane_b32 v2, s6, 0 1146; SI-NEXT: v_writelane_b32 v2, s7, 1 1147; SI-NEXT: s_mov_b32 s34, s7 1148; SI-NEXT: s_mov_b32 s35, s6 1149; SI-NEXT: s_mov_b32 s7, 0xf000 1150; SI-NEXT: s_mov_b32 s6, -1 1151; SI-NEXT: v_mov_b32_e32 v0, s35 1152; SI-NEXT: v_mov_b32_e32 v1, s34 1153; SI-NEXT: s_waitcnt vmcnt(0) 1154; SI-NEXT: buffer_atomic_add_x2 v[0:1], off, s[4:7], 0 glc 1155; SI-NEXT: s_waitcnt vmcnt(0) 1156; SI-NEXT: buffer_wbinvl1 1157; SI-NEXT: v_readlane_b32 s7, v2, 1 1158; SI-NEXT: v_readlane_b32 s6, v2, 0 1159; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 1160; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload 1161; SI-NEXT: s_mov_b64 exec, s[34:35] 1162; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 1163; SI-NEXT: s_setpc_b64 s[30:31] 1164; 1165; VI-LABEL: global_atomic_add_i64_ret_scalar: 1166; VI: ; %bb.0: 1167; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1168; VI-NEXT: v_mov_b32_e32 v0, s6 1169; VI-NEXT: v_mov_b32_e32 v1, s7 1170; VI-NEXT: v_mov_b32_e32 v2, s4 1171; VI-NEXT: v_mov_b32_e32 v3, s5 1172; VI-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc 1173; VI-NEXT: s_waitcnt vmcnt(0) 1174; VI-NEXT: buffer_wbinvl1_vol 1175; VI-NEXT: s_setpc_b64 s[30:31] 1176; 1177; GFX9-LABEL: global_atomic_add_i64_ret_scalar: 1178; GFX9: ; %bb.0: 1179; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1180; GFX9-NEXT: v_mov_b32_e32 v0, s6 1181; GFX9-NEXT: v_mov_b32_e32 v1, s7 1182; GFX9-NEXT: v_mov_b32_e32 v2, 0 1183; GFX9-NEXT: global_atomic_add_x2 v[0:1], v2, v[0:1], s[4:5] glc 1184; GFX9-NEXT: s_waitcnt vmcnt(0) 1185; GFX9-NEXT: buffer_wbinvl1_vol 1186; GFX9-NEXT: s_setpc_b64 s[30:31] 1187 %result = atomicrmw add ptr addrspace(1) %ptr, i64 %in seq_cst 1188 ret i64 %result 1189} 1190 1191define amdgpu_gfx i64 @global_atomic_add_i64_ret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) { 1192; SI-LABEL: global_atomic_add_i64_ret_offset_scalar: 1193; SI: ; %bb.0: 1194; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1195; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 1196; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill 1197; SI-NEXT: s_mov_b64 exec, s[34:35] 1198; SI-NEXT: s_waitcnt expcnt(0) 1199; SI-NEXT: v_writelane_b32 v2, s6, 0 1200; SI-NEXT: v_writelane_b32 v2, s7, 1 1201; SI-NEXT: v_mov_b32_e32 v0, s6 1202; SI-NEXT: v_mov_b32_e32 v1, s7 1203; SI-NEXT: s_mov_b32 s7, 0xf000 1204; SI-NEXT: s_mov_b32 s6, -1 1205; SI-NEXT: s_waitcnt vmcnt(0) 1206; SI-NEXT: buffer_atomic_add_x2 v[0:1], off, s[4:7], 0 offset:32 glc 1207; SI-NEXT: s_waitcnt vmcnt(0) 1208; SI-NEXT: buffer_wbinvl1 1209; SI-NEXT: v_readlane_b32 s7, v2, 1 1210; SI-NEXT: v_readlane_b32 s6, v2, 0 1211; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 1212; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload 1213; SI-NEXT: s_mov_b64 exec, s[34:35] 1214; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 1215; SI-NEXT: s_setpc_b64 s[30:31] 1216; 1217; VI-LABEL: global_atomic_add_i64_ret_offset_scalar: 1218; VI: ; %bb.0: 1219; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1220; VI-NEXT: s_add_u32 s34, s4, 32 1221; VI-NEXT: s_addc_u32 s35, s5, 0 1222; VI-NEXT: v_mov_b32_e32 v2, s34 1223; VI-NEXT: v_mov_b32_e32 v0, s6 1224; VI-NEXT: v_mov_b32_e32 v1, s7 1225; VI-NEXT: v_mov_b32_e32 v3, s35 1226; VI-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc 1227; VI-NEXT: s_waitcnt vmcnt(0) 1228; VI-NEXT: buffer_wbinvl1_vol 1229; VI-NEXT: s_setpc_b64 s[30:31] 1230; 1231; GFX9-LABEL: global_atomic_add_i64_ret_offset_scalar: 1232; GFX9: ; %bb.0: 1233; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1234; GFX9-NEXT: v_mov_b32_e32 v0, s6 1235; GFX9-NEXT: v_mov_b32_e32 v1, s7 1236; GFX9-NEXT: v_mov_b32_e32 v2, 0 1237; GFX9-NEXT: global_atomic_add_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc 1238; GFX9-NEXT: s_waitcnt vmcnt(0) 1239; GFX9-NEXT: buffer_wbinvl1_vol 1240; GFX9-NEXT: s_setpc_b64 s[30:31] 1241 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 1242 %result = atomicrmw add ptr addrspace(1) %gep, i64 %in seq_cst 1243 ret i64 %result 1244} 1245 1246define void @global_atomic_add_i64_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) { 1247; SI-LABEL: global_atomic_add_i64_noret_offset__amdgpu_no_remote_memory: 1248; SI: ; %bb.0: 1249; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1250; SI-NEXT: s_mov_b32 s6, 0 1251; SI-NEXT: s_mov_b32 s7, 0xf000 1252; SI-NEXT: s_mov_b32 s4, s6 1253; SI-NEXT: s_mov_b32 s5, s6 1254; SI-NEXT: buffer_atomic_add_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 1255; SI-NEXT: s_waitcnt vmcnt(0) 1256; SI-NEXT: buffer_wbinvl1 1257; SI-NEXT: s_waitcnt expcnt(0) 1258; SI-NEXT: s_setpc_b64 s[30:31] 1259; 1260; VI-LABEL: global_atomic_add_i64_noret_offset__amdgpu_no_remote_memory: 1261; VI: ; %bb.0: 1262; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1263; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 1264; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1265; VI-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] 1266; VI-NEXT: s_waitcnt vmcnt(0) 1267; VI-NEXT: buffer_wbinvl1_vol 1268; VI-NEXT: s_setpc_b64 s[30:31] 1269; 1270; GFX9-LABEL: global_atomic_add_i64_noret_offset__amdgpu_no_remote_memory: 1271; GFX9: ; %bb.0: 1272; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1273; GFX9-NEXT: global_atomic_add_x2 v[0:1], v[2:3], off offset:32 1274; GFX9-NEXT: s_waitcnt vmcnt(0) 1275; GFX9-NEXT: buffer_wbinvl1_vol 1276; GFX9-NEXT: s_setpc_b64 s[30:31] 1277 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 1278 %tmp0 = atomicrmw add ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 1279 ret void 1280} 1281 1282define i64 @global_atomic_add_i64_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) { 1283; SI-LABEL: global_atomic_add_i64_ret_offset__amdgpu_no_remote_memory: 1284; SI: ; %bb.0: 1285; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1286; SI-NEXT: s_mov_b32 s6, 0 1287; SI-NEXT: s_mov_b32 s7, 0xf000 1288; SI-NEXT: s_mov_b32 s4, s6 1289; SI-NEXT: s_mov_b32 s5, s6 1290; SI-NEXT: buffer_atomic_add_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc 1291; SI-NEXT: s_waitcnt vmcnt(0) 1292; SI-NEXT: buffer_wbinvl1 1293; SI-NEXT: v_mov_b32_e32 v0, v2 1294; SI-NEXT: v_mov_b32_e32 v1, v3 1295; SI-NEXT: s_waitcnt expcnt(0) 1296; SI-NEXT: s_setpc_b64 s[30:31] 1297; 1298; VI-LABEL: global_atomic_add_i64_ret_offset__amdgpu_no_remote_memory: 1299; VI: ; %bb.0: 1300; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1301; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 1302; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1303; VI-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc 1304; VI-NEXT: s_waitcnt vmcnt(0) 1305; VI-NEXT: buffer_wbinvl1_vol 1306; VI-NEXT: s_setpc_b64 s[30:31] 1307; 1308; GFX9-LABEL: global_atomic_add_i64_ret_offset__amdgpu_no_remote_memory: 1309; GFX9: ; %bb.0: 1310; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1311; GFX9-NEXT: global_atomic_add_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc 1312; GFX9-NEXT: s_waitcnt vmcnt(0) 1313; GFX9-NEXT: buffer_wbinvl1_vol 1314; GFX9-NEXT: s_setpc_b64 s[30:31] 1315 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 1316 %result = atomicrmw add ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 1317 ret i64 %result 1318} 1319 1320; --------------------------------------------------------------------- 1321; atomicrmw sub 1322; --------------------------------------------------------------------- 1323 1324define void @global_atomic_sub_i64_noret(ptr addrspace(1) %ptr, i64 %in) { 1325; SI-LABEL: global_atomic_sub_i64_noret: 1326; SI: ; %bb.0: 1327; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1328; SI-NEXT: s_mov_b32 s6, 0 1329; SI-NEXT: s_mov_b32 s7, 0xf000 1330; SI-NEXT: s_mov_b32 s4, s6 1331; SI-NEXT: s_mov_b32 s5, s6 1332; SI-NEXT: buffer_atomic_sub_x2 v[2:3], v[0:1], s[4:7], 0 addr64 1333; SI-NEXT: s_waitcnt vmcnt(0) 1334; SI-NEXT: buffer_wbinvl1 1335; SI-NEXT: s_waitcnt expcnt(0) 1336; SI-NEXT: s_setpc_b64 s[30:31] 1337; 1338; VI-LABEL: global_atomic_sub_i64_noret: 1339; VI: ; %bb.0: 1340; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1341; VI-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] 1342; VI-NEXT: s_waitcnt vmcnt(0) 1343; VI-NEXT: buffer_wbinvl1_vol 1344; VI-NEXT: s_setpc_b64 s[30:31] 1345; 1346; GFX9-LABEL: global_atomic_sub_i64_noret: 1347; GFX9: ; %bb.0: 1348; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1349; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v[2:3], off 1350; GFX9-NEXT: s_waitcnt vmcnt(0) 1351; GFX9-NEXT: buffer_wbinvl1_vol 1352; GFX9-NEXT: s_setpc_b64 s[30:31] 1353 %tmp0 = atomicrmw sub ptr addrspace(1) %ptr, i64 %in seq_cst 1354 ret void 1355} 1356 1357define void @global_atomic_sub_i64_noret_offset(ptr addrspace(1) %out, i64 %in) { 1358; SI-LABEL: global_atomic_sub_i64_noret_offset: 1359; SI: ; %bb.0: 1360; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1361; SI-NEXT: s_mov_b32 s6, 0 1362; SI-NEXT: s_mov_b32 s7, 0xf000 1363; SI-NEXT: s_mov_b32 s4, s6 1364; SI-NEXT: s_mov_b32 s5, s6 1365; SI-NEXT: buffer_atomic_sub_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 1366; SI-NEXT: s_waitcnt vmcnt(0) 1367; SI-NEXT: buffer_wbinvl1 1368; SI-NEXT: s_waitcnt expcnt(0) 1369; SI-NEXT: s_setpc_b64 s[30:31] 1370; 1371; VI-LABEL: global_atomic_sub_i64_noret_offset: 1372; VI: ; %bb.0: 1373; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1374; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 1375; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1376; VI-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] 1377; VI-NEXT: s_waitcnt vmcnt(0) 1378; VI-NEXT: buffer_wbinvl1_vol 1379; VI-NEXT: s_setpc_b64 s[30:31] 1380; 1381; GFX9-LABEL: global_atomic_sub_i64_noret_offset: 1382; GFX9: ; %bb.0: 1383; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1384; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v[2:3], off offset:32 1385; GFX9-NEXT: s_waitcnt vmcnt(0) 1386; GFX9-NEXT: buffer_wbinvl1_vol 1387; GFX9-NEXT: s_setpc_b64 s[30:31] 1388 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 1389 %tmp0 = atomicrmw sub ptr addrspace(1) %gep, i64 %in seq_cst 1390 ret void 1391} 1392 1393define i64 @global_atomic_sub_i64_ret(ptr addrspace(1) %ptr, i64 %in) { 1394; SI-LABEL: global_atomic_sub_i64_ret: 1395; SI: ; %bb.0: 1396; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1397; SI-NEXT: s_mov_b32 s6, 0 1398; SI-NEXT: s_mov_b32 s7, 0xf000 1399; SI-NEXT: s_mov_b32 s4, s6 1400; SI-NEXT: s_mov_b32 s5, s6 1401; SI-NEXT: buffer_atomic_sub_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc 1402; SI-NEXT: s_waitcnt vmcnt(0) 1403; SI-NEXT: buffer_wbinvl1 1404; SI-NEXT: v_mov_b32_e32 v0, v2 1405; SI-NEXT: v_mov_b32_e32 v1, v3 1406; SI-NEXT: s_waitcnt expcnt(0) 1407; SI-NEXT: s_setpc_b64 s[30:31] 1408; 1409; VI-LABEL: global_atomic_sub_i64_ret: 1410; VI: ; %bb.0: 1411; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1412; VI-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc 1413; VI-NEXT: s_waitcnt vmcnt(0) 1414; VI-NEXT: buffer_wbinvl1_vol 1415; VI-NEXT: s_setpc_b64 s[30:31] 1416; 1417; GFX9-LABEL: global_atomic_sub_i64_ret: 1418; GFX9: ; %bb.0: 1419; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1420; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v[0:1], v[2:3], off glc 1421; GFX9-NEXT: s_waitcnt vmcnt(0) 1422; GFX9-NEXT: buffer_wbinvl1_vol 1423; GFX9-NEXT: s_setpc_b64 s[30:31] 1424 %result = atomicrmw sub ptr addrspace(1) %ptr, i64 %in seq_cst 1425 ret i64 %result 1426} 1427 1428define i64 @global_atomic_sub_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { 1429; SI-LABEL: global_atomic_sub_i64_ret_offset: 1430; SI: ; %bb.0: 1431; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1432; SI-NEXT: s_mov_b32 s6, 0 1433; SI-NEXT: s_mov_b32 s7, 0xf000 1434; SI-NEXT: s_mov_b32 s4, s6 1435; SI-NEXT: s_mov_b32 s5, s6 1436; SI-NEXT: buffer_atomic_sub_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc 1437; SI-NEXT: s_waitcnt vmcnt(0) 1438; SI-NEXT: buffer_wbinvl1 1439; SI-NEXT: v_mov_b32_e32 v0, v2 1440; SI-NEXT: v_mov_b32_e32 v1, v3 1441; SI-NEXT: s_waitcnt expcnt(0) 1442; SI-NEXT: s_setpc_b64 s[30:31] 1443; 1444; VI-LABEL: global_atomic_sub_i64_ret_offset: 1445; VI: ; %bb.0: 1446; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1447; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 1448; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1449; VI-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc 1450; VI-NEXT: s_waitcnt vmcnt(0) 1451; VI-NEXT: buffer_wbinvl1_vol 1452; VI-NEXT: s_setpc_b64 s[30:31] 1453; 1454; GFX9-LABEL: global_atomic_sub_i64_ret_offset: 1455; GFX9: ; %bb.0: 1456; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1457; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc 1458; GFX9-NEXT: s_waitcnt vmcnt(0) 1459; GFX9-NEXT: buffer_wbinvl1_vol 1460; GFX9-NEXT: s_setpc_b64 s[30:31] 1461 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 1462 %result = atomicrmw sub ptr addrspace(1) %gep, i64 %in seq_cst 1463 ret i64 %result 1464} 1465 1466define amdgpu_gfx void @global_atomic_sub_i64_noret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) { 1467; SI-LABEL: global_atomic_sub_i64_noret_scalar: 1468; SI: ; %bb.0: 1469; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1470; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 1471; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill 1472; SI-NEXT: s_mov_b64 exec, s[34:35] 1473; SI-NEXT: s_waitcnt expcnt(0) 1474; SI-NEXT: v_writelane_b32 v2, s6, 0 1475; SI-NEXT: v_writelane_b32 v2, s7, 1 1476; SI-NEXT: s_mov_b32 s34, s7 1477; SI-NEXT: s_mov_b32 s35, s6 1478; SI-NEXT: s_mov_b32 s7, 0xf000 1479; SI-NEXT: s_mov_b32 s6, -1 1480; SI-NEXT: v_mov_b32_e32 v0, s35 1481; SI-NEXT: v_mov_b32_e32 v1, s34 1482; SI-NEXT: s_waitcnt vmcnt(0) 1483; SI-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[4:7], 0 1484; SI-NEXT: s_waitcnt vmcnt(0) 1485; SI-NEXT: buffer_wbinvl1 1486; SI-NEXT: v_readlane_b32 s7, v2, 1 1487; SI-NEXT: v_readlane_b32 s6, v2, 0 1488; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 1489; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload 1490; SI-NEXT: s_mov_b64 exec, s[34:35] 1491; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 1492; SI-NEXT: s_setpc_b64 s[30:31] 1493; 1494; VI-LABEL: global_atomic_sub_i64_noret_scalar: 1495; VI: ; %bb.0: 1496; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1497; VI-NEXT: v_mov_b32_e32 v0, s6 1498; VI-NEXT: v_mov_b32_e32 v1, s7 1499; VI-NEXT: v_mov_b32_e32 v2, s4 1500; VI-NEXT: v_mov_b32_e32 v3, s5 1501; VI-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] 1502; VI-NEXT: s_waitcnt vmcnt(0) 1503; VI-NEXT: buffer_wbinvl1_vol 1504; VI-NEXT: s_setpc_b64 s[30:31] 1505; 1506; GFX9-LABEL: global_atomic_sub_i64_noret_scalar: 1507; GFX9: ; %bb.0: 1508; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1509; GFX9-NEXT: v_mov_b32_e32 v0, s6 1510; GFX9-NEXT: v_mov_b32_e32 v1, s7 1511; GFX9-NEXT: v_mov_b32_e32 v2, 0 1512; GFX9-NEXT: global_atomic_sub_x2 v2, v[0:1], s[4:5] 1513; GFX9-NEXT: s_waitcnt vmcnt(0) 1514; GFX9-NEXT: buffer_wbinvl1_vol 1515; GFX9-NEXT: s_setpc_b64 s[30:31] 1516 %tmp0 = atomicrmw sub ptr addrspace(1) %ptr, i64 %in seq_cst 1517 ret void 1518} 1519 1520define amdgpu_gfx void @global_atomic_sub_i64_noret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) { 1521; SI-LABEL: global_atomic_sub_i64_noret_offset_scalar: 1522; SI: ; %bb.0: 1523; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1524; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 1525; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill 1526; SI-NEXT: s_mov_b64 exec, s[34:35] 1527; SI-NEXT: s_waitcnt expcnt(0) 1528; SI-NEXT: v_writelane_b32 v2, s6, 0 1529; SI-NEXT: v_writelane_b32 v2, s7, 1 1530; SI-NEXT: v_mov_b32_e32 v0, s6 1531; SI-NEXT: v_mov_b32_e32 v1, s7 1532; SI-NEXT: s_mov_b32 s7, 0xf000 1533; SI-NEXT: s_mov_b32 s6, -1 1534; SI-NEXT: s_waitcnt vmcnt(0) 1535; SI-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[4:7], 0 offset:32 1536; SI-NEXT: s_waitcnt vmcnt(0) 1537; SI-NEXT: buffer_wbinvl1 1538; SI-NEXT: v_readlane_b32 s7, v2, 1 1539; SI-NEXT: v_readlane_b32 s6, v2, 0 1540; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 1541; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload 1542; SI-NEXT: s_mov_b64 exec, s[34:35] 1543; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 1544; SI-NEXT: s_setpc_b64 s[30:31] 1545; 1546; VI-LABEL: global_atomic_sub_i64_noret_offset_scalar: 1547; VI: ; %bb.0: 1548; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1549; VI-NEXT: s_add_u32 s34, s4, 32 1550; VI-NEXT: s_addc_u32 s35, s5, 0 1551; VI-NEXT: v_mov_b32_e32 v2, s34 1552; VI-NEXT: v_mov_b32_e32 v0, s6 1553; VI-NEXT: v_mov_b32_e32 v1, s7 1554; VI-NEXT: v_mov_b32_e32 v3, s35 1555; VI-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] 1556; VI-NEXT: s_waitcnt vmcnt(0) 1557; VI-NEXT: buffer_wbinvl1_vol 1558; VI-NEXT: s_setpc_b64 s[30:31] 1559; 1560; GFX9-LABEL: global_atomic_sub_i64_noret_offset_scalar: 1561; GFX9: ; %bb.0: 1562; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1563; GFX9-NEXT: v_mov_b32_e32 v0, s6 1564; GFX9-NEXT: v_mov_b32_e32 v1, s7 1565; GFX9-NEXT: v_mov_b32_e32 v2, 0 1566; GFX9-NEXT: global_atomic_sub_x2 v2, v[0:1], s[4:5] offset:32 1567; GFX9-NEXT: s_waitcnt vmcnt(0) 1568; GFX9-NEXT: buffer_wbinvl1_vol 1569; GFX9-NEXT: s_setpc_b64 s[30:31] 1570 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 1571 %tmp0 = atomicrmw sub ptr addrspace(1) %gep, i64 %in seq_cst 1572 ret void 1573} 1574 1575define amdgpu_gfx i64 @global_atomic_sub_i64_ret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) { 1576; SI-LABEL: global_atomic_sub_i64_ret_scalar: 1577; SI: ; %bb.0: 1578; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1579; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 1580; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill 1581; SI-NEXT: s_mov_b64 exec, s[34:35] 1582; SI-NEXT: s_waitcnt expcnt(0) 1583; SI-NEXT: v_writelane_b32 v2, s6, 0 1584; SI-NEXT: v_writelane_b32 v2, s7, 1 1585; SI-NEXT: s_mov_b32 s34, s7 1586; SI-NEXT: s_mov_b32 s35, s6 1587; SI-NEXT: s_mov_b32 s7, 0xf000 1588; SI-NEXT: s_mov_b32 s6, -1 1589; SI-NEXT: v_mov_b32_e32 v0, s35 1590; SI-NEXT: v_mov_b32_e32 v1, s34 1591; SI-NEXT: s_waitcnt vmcnt(0) 1592; SI-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[4:7], 0 glc 1593; SI-NEXT: s_waitcnt vmcnt(0) 1594; SI-NEXT: buffer_wbinvl1 1595; SI-NEXT: v_readlane_b32 s7, v2, 1 1596; SI-NEXT: v_readlane_b32 s6, v2, 0 1597; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 1598; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload 1599; SI-NEXT: s_mov_b64 exec, s[34:35] 1600; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 1601; SI-NEXT: s_setpc_b64 s[30:31] 1602; 1603; VI-LABEL: global_atomic_sub_i64_ret_scalar: 1604; VI: ; %bb.0: 1605; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1606; VI-NEXT: v_mov_b32_e32 v0, s6 1607; VI-NEXT: v_mov_b32_e32 v1, s7 1608; VI-NEXT: v_mov_b32_e32 v2, s4 1609; VI-NEXT: v_mov_b32_e32 v3, s5 1610; VI-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc 1611; VI-NEXT: s_waitcnt vmcnt(0) 1612; VI-NEXT: buffer_wbinvl1_vol 1613; VI-NEXT: s_setpc_b64 s[30:31] 1614; 1615; GFX9-LABEL: global_atomic_sub_i64_ret_scalar: 1616; GFX9: ; %bb.0: 1617; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1618; GFX9-NEXT: v_mov_b32_e32 v0, s6 1619; GFX9-NEXT: v_mov_b32_e32 v1, s7 1620; GFX9-NEXT: v_mov_b32_e32 v2, 0 1621; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v2, v[0:1], s[4:5] glc 1622; GFX9-NEXT: s_waitcnt vmcnt(0) 1623; GFX9-NEXT: buffer_wbinvl1_vol 1624; GFX9-NEXT: s_setpc_b64 s[30:31] 1625 %result = atomicrmw sub ptr addrspace(1) %ptr, i64 %in seq_cst 1626 ret i64 %result 1627} 1628 1629define amdgpu_gfx i64 @global_atomic_sub_i64_ret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) { 1630; SI-LABEL: global_atomic_sub_i64_ret_offset_scalar: 1631; SI: ; %bb.0: 1632; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1633; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 1634; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill 1635; SI-NEXT: s_mov_b64 exec, s[34:35] 1636; SI-NEXT: s_waitcnt expcnt(0) 1637; SI-NEXT: v_writelane_b32 v2, s6, 0 1638; SI-NEXT: v_writelane_b32 v2, s7, 1 1639; SI-NEXT: v_mov_b32_e32 v0, s6 1640; SI-NEXT: v_mov_b32_e32 v1, s7 1641; SI-NEXT: s_mov_b32 s7, 0xf000 1642; SI-NEXT: s_mov_b32 s6, -1 1643; SI-NEXT: s_waitcnt vmcnt(0) 1644; SI-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[4:7], 0 offset:32 glc 1645; SI-NEXT: s_waitcnt vmcnt(0) 1646; SI-NEXT: buffer_wbinvl1 1647; SI-NEXT: v_readlane_b32 s7, v2, 1 1648; SI-NEXT: v_readlane_b32 s6, v2, 0 1649; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 1650; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload 1651; SI-NEXT: s_mov_b64 exec, s[34:35] 1652; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 1653; SI-NEXT: s_setpc_b64 s[30:31] 1654; 1655; VI-LABEL: global_atomic_sub_i64_ret_offset_scalar: 1656; VI: ; %bb.0: 1657; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1658; VI-NEXT: s_add_u32 s34, s4, 32 1659; VI-NEXT: s_addc_u32 s35, s5, 0 1660; VI-NEXT: v_mov_b32_e32 v2, s34 1661; VI-NEXT: v_mov_b32_e32 v0, s6 1662; VI-NEXT: v_mov_b32_e32 v1, s7 1663; VI-NEXT: v_mov_b32_e32 v3, s35 1664; VI-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc 1665; VI-NEXT: s_waitcnt vmcnt(0) 1666; VI-NEXT: buffer_wbinvl1_vol 1667; VI-NEXT: s_setpc_b64 s[30:31] 1668; 1669; GFX9-LABEL: global_atomic_sub_i64_ret_offset_scalar: 1670; GFX9: ; %bb.0: 1671; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1672; GFX9-NEXT: v_mov_b32_e32 v0, s6 1673; GFX9-NEXT: v_mov_b32_e32 v1, s7 1674; GFX9-NEXT: v_mov_b32_e32 v2, 0 1675; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc 1676; GFX9-NEXT: s_waitcnt vmcnt(0) 1677; GFX9-NEXT: buffer_wbinvl1_vol 1678; GFX9-NEXT: s_setpc_b64 s[30:31] 1679 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 1680 %result = atomicrmw sub ptr addrspace(1) %gep, i64 %in seq_cst 1681 ret i64 %result 1682} 1683 1684define void @global_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) { 1685; SI-LABEL: global_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory: 1686; SI: ; %bb.0: 1687; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1688; SI-NEXT: s_mov_b32 s6, 0 1689; SI-NEXT: s_mov_b32 s7, 0xf000 1690; SI-NEXT: s_mov_b32 s4, s6 1691; SI-NEXT: s_mov_b32 s5, s6 1692; SI-NEXT: buffer_atomic_sub_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 1693; SI-NEXT: s_waitcnt vmcnt(0) 1694; SI-NEXT: buffer_wbinvl1 1695; SI-NEXT: s_waitcnt expcnt(0) 1696; SI-NEXT: s_setpc_b64 s[30:31] 1697; 1698; VI-LABEL: global_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory: 1699; VI: ; %bb.0: 1700; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1701; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 1702; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1703; VI-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] 1704; VI-NEXT: s_waitcnt vmcnt(0) 1705; VI-NEXT: buffer_wbinvl1_vol 1706; VI-NEXT: s_setpc_b64 s[30:31] 1707; 1708; GFX9-LABEL: global_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory: 1709; GFX9: ; %bb.0: 1710; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1711; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v[2:3], off offset:32 1712; GFX9-NEXT: s_waitcnt vmcnt(0) 1713; GFX9-NEXT: buffer_wbinvl1_vol 1714; GFX9-NEXT: s_setpc_b64 s[30:31] 1715 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 1716 %tmp0 = atomicrmw sub ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 1717 ret void 1718} 1719 1720define i64 @global_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) { 1721; SI-LABEL: global_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory: 1722; SI: ; %bb.0: 1723; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1724; SI-NEXT: s_mov_b32 s6, 0 1725; SI-NEXT: s_mov_b32 s7, 0xf000 1726; SI-NEXT: s_mov_b32 s4, s6 1727; SI-NEXT: s_mov_b32 s5, s6 1728; SI-NEXT: buffer_atomic_sub_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc 1729; SI-NEXT: s_waitcnt vmcnt(0) 1730; SI-NEXT: buffer_wbinvl1 1731; SI-NEXT: v_mov_b32_e32 v0, v2 1732; SI-NEXT: v_mov_b32_e32 v1, v3 1733; SI-NEXT: s_waitcnt expcnt(0) 1734; SI-NEXT: s_setpc_b64 s[30:31] 1735; 1736; VI-LABEL: global_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory: 1737; VI: ; %bb.0: 1738; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1739; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 1740; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1741; VI-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc 1742; VI-NEXT: s_waitcnt vmcnt(0) 1743; VI-NEXT: buffer_wbinvl1_vol 1744; VI-NEXT: s_setpc_b64 s[30:31] 1745; 1746; GFX9-LABEL: global_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory: 1747; GFX9: ; %bb.0: 1748; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1749; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc 1750; GFX9-NEXT: s_waitcnt vmcnt(0) 1751; GFX9-NEXT: buffer_wbinvl1_vol 1752; GFX9-NEXT: s_setpc_b64 s[30:31] 1753 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 1754 %result = atomicrmw sub ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 1755 ret i64 %result 1756} 1757 1758; --------------------------------------------------------------------- 1759; atomicrmw and 1760; --------------------------------------------------------------------- 1761 1762define void @global_atomic_and_i64_noret(ptr addrspace(1) %ptr, i64 %in) { 1763; SI-LABEL: global_atomic_and_i64_noret: 1764; SI: ; %bb.0: 1765; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1766; SI-NEXT: s_mov_b32 s6, 0 1767; SI-NEXT: s_mov_b32 s7, 0xf000 1768; SI-NEXT: s_mov_b32 s4, s6 1769; SI-NEXT: s_mov_b32 s5, s6 1770; SI-NEXT: buffer_atomic_and_x2 v[2:3], v[0:1], s[4:7], 0 addr64 1771; SI-NEXT: s_waitcnt vmcnt(0) 1772; SI-NEXT: buffer_wbinvl1 1773; SI-NEXT: s_waitcnt expcnt(0) 1774; SI-NEXT: s_setpc_b64 s[30:31] 1775; 1776; VI-LABEL: global_atomic_and_i64_noret: 1777; VI: ; %bb.0: 1778; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1779; VI-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] 1780; VI-NEXT: s_waitcnt vmcnt(0) 1781; VI-NEXT: buffer_wbinvl1_vol 1782; VI-NEXT: s_setpc_b64 s[30:31] 1783; 1784; GFX9-LABEL: global_atomic_and_i64_noret: 1785; GFX9: ; %bb.0: 1786; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1787; GFX9-NEXT: global_atomic_and_x2 v[0:1], v[2:3], off 1788; GFX9-NEXT: s_waitcnt vmcnt(0) 1789; GFX9-NEXT: buffer_wbinvl1_vol 1790; GFX9-NEXT: s_setpc_b64 s[30:31] 1791 %tmp0 = atomicrmw and ptr addrspace(1) %ptr, i64 %in seq_cst 1792 ret void 1793} 1794 1795define void @global_atomic_and_i64_noret_offset(ptr addrspace(1) %out, i64 %in) { 1796; SI-LABEL: global_atomic_and_i64_noret_offset: 1797; SI: ; %bb.0: 1798; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1799; SI-NEXT: s_mov_b32 s6, 0 1800; SI-NEXT: s_mov_b32 s7, 0xf000 1801; SI-NEXT: s_mov_b32 s4, s6 1802; SI-NEXT: s_mov_b32 s5, s6 1803; SI-NEXT: buffer_atomic_and_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 1804; SI-NEXT: s_waitcnt vmcnt(0) 1805; SI-NEXT: buffer_wbinvl1 1806; SI-NEXT: s_waitcnt expcnt(0) 1807; SI-NEXT: s_setpc_b64 s[30:31] 1808; 1809; VI-LABEL: global_atomic_and_i64_noret_offset: 1810; VI: ; %bb.0: 1811; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1812; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 1813; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1814; VI-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] 1815; VI-NEXT: s_waitcnt vmcnt(0) 1816; VI-NEXT: buffer_wbinvl1_vol 1817; VI-NEXT: s_setpc_b64 s[30:31] 1818; 1819; GFX9-LABEL: global_atomic_and_i64_noret_offset: 1820; GFX9: ; %bb.0: 1821; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1822; GFX9-NEXT: global_atomic_and_x2 v[0:1], v[2:3], off offset:32 1823; GFX9-NEXT: s_waitcnt vmcnt(0) 1824; GFX9-NEXT: buffer_wbinvl1_vol 1825; GFX9-NEXT: s_setpc_b64 s[30:31] 1826 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 1827 %tmp0 = atomicrmw and ptr addrspace(1) %gep, i64 %in seq_cst 1828 ret void 1829} 1830 1831define i64 @global_atomic_and_i64_ret(ptr addrspace(1) %ptr, i64 %in) { 1832; SI-LABEL: global_atomic_and_i64_ret: 1833; SI: ; %bb.0: 1834; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1835; SI-NEXT: s_mov_b32 s6, 0 1836; SI-NEXT: s_mov_b32 s7, 0xf000 1837; SI-NEXT: s_mov_b32 s4, s6 1838; SI-NEXT: s_mov_b32 s5, s6 1839; SI-NEXT: buffer_atomic_and_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc 1840; SI-NEXT: s_waitcnt vmcnt(0) 1841; SI-NEXT: buffer_wbinvl1 1842; SI-NEXT: v_mov_b32_e32 v0, v2 1843; SI-NEXT: v_mov_b32_e32 v1, v3 1844; SI-NEXT: s_waitcnt expcnt(0) 1845; SI-NEXT: s_setpc_b64 s[30:31] 1846; 1847; VI-LABEL: global_atomic_and_i64_ret: 1848; VI: ; %bb.0: 1849; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1850; VI-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc 1851; VI-NEXT: s_waitcnt vmcnt(0) 1852; VI-NEXT: buffer_wbinvl1_vol 1853; VI-NEXT: s_setpc_b64 s[30:31] 1854; 1855; GFX9-LABEL: global_atomic_and_i64_ret: 1856; GFX9: ; %bb.0: 1857; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1858; GFX9-NEXT: global_atomic_and_x2 v[0:1], v[0:1], v[2:3], off glc 1859; GFX9-NEXT: s_waitcnt vmcnt(0) 1860; GFX9-NEXT: buffer_wbinvl1_vol 1861; GFX9-NEXT: s_setpc_b64 s[30:31] 1862 %result = atomicrmw and ptr addrspace(1) %ptr, i64 %in seq_cst 1863 ret i64 %result 1864} 1865 1866define i64 @global_atomic_and_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { 1867; SI-LABEL: global_atomic_and_i64_ret_offset: 1868; SI: ; %bb.0: 1869; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1870; SI-NEXT: s_mov_b32 s6, 0 1871; SI-NEXT: s_mov_b32 s7, 0xf000 1872; SI-NEXT: s_mov_b32 s4, s6 1873; SI-NEXT: s_mov_b32 s5, s6 1874; SI-NEXT: buffer_atomic_and_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc 1875; SI-NEXT: s_waitcnt vmcnt(0) 1876; SI-NEXT: buffer_wbinvl1 1877; SI-NEXT: v_mov_b32_e32 v0, v2 1878; SI-NEXT: v_mov_b32_e32 v1, v3 1879; SI-NEXT: s_waitcnt expcnt(0) 1880; SI-NEXT: s_setpc_b64 s[30:31] 1881; 1882; VI-LABEL: global_atomic_and_i64_ret_offset: 1883; VI: ; %bb.0: 1884; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1885; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 1886; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1887; VI-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc 1888; VI-NEXT: s_waitcnt vmcnt(0) 1889; VI-NEXT: buffer_wbinvl1_vol 1890; VI-NEXT: s_setpc_b64 s[30:31] 1891; 1892; GFX9-LABEL: global_atomic_and_i64_ret_offset: 1893; GFX9: ; %bb.0: 1894; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1895; GFX9-NEXT: global_atomic_and_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc 1896; GFX9-NEXT: s_waitcnt vmcnt(0) 1897; GFX9-NEXT: buffer_wbinvl1_vol 1898; GFX9-NEXT: s_setpc_b64 s[30:31] 1899 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 1900 %result = atomicrmw and ptr addrspace(1) %gep, i64 %in seq_cst 1901 ret i64 %result 1902} 1903 1904define amdgpu_gfx void @global_atomic_and_i64_noret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) { 1905; SI-LABEL: global_atomic_and_i64_noret_scalar: 1906; SI: ; %bb.0: 1907; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1908; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 1909; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill 1910; SI-NEXT: s_mov_b64 exec, s[34:35] 1911; SI-NEXT: s_waitcnt expcnt(0) 1912; SI-NEXT: v_writelane_b32 v2, s6, 0 1913; SI-NEXT: v_writelane_b32 v2, s7, 1 1914; SI-NEXT: s_mov_b32 s34, s7 1915; SI-NEXT: s_mov_b32 s35, s6 1916; SI-NEXT: s_mov_b32 s7, 0xf000 1917; SI-NEXT: s_mov_b32 s6, -1 1918; SI-NEXT: v_mov_b32_e32 v0, s35 1919; SI-NEXT: v_mov_b32_e32 v1, s34 1920; SI-NEXT: s_waitcnt vmcnt(0) 1921; SI-NEXT: buffer_atomic_and_x2 v[0:1], off, s[4:7], 0 1922; SI-NEXT: s_waitcnt vmcnt(0) 1923; SI-NEXT: buffer_wbinvl1 1924; SI-NEXT: v_readlane_b32 s7, v2, 1 1925; SI-NEXT: v_readlane_b32 s6, v2, 0 1926; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 1927; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload 1928; SI-NEXT: s_mov_b64 exec, s[34:35] 1929; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 1930; SI-NEXT: s_setpc_b64 s[30:31] 1931; 1932; VI-LABEL: global_atomic_and_i64_noret_scalar: 1933; VI: ; %bb.0: 1934; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1935; VI-NEXT: v_mov_b32_e32 v0, s6 1936; VI-NEXT: v_mov_b32_e32 v1, s7 1937; VI-NEXT: v_mov_b32_e32 v2, s4 1938; VI-NEXT: v_mov_b32_e32 v3, s5 1939; VI-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] 1940; VI-NEXT: s_waitcnt vmcnt(0) 1941; VI-NEXT: buffer_wbinvl1_vol 1942; VI-NEXT: s_setpc_b64 s[30:31] 1943; 1944; GFX9-LABEL: global_atomic_and_i64_noret_scalar: 1945; GFX9: ; %bb.0: 1946; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1947; GFX9-NEXT: v_mov_b32_e32 v0, s6 1948; GFX9-NEXT: v_mov_b32_e32 v1, s7 1949; GFX9-NEXT: v_mov_b32_e32 v2, 0 1950; GFX9-NEXT: global_atomic_and_x2 v2, v[0:1], s[4:5] 1951; GFX9-NEXT: s_waitcnt vmcnt(0) 1952; GFX9-NEXT: buffer_wbinvl1_vol 1953; GFX9-NEXT: s_setpc_b64 s[30:31] 1954 %tmp0 = atomicrmw and ptr addrspace(1) %ptr, i64 %in seq_cst 1955 ret void 1956} 1957 1958define amdgpu_gfx void @global_atomic_and_i64_noret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) { 1959; SI-LABEL: global_atomic_and_i64_noret_offset_scalar: 1960; SI: ; %bb.0: 1961; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1962; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 1963; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill 1964; SI-NEXT: s_mov_b64 exec, s[34:35] 1965; SI-NEXT: s_waitcnt expcnt(0) 1966; SI-NEXT: v_writelane_b32 v2, s6, 0 1967; SI-NEXT: v_writelane_b32 v2, s7, 1 1968; SI-NEXT: v_mov_b32_e32 v0, s6 1969; SI-NEXT: v_mov_b32_e32 v1, s7 1970; SI-NEXT: s_mov_b32 s7, 0xf000 1971; SI-NEXT: s_mov_b32 s6, -1 1972; SI-NEXT: s_waitcnt vmcnt(0) 1973; SI-NEXT: buffer_atomic_and_x2 v[0:1], off, s[4:7], 0 offset:32 1974; SI-NEXT: s_waitcnt vmcnt(0) 1975; SI-NEXT: buffer_wbinvl1 1976; SI-NEXT: v_readlane_b32 s7, v2, 1 1977; SI-NEXT: v_readlane_b32 s6, v2, 0 1978; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 1979; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload 1980; SI-NEXT: s_mov_b64 exec, s[34:35] 1981; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 1982; SI-NEXT: s_setpc_b64 s[30:31] 1983; 1984; VI-LABEL: global_atomic_and_i64_noret_offset_scalar: 1985; VI: ; %bb.0: 1986; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1987; VI-NEXT: s_add_u32 s34, s4, 32 1988; VI-NEXT: s_addc_u32 s35, s5, 0 1989; VI-NEXT: v_mov_b32_e32 v2, s34 1990; VI-NEXT: v_mov_b32_e32 v0, s6 1991; VI-NEXT: v_mov_b32_e32 v1, s7 1992; VI-NEXT: v_mov_b32_e32 v3, s35 1993; VI-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] 1994; VI-NEXT: s_waitcnt vmcnt(0) 1995; VI-NEXT: buffer_wbinvl1_vol 1996; VI-NEXT: s_setpc_b64 s[30:31] 1997; 1998; GFX9-LABEL: global_atomic_and_i64_noret_offset_scalar: 1999; GFX9: ; %bb.0: 2000; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2001; GFX9-NEXT: v_mov_b32_e32 v0, s6 2002; GFX9-NEXT: v_mov_b32_e32 v1, s7 2003; GFX9-NEXT: v_mov_b32_e32 v2, 0 2004; GFX9-NEXT: global_atomic_and_x2 v2, v[0:1], s[4:5] offset:32 2005; GFX9-NEXT: s_waitcnt vmcnt(0) 2006; GFX9-NEXT: buffer_wbinvl1_vol 2007; GFX9-NEXT: s_setpc_b64 s[30:31] 2008 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 2009 %tmp0 = atomicrmw and ptr addrspace(1) %gep, i64 %in seq_cst 2010 ret void 2011} 2012 2013define amdgpu_gfx i64 @global_atomic_and_i64_ret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) { 2014; SI-LABEL: global_atomic_and_i64_ret_scalar: 2015; SI: ; %bb.0: 2016; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2017; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 2018; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill 2019; SI-NEXT: s_mov_b64 exec, s[34:35] 2020; SI-NEXT: s_waitcnt expcnt(0) 2021; SI-NEXT: v_writelane_b32 v2, s6, 0 2022; SI-NEXT: v_writelane_b32 v2, s7, 1 2023; SI-NEXT: s_mov_b32 s34, s7 2024; SI-NEXT: s_mov_b32 s35, s6 2025; SI-NEXT: s_mov_b32 s7, 0xf000 2026; SI-NEXT: s_mov_b32 s6, -1 2027; SI-NEXT: v_mov_b32_e32 v0, s35 2028; SI-NEXT: v_mov_b32_e32 v1, s34 2029; SI-NEXT: s_waitcnt vmcnt(0) 2030; SI-NEXT: buffer_atomic_and_x2 v[0:1], off, s[4:7], 0 glc 2031; SI-NEXT: s_waitcnt vmcnt(0) 2032; SI-NEXT: buffer_wbinvl1 2033; SI-NEXT: v_readlane_b32 s7, v2, 1 2034; SI-NEXT: v_readlane_b32 s6, v2, 0 2035; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 2036; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload 2037; SI-NEXT: s_mov_b64 exec, s[34:35] 2038; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 2039; SI-NEXT: s_setpc_b64 s[30:31] 2040; 2041; VI-LABEL: global_atomic_and_i64_ret_scalar: 2042; VI: ; %bb.0: 2043; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2044; VI-NEXT: v_mov_b32_e32 v0, s6 2045; VI-NEXT: v_mov_b32_e32 v1, s7 2046; VI-NEXT: v_mov_b32_e32 v2, s4 2047; VI-NEXT: v_mov_b32_e32 v3, s5 2048; VI-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc 2049; VI-NEXT: s_waitcnt vmcnt(0) 2050; VI-NEXT: buffer_wbinvl1_vol 2051; VI-NEXT: s_setpc_b64 s[30:31] 2052; 2053; GFX9-LABEL: global_atomic_and_i64_ret_scalar: 2054; GFX9: ; %bb.0: 2055; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2056; GFX9-NEXT: v_mov_b32_e32 v0, s6 2057; GFX9-NEXT: v_mov_b32_e32 v1, s7 2058; GFX9-NEXT: v_mov_b32_e32 v2, 0 2059; GFX9-NEXT: global_atomic_and_x2 v[0:1], v2, v[0:1], s[4:5] glc 2060; GFX9-NEXT: s_waitcnt vmcnt(0) 2061; GFX9-NEXT: buffer_wbinvl1_vol 2062; GFX9-NEXT: s_setpc_b64 s[30:31] 2063 %result = atomicrmw and ptr addrspace(1) %ptr, i64 %in seq_cst 2064 ret i64 %result 2065} 2066 2067define amdgpu_gfx i64 @global_atomic_and_i64_ret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) { 2068; SI-LABEL: global_atomic_and_i64_ret_offset_scalar: 2069; SI: ; %bb.0: 2070; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2071; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 2072; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill 2073; SI-NEXT: s_mov_b64 exec, s[34:35] 2074; SI-NEXT: s_waitcnt expcnt(0) 2075; SI-NEXT: v_writelane_b32 v2, s6, 0 2076; SI-NEXT: v_writelane_b32 v2, s7, 1 2077; SI-NEXT: v_mov_b32_e32 v0, s6 2078; SI-NEXT: v_mov_b32_e32 v1, s7 2079; SI-NEXT: s_mov_b32 s7, 0xf000 2080; SI-NEXT: s_mov_b32 s6, -1 2081; SI-NEXT: s_waitcnt vmcnt(0) 2082; SI-NEXT: buffer_atomic_and_x2 v[0:1], off, s[4:7], 0 offset:32 glc 2083; SI-NEXT: s_waitcnt vmcnt(0) 2084; SI-NEXT: buffer_wbinvl1 2085; SI-NEXT: v_readlane_b32 s7, v2, 1 2086; SI-NEXT: v_readlane_b32 s6, v2, 0 2087; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 2088; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload 2089; SI-NEXT: s_mov_b64 exec, s[34:35] 2090; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 2091; SI-NEXT: s_setpc_b64 s[30:31] 2092; 2093; VI-LABEL: global_atomic_and_i64_ret_offset_scalar: 2094; VI: ; %bb.0: 2095; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2096; VI-NEXT: s_add_u32 s34, s4, 32 2097; VI-NEXT: s_addc_u32 s35, s5, 0 2098; VI-NEXT: v_mov_b32_e32 v2, s34 2099; VI-NEXT: v_mov_b32_e32 v0, s6 2100; VI-NEXT: v_mov_b32_e32 v1, s7 2101; VI-NEXT: v_mov_b32_e32 v3, s35 2102; VI-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc 2103; VI-NEXT: s_waitcnt vmcnt(0) 2104; VI-NEXT: buffer_wbinvl1_vol 2105; VI-NEXT: s_setpc_b64 s[30:31] 2106; 2107; GFX9-LABEL: global_atomic_and_i64_ret_offset_scalar: 2108; GFX9: ; %bb.0: 2109; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2110; GFX9-NEXT: v_mov_b32_e32 v0, s6 2111; GFX9-NEXT: v_mov_b32_e32 v1, s7 2112; GFX9-NEXT: v_mov_b32_e32 v2, 0 2113; GFX9-NEXT: global_atomic_and_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc 2114; GFX9-NEXT: s_waitcnt vmcnt(0) 2115; GFX9-NEXT: buffer_wbinvl1_vol 2116; GFX9-NEXT: s_setpc_b64 s[30:31] 2117 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 2118 %result = atomicrmw and ptr addrspace(1) %gep, i64 %in seq_cst 2119 ret i64 %result 2120} 2121 2122define void @global_atomic_and_i64_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) { 2123; SI-LABEL: global_atomic_and_i64_noret_offset__amdgpu_no_remote_memory: 2124; SI: ; %bb.0: 2125; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2126; SI-NEXT: s_mov_b32 s6, 0 2127; SI-NEXT: s_mov_b32 s7, 0xf000 2128; SI-NEXT: s_mov_b32 s4, s6 2129; SI-NEXT: s_mov_b32 s5, s6 2130; SI-NEXT: buffer_atomic_and_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 2131; SI-NEXT: s_waitcnt vmcnt(0) 2132; SI-NEXT: buffer_wbinvl1 2133; SI-NEXT: s_waitcnt expcnt(0) 2134; SI-NEXT: s_setpc_b64 s[30:31] 2135; 2136; VI-LABEL: global_atomic_and_i64_noret_offset__amdgpu_no_remote_memory: 2137; VI: ; %bb.0: 2138; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2139; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 2140; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2141; VI-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] 2142; VI-NEXT: s_waitcnt vmcnt(0) 2143; VI-NEXT: buffer_wbinvl1_vol 2144; VI-NEXT: s_setpc_b64 s[30:31] 2145; 2146; GFX9-LABEL: global_atomic_and_i64_noret_offset__amdgpu_no_remote_memory: 2147; GFX9: ; %bb.0: 2148; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2149; GFX9-NEXT: global_atomic_and_x2 v[0:1], v[2:3], off offset:32 2150; GFX9-NEXT: s_waitcnt vmcnt(0) 2151; GFX9-NEXT: buffer_wbinvl1_vol 2152; GFX9-NEXT: s_setpc_b64 s[30:31] 2153 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 2154 %tmp0 = atomicrmw and ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 2155 ret void 2156} 2157 2158define i64 @global_atomic_and_i64_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) { 2159; SI-LABEL: global_atomic_and_i64_ret_offset__amdgpu_no_remote_memory: 2160; SI: ; %bb.0: 2161; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2162; SI-NEXT: s_mov_b32 s6, 0 2163; SI-NEXT: s_mov_b32 s7, 0xf000 2164; SI-NEXT: s_mov_b32 s4, s6 2165; SI-NEXT: s_mov_b32 s5, s6 2166; SI-NEXT: buffer_atomic_and_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc 2167; SI-NEXT: s_waitcnt vmcnt(0) 2168; SI-NEXT: buffer_wbinvl1 2169; SI-NEXT: v_mov_b32_e32 v0, v2 2170; SI-NEXT: v_mov_b32_e32 v1, v3 2171; SI-NEXT: s_waitcnt expcnt(0) 2172; SI-NEXT: s_setpc_b64 s[30:31] 2173; 2174; VI-LABEL: global_atomic_and_i64_ret_offset__amdgpu_no_remote_memory: 2175; VI: ; %bb.0: 2176; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2177; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 2178; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2179; VI-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc 2180; VI-NEXT: s_waitcnt vmcnt(0) 2181; VI-NEXT: buffer_wbinvl1_vol 2182; VI-NEXT: s_setpc_b64 s[30:31] 2183; 2184; GFX9-LABEL: global_atomic_and_i64_ret_offset__amdgpu_no_remote_memory: 2185; GFX9: ; %bb.0: 2186; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2187; GFX9-NEXT: global_atomic_and_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc 2188; GFX9-NEXT: s_waitcnt vmcnt(0) 2189; GFX9-NEXT: buffer_wbinvl1_vol 2190; GFX9-NEXT: s_setpc_b64 s[30:31] 2191 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 2192 %result = atomicrmw and ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 2193 ret i64 %result 2194} 2195 2196; --------------------------------------------------------------------- 2197; atomicrmw nand 2198; --------------------------------------------------------------------- 2199 2200define void @global_atomic_nand_i64_noret(ptr addrspace(1) %ptr, i64 %in) { 2201; SI-LABEL: global_atomic_nand_i64_noret: 2202; SI: ; %bb.0: 2203; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2204; SI-NEXT: s_mov_b32 s6, 0 2205; SI-NEXT: s_mov_b32 s7, 0xf000 2206; SI-NEXT: s_mov_b32 s4, s6 2207; SI-NEXT: s_mov_b32 s5, s6 2208; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 2209; SI-NEXT: s_mov_b64 s[8:9], 0 2210; SI-NEXT: .LBB50_1: ; %atomicrmw.start 2211; SI-NEXT: ; =>This Inner Loop Header: Depth=1 2212; SI-NEXT: s_waitcnt vmcnt(0) 2213; SI-NEXT: v_and_b32_e32 v4, v7, v3 2214; SI-NEXT: s_waitcnt expcnt(0) 2215; SI-NEXT: v_and_b32_e32 v8, v6, v2 2216; SI-NEXT: v_not_b32_e32 v5, v4 2217; SI-NEXT: v_not_b32_e32 v4, v8 2218; SI-NEXT: v_mov_b32_e32 v11, v7 2219; SI-NEXT: v_mov_b32_e32 v10, v6 2220; SI-NEXT: v_mov_b32_e32 v9, v5 2221; SI-NEXT: v_mov_b32_e32 v8, v4 2222; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc 2223; SI-NEXT: s_waitcnt vmcnt(0) 2224; SI-NEXT: buffer_wbinvl1 2225; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] 2226; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 2227; SI-NEXT: v_mov_b32_e32 v6, v8 2228; SI-NEXT: v_mov_b32_e32 v7, v9 2229; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] 2230; SI-NEXT: s_cbranch_execnz .LBB50_1 2231; SI-NEXT: ; %bb.2: ; %atomicrmw.end 2232; SI-NEXT: s_or_b64 exec, exec, s[8:9] 2233; SI-NEXT: s_waitcnt expcnt(0) 2234; SI-NEXT: s_setpc_b64 s[30:31] 2235; 2236; VI-LABEL: global_atomic_nand_i64_noret: 2237; VI: ; %bb.0: 2238; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2239; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] 2240; VI-NEXT: s_mov_b64 s[4:5], 0 2241; VI-NEXT: .LBB50_1: ; %atomicrmw.start 2242; VI-NEXT: ; =>This Inner Loop Header: Depth=1 2243; VI-NEXT: s_waitcnt vmcnt(0) 2244; VI-NEXT: v_and_b32_e32 v4, v7, v3 2245; VI-NEXT: v_and_b32_e32 v8, v6, v2 2246; VI-NEXT: v_not_b32_e32 v5, v4 2247; VI-NEXT: v_not_b32_e32 v4, v8 2248; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc 2249; VI-NEXT: s_waitcnt vmcnt(0) 2250; VI-NEXT: buffer_wbinvl1_vol 2251; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 2252; VI-NEXT: v_mov_b32_e32 v7, v5 2253; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2254; VI-NEXT: v_mov_b32_e32 v6, v4 2255; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] 2256; VI-NEXT: s_cbranch_execnz .LBB50_1 2257; VI-NEXT: ; %bb.2: ; %atomicrmw.end 2258; VI-NEXT: s_or_b64 exec, exec, s[4:5] 2259; VI-NEXT: s_setpc_b64 s[30:31] 2260; 2261; GFX9-LABEL: global_atomic_nand_i64_noret: 2262; GFX9: ; %bb.0: 2263; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2264; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off 2265; GFX9-NEXT: s_mov_b64 s[4:5], 0 2266; GFX9-NEXT: .LBB50_1: ; %atomicrmw.start 2267; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 2268; GFX9-NEXT: s_waitcnt vmcnt(0) 2269; GFX9-NEXT: v_and_b32_e32 v4, v7, v3 2270; GFX9-NEXT: v_and_b32_e32 v8, v6, v2 2271; GFX9-NEXT: v_not_b32_e32 v5, v4 2272; GFX9-NEXT: v_not_b32_e32 v4, v8 2273; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc 2274; GFX9-NEXT: s_waitcnt vmcnt(0) 2275; GFX9-NEXT: buffer_wbinvl1_vol 2276; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 2277; GFX9-NEXT: v_mov_b32_e32 v7, v5 2278; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2279; GFX9-NEXT: v_mov_b32_e32 v6, v4 2280; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 2281; GFX9-NEXT: s_cbranch_execnz .LBB50_1 2282; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 2283; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 2284; GFX9-NEXT: s_setpc_b64 s[30:31] 2285 %tmp0 = atomicrmw nand ptr addrspace(1) %ptr, i64 %in seq_cst 2286 ret void 2287} 2288 2289define void @global_atomic_nand_i64_noret_offset(ptr addrspace(1) %out, i64 %in) { 2290; SI-LABEL: global_atomic_nand_i64_noret_offset: 2291; SI: ; %bb.0: 2292; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2293; SI-NEXT: s_mov_b32 s6, 0 2294; SI-NEXT: s_mov_b32 s7, 0xf000 2295; SI-NEXT: s_mov_b32 s4, s6 2296; SI-NEXT: s_mov_b32 s5, s6 2297; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 2298; SI-NEXT: s_mov_b64 s[8:9], 0 2299; SI-NEXT: .LBB51_1: ; %atomicrmw.start 2300; SI-NEXT: ; =>This Inner Loop Header: Depth=1 2301; SI-NEXT: s_waitcnt vmcnt(0) 2302; SI-NEXT: v_and_b32_e32 v4, v7, v3 2303; SI-NEXT: s_waitcnt expcnt(0) 2304; SI-NEXT: v_and_b32_e32 v8, v6, v2 2305; SI-NEXT: v_not_b32_e32 v5, v4 2306; SI-NEXT: v_not_b32_e32 v4, v8 2307; SI-NEXT: v_mov_b32_e32 v11, v7 2308; SI-NEXT: v_mov_b32_e32 v10, v6 2309; SI-NEXT: v_mov_b32_e32 v9, v5 2310; SI-NEXT: v_mov_b32_e32 v8, v4 2311; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc 2312; SI-NEXT: s_waitcnt vmcnt(0) 2313; SI-NEXT: buffer_wbinvl1 2314; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] 2315; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 2316; SI-NEXT: v_mov_b32_e32 v6, v8 2317; SI-NEXT: v_mov_b32_e32 v7, v9 2318; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] 2319; SI-NEXT: s_cbranch_execnz .LBB51_1 2320; SI-NEXT: ; %bb.2: ; %atomicrmw.end 2321; SI-NEXT: s_or_b64 exec, exec, s[8:9] 2322; SI-NEXT: s_waitcnt expcnt(0) 2323; SI-NEXT: s_setpc_b64 s[30:31] 2324; 2325; VI-LABEL: global_atomic_nand_i64_noret_offset: 2326; VI: ; %bb.0: 2327; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2328; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 2329; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2330; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] 2331; VI-NEXT: s_mov_b64 s[4:5], 0 2332; VI-NEXT: .LBB51_1: ; %atomicrmw.start 2333; VI-NEXT: ; =>This Inner Loop Header: Depth=1 2334; VI-NEXT: s_waitcnt vmcnt(0) 2335; VI-NEXT: v_and_b32_e32 v4, v7, v3 2336; VI-NEXT: v_and_b32_e32 v8, v6, v2 2337; VI-NEXT: v_not_b32_e32 v5, v4 2338; VI-NEXT: v_not_b32_e32 v4, v8 2339; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc 2340; VI-NEXT: s_waitcnt vmcnt(0) 2341; VI-NEXT: buffer_wbinvl1_vol 2342; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 2343; VI-NEXT: v_mov_b32_e32 v7, v5 2344; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2345; VI-NEXT: v_mov_b32_e32 v6, v4 2346; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] 2347; VI-NEXT: s_cbranch_execnz .LBB51_1 2348; VI-NEXT: ; %bb.2: ; %atomicrmw.end 2349; VI-NEXT: s_or_b64 exec, exec, s[4:5] 2350; VI-NEXT: s_setpc_b64 s[30:31] 2351; 2352; GFX9-LABEL: global_atomic_nand_i64_noret_offset: 2353; GFX9: ; %bb.0: 2354; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2355; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 2356; GFX9-NEXT: s_mov_b64 s[4:5], 0 2357; GFX9-NEXT: .LBB51_1: ; %atomicrmw.start 2358; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 2359; GFX9-NEXT: s_waitcnt vmcnt(0) 2360; GFX9-NEXT: v_and_b32_e32 v4, v7, v3 2361; GFX9-NEXT: v_and_b32_e32 v8, v6, v2 2362; GFX9-NEXT: v_not_b32_e32 v5, v4 2363; GFX9-NEXT: v_not_b32_e32 v4, v8 2364; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc 2365; GFX9-NEXT: s_waitcnt vmcnt(0) 2366; GFX9-NEXT: buffer_wbinvl1_vol 2367; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 2368; GFX9-NEXT: v_mov_b32_e32 v7, v5 2369; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2370; GFX9-NEXT: v_mov_b32_e32 v6, v4 2371; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 2372; GFX9-NEXT: s_cbranch_execnz .LBB51_1 2373; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 2374; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 2375; GFX9-NEXT: s_setpc_b64 s[30:31] 2376 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 2377 %tmp0 = atomicrmw nand ptr addrspace(1) %gep, i64 %in seq_cst 2378 ret void 2379} 2380 2381define i64 @global_atomic_nand_i64_ret(ptr addrspace(1) %ptr, i64 %in) { 2382; SI-LABEL: global_atomic_nand_i64_ret: 2383; SI: ; %bb.0: 2384; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2385; SI-NEXT: v_mov_b32_e32 v6, v3 2386; SI-NEXT: v_mov_b32_e32 v7, v2 2387; SI-NEXT: v_mov_b32_e32 v5, v1 2388; SI-NEXT: v_mov_b32_e32 v4, v0 2389; SI-NEXT: s_mov_b32 s6, 0 2390; SI-NEXT: s_mov_b32 s7, 0xf000 2391; SI-NEXT: s_mov_b32 s4, s6 2392; SI-NEXT: s_mov_b32 s5, s6 2393; SI-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 2394; SI-NEXT: s_mov_b64 s[8:9], 0 2395; SI-NEXT: .LBB52_1: ; %atomicrmw.start 2396; SI-NEXT: ; =>This Inner Loop Header: Depth=1 2397; SI-NEXT: s_waitcnt vmcnt(0) 2398; SI-NEXT: v_mov_b32_e32 v11, v1 2399; SI-NEXT: v_mov_b32_e32 v10, v0 2400; SI-NEXT: s_waitcnt expcnt(0) 2401; SI-NEXT: v_and_b32_e32 v0, v11, v6 2402; SI-NEXT: v_and_b32_e32 v1, v10, v7 2403; SI-NEXT: v_not_b32_e32 v9, v0 2404; SI-NEXT: v_not_b32_e32 v8, v1 2405; SI-NEXT: v_mov_b32_e32 v0, v8 2406; SI-NEXT: v_mov_b32_e32 v1, v9 2407; SI-NEXT: v_mov_b32_e32 v2, v10 2408; SI-NEXT: v_mov_b32_e32 v3, v11 2409; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc 2410; SI-NEXT: s_waitcnt vmcnt(0) 2411; SI-NEXT: buffer_wbinvl1 2412; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] 2413; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 2414; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] 2415; SI-NEXT: s_cbranch_execnz .LBB52_1 2416; SI-NEXT: ; %bb.2: ; %atomicrmw.end 2417; SI-NEXT: s_or_b64 exec, exec, s[8:9] 2418; SI-NEXT: s_waitcnt expcnt(0) 2419; SI-NEXT: s_setpc_b64 s[30:31] 2420; 2421; VI-LABEL: global_atomic_nand_i64_ret: 2422; VI: ; %bb.0: 2423; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2424; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] 2425; VI-NEXT: s_mov_b64 s[4:5], 0 2426; VI-NEXT: .LBB52_1: ; %atomicrmw.start 2427; VI-NEXT: ; =>This Inner Loop Header: Depth=1 2428; VI-NEXT: s_waitcnt vmcnt(0) 2429; VI-NEXT: v_mov_b32_e32 v7, v5 2430; VI-NEXT: v_mov_b32_e32 v6, v4 2431; VI-NEXT: v_and_b32_e32 v4, v7, v3 2432; VI-NEXT: v_and_b32_e32 v8, v6, v2 2433; VI-NEXT: v_not_b32_e32 v5, v4 2434; VI-NEXT: v_not_b32_e32 v4, v8 2435; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc 2436; VI-NEXT: s_waitcnt vmcnt(0) 2437; VI-NEXT: buffer_wbinvl1_vol 2438; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 2439; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2440; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] 2441; VI-NEXT: s_cbranch_execnz .LBB52_1 2442; VI-NEXT: ; %bb.2: ; %atomicrmw.end 2443; VI-NEXT: s_or_b64 exec, exec, s[4:5] 2444; VI-NEXT: v_mov_b32_e32 v0, v4 2445; VI-NEXT: v_mov_b32_e32 v1, v5 2446; VI-NEXT: s_setpc_b64 s[30:31] 2447; 2448; GFX9-LABEL: global_atomic_nand_i64_ret: 2449; GFX9: ; %bb.0: 2450; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2451; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off 2452; GFX9-NEXT: s_mov_b64 s[4:5], 0 2453; GFX9-NEXT: .LBB52_1: ; %atomicrmw.start 2454; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 2455; GFX9-NEXT: s_waitcnt vmcnt(0) 2456; GFX9-NEXT: v_mov_b32_e32 v7, v5 2457; GFX9-NEXT: v_mov_b32_e32 v6, v4 2458; GFX9-NEXT: v_and_b32_e32 v4, v7, v3 2459; GFX9-NEXT: v_and_b32_e32 v8, v6, v2 2460; GFX9-NEXT: v_not_b32_e32 v5, v4 2461; GFX9-NEXT: v_not_b32_e32 v4, v8 2462; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc 2463; GFX9-NEXT: s_waitcnt vmcnt(0) 2464; GFX9-NEXT: buffer_wbinvl1_vol 2465; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 2466; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2467; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 2468; GFX9-NEXT: s_cbranch_execnz .LBB52_1 2469; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 2470; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 2471; GFX9-NEXT: v_mov_b32_e32 v0, v4 2472; GFX9-NEXT: v_mov_b32_e32 v1, v5 2473; GFX9-NEXT: s_setpc_b64 s[30:31] 2474 %result = atomicrmw nand ptr addrspace(1) %ptr, i64 %in seq_cst 2475 ret i64 %result 2476} 2477 2478define i64 @global_atomic_nand_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { 2479; SI-LABEL: global_atomic_nand_i64_ret_offset: 2480; SI: ; %bb.0: 2481; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2482; SI-NEXT: v_mov_b32_e32 v6, v3 2483; SI-NEXT: v_mov_b32_e32 v7, v2 2484; SI-NEXT: v_mov_b32_e32 v5, v1 2485; SI-NEXT: v_mov_b32_e32 v4, v0 2486; SI-NEXT: s_mov_b32 s6, 0 2487; SI-NEXT: s_mov_b32 s7, 0xf000 2488; SI-NEXT: s_mov_b32 s4, s6 2489; SI-NEXT: s_mov_b32 s5, s6 2490; SI-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 offset:32 2491; SI-NEXT: s_mov_b64 s[8:9], 0 2492; SI-NEXT: .LBB53_1: ; %atomicrmw.start 2493; SI-NEXT: ; =>This Inner Loop Header: Depth=1 2494; SI-NEXT: s_waitcnt vmcnt(0) 2495; SI-NEXT: v_mov_b32_e32 v11, v1 2496; SI-NEXT: v_mov_b32_e32 v10, v0 2497; SI-NEXT: s_waitcnt expcnt(0) 2498; SI-NEXT: v_and_b32_e32 v0, v11, v6 2499; SI-NEXT: v_and_b32_e32 v1, v10, v7 2500; SI-NEXT: v_not_b32_e32 v9, v0 2501; SI-NEXT: v_not_b32_e32 v8, v1 2502; SI-NEXT: v_mov_b32_e32 v0, v8 2503; SI-NEXT: v_mov_b32_e32 v1, v9 2504; SI-NEXT: v_mov_b32_e32 v2, v10 2505; SI-NEXT: v_mov_b32_e32 v3, v11 2506; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 offset:32 glc 2507; SI-NEXT: s_waitcnt vmcnt(0) 2508; SI-NEXT: buffer_wbinvl1 2509; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] 2510; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 2511; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] 2512; SI-NEXT: s_cbranch_execnz .LBB53_1 2513; SI-NEXT: ; %bb.2: ; %atomicrmw.end 2514; SI-NEXT: s_or_b64 exec, exec, s[8:9] 2515; SI-NEXT: s_waitcnt expcnt(0) 2516; SI-NEXT: s_setpc_b64 s[30:31] 2517; 2518; VI-LABEL: global_atomic_nand_i64_ret_offset: 2519; VI: ; %bb.0: 2520; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2521; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0 2522; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc 2523; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] 2524; VI-NEXT: s_mov_b64 s[4:5], 0 2525; VI-NEXT: .LBB53_1: ; %atomicrmw.start 2526; VI-NEXT: ; =>This Inner Loop Header: Depth=1 2527; VI-NEXT: s_waitcnt vmcnt(0) 2528; VI-NEXT: v_mov_b32_e32 v9, v1 2529; VI-NEXT: v_mov_b32_e32 v8, v0 2530; VI-NEXT: v_and_b32_e32 v0, v9, v3 2531; VI-NEXT: v_and_b32_e32 v1, v8, v2 2532; VI-NEXT: v_not_b32_e32 v7, v0 2533; VI-NEXT: v_not_b32_e32 v6, v1 2534; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc 2535; VI-NEXT: s_waitcnt vmcnt(0) 2536; VI-NEXT: buffer_wbinvl1_vol 2537; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] 2538; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2539; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] 2540; VI-NEXT: s_cbranch_execnz .LBB53_1 2541; VI-NEXT: ; %bb.2: ; %atomicrmw.end 2542; VI-NEXT: s_or_b64 exec, exec, s[4:5] 2543; VI-NEXT: s_setpc_b64 s[30:31] 2544; 2545; GFX9-LABEL: global_atomic_nand_i64_ret_offset: 2546; GFX9: ; %bb.0: 2547; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2548; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 2549; GFX9-NEXT: s_mov_b64 s[4:5], 0 2550; GFX9-NEXT: .LBB53_1: ; %atomicrmw.start 2551; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 2552; GFX9-NEXT: s_waitcnt vmcnt(0) 2553; GFX9-NEXT: v_mov_b32_e32 v7, v5 2554; GFX9-NEXT: v_mov_b32_e32 v6, v4 2555; GFX9-NEXT: v_and_b32_e32 v4, v7, v3 2556; GFX9-NEXT: v_and_b32_e32 v8, v6, v2 2557; GFX9-NEXT: v_not_b32_e32 v5, v4 2558; GFX9-NEXT: v_not_b32_e32 v4, v8 2559; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc 2560; GFX9-NEXT: s_waitcnt vmcnt(0) 2561; GFX9-NEXT: buffer_wbinvl1_vol 2562; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 2563; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2564; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 2565; GFX9-NEXT: s_cbranch_execnz .LBB53_1 2566; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 2567; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 2568; GFX9-NEXT: v_mov_b32_e32 v0, v4 2569; GFX9-NEXT: v_mov_b32_e32 v1, v5 2570; GFX9-NEXT: s_setpc_b64 s[30:31] 2571 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 2572 %result = atomicrmw nand ptr addrspace(1) %gep, i64 %in seq_cst 2573 ret i64 %result 2574} 2575 2576define amdgpu_gfx void @global_atomic_nand_i64_noret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) { 2577; SI-LABEL: global_atomic_nand_i64_noret_scalar: 2578; SI: ; %bb.0: 2579; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2580; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 2581; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill 2582; SI-NEXT: s_mov_b64 exec, s[34:35] 2583; SI-NEXT: s_waitcnt expcnt(0) 2584; SI-NEXT: v_writelane_b32 v8, s6, 0 2585; SI-NEXT: v_writelane_b32 v8, s7, 1 2586; SI-NEXT: s_mov_b32 s34, s7 2587; SI-NEXT: s_mov_b32 s35, s6 2588; SI-NEXT: s_mov_b32 s7, 0xf000 2589; SI-NEXT: s_mov_b32 s6, -1 2590; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 2591; SI-NEXT: s_mov_b64 s[36:37], 0 2592; SI-NEXT: .LBB54_1: ; %atomicrmw.start 2593; SI-NEXT: ; =>This Inner Loop Header: Depth=1 2594; SI-NEXT: s_waitcnt vmcnt(0) 2595; SI-NEXT: v_and_b32_e32 v0, s34, v3 2596; SI-NEXT: s_waitcnt expcnt(0) 2597; SI-NEXT: v_and_b32_e32 v4, s35, v2 2598; SI-NEXT: v_not_b32_e32 v1, v0 2599; SI-NEXT: v_not_b32_e32 v0, v4 2600; SI-NEXT: v_mov_b32_e32 v7, v3 2601; SI-NEXT: v_mov_b32_e32 v6, v2 2602; SI-NEXT: v_mov_b32_e32 v5, v1 2603; SI-NEXT: v_mov_b32_e32 v4, v0 2604; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 glc 2605; SI-NEXT: s_waitcnt vmcnt(0) 2606; SI-NEXT: buffer_wbinvl1 2607; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] 2608; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] 2609; SI-NEXT: v_mov_b32_e32 v2, v4 2610; SI-NEXT: v_mov_b32_e32 v3, v5 2611; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] 2612; SI-NEXT: s_cbranch_execnz .LBB54_1 2613; SI-NEXT: ; %bb.2: ; %atomicrmw.end 2614; SI-NEXT: s_or_b64 exec, exec, s[36:37] 2615; SI-NEXT: v_readlane_b32 s7, v8, 1 2616; SI-NEXT: v_readlane_b32 s6, v8, 0 2617; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 2618; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload 2619; SI-NEXT: s_mov_b64 exec, s[34:35] 2620; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 2621; SI-NEXT: s_setpc_b64 s[30:31] 2622; 2623; VI-LABEL: global_atomic_nand_i64_noret_scalar: 2624; VI: ; %bb.0: 2625; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2626; VI-NEXT: v_mov_b32_e32 v0, s4 2627; VI-NEXT: v_mov_b32_e32 v1, s5 2628; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] 2629; VI-NEXT: v_mov_b32_e32 v4, s4 2630; VI-NEXT: s_mov_b64 s[34:35], 0 2631; VI-NEXT: v_mov_b32_e32 v5, s5 2632; VI-NEXT: .LBB54_1: ; %atomicrmw.start 2633; VI-NEXT: ; =>This Inner Loop Header: Depth=1 2634; VI-NEXT: s_waitcnt vmcnt(0) 2635; VI-NEXT: v_and_b32_e32 v0, s7, v3 2636; VI-NEXT: v_and_b32_e32 v6, s6, v2 2637; VI-NEXT: v_not_b32_e32 v1, v0 2638; VI-NEXT: v_not_b32_e32 v0, v6 2639; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc 2640; VI-NEXT: s_waitcnt vmcnt(0) 2641; VI-NEXT: buffer_wbinvl1_vol 2642; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] 2643; VI-NEXT: v_mov_b32_e32 v3, v1 2644; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 2645; VI-NEXT: v_mov_b32_e32 v2, v0 2646; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] 2647; VI-NEXT: s_cbranch_execnz .LBB54_1 2648; VI-NEXT: ; %bb.2: ; %atomicrmw.end 2649; VI-NEXT: s_or_b64 exec, exec, s[34:35] 2650; VI-NEXT: s_setpc_b64 s[30:31] 2651; 2652; GFX9-LABEL: global_atomic_nand_i64_noret_scalar: 2653; GFX9: ; %bb.0: 2654; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2655; GFX9-NEXT: v_mov_b32_e32 v4, 0 2656; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] 2657; GFX9-NEXT: s_mov_b64 s[34:35], 0 2658; GFX9-NEXT: .LBB54_1: ; %atomicrmw.start 2659; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 2660; GFX9-NEXT: s_waitcnt vmcnt(0) 2661; GFX9-NEXT: v_and_b32_e32 v0, s7, v3 2662; GFX9-NEXT: v_and_b32_e32 v5, s6, v2 2663; GFX9-NEXT: v_not_b32_e32 v1, v0 2664; GFX9-NEXT: v_not_b32_e32 v0, v5 2665; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc 2666; GFX9-NEXT: s_waitcnt vmcnt(0) 2667; GFX9-NEXT: buffer_wbinvl1_vol 2668; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] 2669; GFX9-NEXT: v_mov_b32_e32 v3, v1 2670; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 2671; GFX9-NEXT: v_mov_b32_e32 v2, v0 2672; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] 2673; GFX9-NEXT: s_cbranch_execnz .LBB54_1 2674; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 2675; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] 2676; GFX9-NEXT: s_setpc_b64 s[30:31] 2677 %tmp0 = atomicrmw nand ptr addrspace(1) %ptr, i64 %in seq_cst 2678 ret void 2679} 2680 2681define amdgpu_gfx void @global_atomic_nand_i64_noret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) { 2682; SI-LABEL: global_atomic_nand_i64_noret_offset_scalar: 2683; SI: ; %bb.0: 2684; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2685; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 2686; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill 2687; SI-NEXT: s_mov_b64 exec, s[34:35] 2688; SI-NEXT: s_waitcnt expcnt(0) 2689; SI-NEXT: v_writelane_b32 v8, s6, 0 2690; SI-NEXT: v_writelane_b32 v8, s7, 1 2691; SI-NEXT: s_mov_b32 s34, s7 2692; SI-NEXT: s_mov_b32 s35, s6 2693; SI-NEXT: s_mov_b32 s7, 0xf000 2694; SI-NEXT: s_mov_b32 s6, -1 2695; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32 2696; SI-NEXT: s_mov_b64 s[36:37], 0 2697; SI-NEXT: .LBB55_1: ; %atomicrmw.start 2698; SI-NEXT: ; =>This Inner Loop Header: Depth=1 2699; SI-NEXT: s_waitcnt vmcnt(0) 2700; SI-NEXT: v_and_b32_e32 v0, s34, v3 2701; SI-NEXT: s_waitcnt expcnt(0) 2702; SI-NEXT: v_and_b32_e32 v4, s35, v2 2703; SI-NEXT: v_not_b32_e32 v1, v0 2704; SI-NEXT: v_not_b32_e32 v0, v4 2705; SI-NEXT: v_mov_b32_e32 v7, v3 2706; SI-NEXT: v_mov_b32_e32 v6, v2 2707; SI-NEXT: v_mov_b32_e32 v5, v1 2708; SI-NEXT: v_mov_b32_e32 v4, v0 2709; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 offset:32 glc 2710; SI-NEXT: s_waitcnt vmcnt(0) 2711; SI-NEXT: buffer_wbinvl1 2712; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] 2713; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] 2714; SI-NEXT: v_mov_b32_e32 v2, v4 2715; SI-NEXT: v_mov_b32_e32 v3, v5 2716; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] 2717; SI-NEXT: s_cbranch_execnz .LBB55_1 2718; SI-NEXT: ; %bb.2: ; %atomicrmw.end 2719; SI-NEXT: s_or_b64 exec, exec, s[36:37] 2720; SI-NEXT: v_readlane_b32 s7, v8, 1 2721; SI-NEXT: v_readlane_b32 s6, v8, 0 2722; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 2723; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload 2724; SI-NEXT: s_mov_b64 exec, s[34:35] 2725; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 2726; SI-NEXT: s_setpc_b64 s[30:31] 2727; 2728; VI-LABEL: global_atomic_nand_i64_noret_offset_scalar: 2729; VI: ; %bb.0: 2730; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2731; VI-NEXT: s_add_u32 s34, s4, 32 2732; VI-NEXT: s_addc_u32 s35, s5, 0 2733; VI-NEXT: v_mov_b32_e32 v4, s34 2734; VI-NEXT: v_mov_b32_e32 v5, s35 2735; VI-NEXT: flat_load_dwordx2 v[2:3], v[4:5] 2736; VI-NEXT: s_mov_b64 s[34:35], 0 2737; VI-NEXT: .LBB55_1: ; %atomicrmw.start 2738; VI-NEXT: ; =>This Inner Loop Header: Depth=1 2739; VI-NEXT: s_waitcnt vmcnt(0) 2740; VI-NEXT: v_and_b32_e32 v0, s7, v3 2741; VI-NEXT: v_and_b32_e32 v6, s6, v2 2742; VI-NEXT: v_not_b32_e32 v1, v0 2743; VI-NEXT: v_not_b32_e32 v0, v6 2744; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc 2745; VI-NEXT: s_waitcnt vmcnt(0) 2746; VI-NEXT: buffer_wbinvl1_vol 2747; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] 2748; VI-NEXT: v_mov_b32_e32 v3, v1 2749; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 2750; VI-NEXT: v_mov_b32_e32 v2, v0 2751; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] 2752; VI-NEXT: s_cbranch_execnz .LBB55_1 2753; VI-NEXT: ; %bb.2: ; %atomicrmw.end 2754; VI-NEXT: s_or_b64 exec, exec, s[34:35] 2755; VI-NEXT: s_setpc_b64 s[30:31] 2756; 2757; GFX9-LABEL: global_atomic_nand_i64_noret_offset_scalar: 2758; GFX9: ; %bb.0: 2759; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2760; GFX9-NEXT: v_mov_b32_e32 v4, 0 2761; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32 2762; GFX9-NEXT: s_mov_b64 s[34:35], 0 2763; GFX9-NEXT: .LBB55_1: ; %atomicrmw.start 2764; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 2765; GFX9-NEXT: s_waitcnt vmcnt(0) 2766; GFX9-NEXT: v_and_b32_e32 v0, s7, v3 2767; GFX9-NEXT: v_and_b32_e32 v5, s6, v2 2768; GFX9-NEXT: v_not_b32_e32 v1, v0 2769; GFX9-NEXT: v_not_b32_e32 v0, v5 2770; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc 2771; GFX9-NEXT: s_waitcnt vmcnt(0) 2772; GFX9-NEXT: buffer_wbinvl1_vol 2773; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] 2774; GFX9-NEXT: v_mov_b32_e32 v3, v1 2775; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 2776; GFX9-NEXT: v_mov_b32_e32 v2, v0 2777; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] 2778; GFX9-NEXT: s_cbranch_execnz .LBB55_1 2779; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 2780; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] 2781; GFX9-NEXT: s_setpc_b64 s[30:31] 2782 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 2783 %tmp0 = atomicrmw nand ptr addrspace(1) %gep, i64 %in seq_cst 2784 ret void 2785} 2786 2787define amdgpu_gfx i64 @global_atomic_nand_i64_ret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) { 2788; SI-LABEL: global_atomic_nand_i64_ret_scalar: 2789; SI: ; %bb.0: 2790; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2791; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 2792; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill 2793; SI-NEXT: s_mov_b64 exec, s[34:35] 2794; SI-NEXT: s_waitcnt expcnt(0) 2795; SI-NEXT: v_writelane_b32 v6, s6, 0 2796; SI-NEXT: v_writelane_b32 v6, s7, 1 2797; SI-NEXT: s_mov_b32 s34, s7 2798; SI-NEXT: s_mov_b32 s35, s6 2799; SI-NEXT: s_mov_b32 s7, 0xf000 2800; SI-NEXT: s_mov_b32 s6, -1 2801; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 2802; SI-NEXT: s_mov_b64 s[36:37], 0 2803; SI-NEXT: .LBB56_1: ; %atomicrmw.start 2804; SI-NEXT: ; =>This Inner Loop Header: Depth=1 2805; SI-NEXT: s_waitcnt vmcnt(0) 2806; SI-NEXT: v_mov_b32_e32 v5, v1 2807; SI-NEXT: v_mov_b32_e32 v4, v0 2808; SI-NEXT: s_waitcnt expcnt(0) 2809; SI-NEXT: v_and_b32_e32 v0, s34, v5 2810; SI-NEXT: v_and_b32_e32 v1, s35, v4 2811; SI-NEXT: v_not_b32_e32 v3, v0 2812; SI-NEXT: v_not_b32_e32 v2, v1 2813; SI-NEXT: v_mov_b32_e32 v0, v2 2814; SI-NEXT: v_mov_b32_e32 v1, v3 2815; SI-NEXT: v_mov_b32_e32 v2, v4 2816; SI-NEXT: v_mov_b32_e32 v3, v5 2817; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc 2818; SI-NEXT: s_waitcnt vmcnt(0) 2819; SI-NEXT: buffer_wbinvl1 2820; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] 2821; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] 2822; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] 2823; SI-NEXT: s_cbranch_execnz .LBB56_1 2824; SI-NEXT: ; %bb.2: ; %atomicrmw.end 2825; SI-NEXT: s_or_b64 exec, exec, s[36:37] 2826; SI-NEXT: v_readlane_b32 s7, v6, 1 2827; SI-NEXT: v_readlane_b32 s6, v6, 0 2828; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 2829; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload 2830; SI-NEXT: s_mov_b64 exec, s[34:35] 2831; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 2832; SI-NEXT: s_setpc_b64 s[30:31] 2833; 2834; VI-LABEL: global_atomic_nand_i64_ret_scalar: 2835; VI: ; %bb.0: 2836; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2837; VI-NEXT: v_mov_b32_e32 v0, s4 2838; VI-NEXT: v_mov_b32_e32 v1, s5 2839; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 2840; VI-NEXT: v_mov_b32_e32 v2, s4 2841; VI-NEXT: s_mov_b64 s[34:35], 0 2842; VI-NEXT: v_mov_b32_e32 v3, s5 2843; VI-NEXT: .LBB56_1: ; %atomicrmw.start 2844; VI-NEXT: ; =>This Inner Loop Header: Depth=1 2845; VI-NEXT: s_waitcnt vmcnt(0) 2846; VI-NEXT: v_mov_b32_e32 v7, v1 2847; VI-NEXT: v_mov_b32_e32 v6, v0 2848; VI-NEXT: v_and_b32_e32 v0, s7, v7 2849; VI-NEXT: v_and_b32_e32 v1, s6, v6 2850; VI-NEXT: v_not_b32_e32 v5, v0 2851; VI-NEXT: v_not_b32_e32 v4, v1 2852; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc 2853; VI-NEXT: s_waitcnt vmcnt(0) 2854; VI-NEXT: buffer_wbinvl1_vol 2855; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] 2856; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 2857; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] 2858; VI-NEXT: s_cbranch_execnz .LBB56_1 2859; VI-NEXT: ; %bb.2: ; %atomicrmw.end 2860; VI-NEXT: s_or_b64 exec, exec, s[34:35] 2861; VI-NEXT: s_setpc_b64 s[30:31] 2862; 2863; GFX9-LABEL: global_atomic_nand_i64_ret_scalar: 2864; GFX9: ; %bb.0: 2865; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2866; GFX9-NEXT: v_mov_b32_e32 v2, 0 2867; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] 2868; GFX9-NEXT: s_mov_b64 s[34:35], 0 2869; GFX9-NEXT: .LBB56_1: ; %atomicrmw.start 2870; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 2871; GFX9-NEXT: s_waitcnt vmcnt(0) 2872; GFX9-NEXT: v_mov_b32_e32 v6, v1 2873; GFX9-NEXT: v_mov_b32_e32 v5, v0 2874; GFX9-NEXT: v_and_b32_e32 v0, s7, v6 2875; GFX9-NEXT: v_and_b32_e32 v1, s6, v5 2876; GFX9-NEXT: v_not_b32_e32 v4, v0 2877; GFX9-NEXT: v_not_b32_e32 v3, v1 2878; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] glc 2879; GFX9-NEXT: s_waitcnt vmcnt(0) 2880; GFX9-NEXT: buffer_wbinvl1_vol 2881; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] 2882; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 2883; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] 2884; GFX9-NEXT: s_cbranch_execnz .LBB56_1 2885; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 2886; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] 2887; GFX9-NEXT: s_setpc_b64 s[30:31] 2888 %result = atomicrmw nand ptr addrspace(1) %ptr, i64 %in seq_cst 2889 ret i64 %result 2890} 2891 2892define amdgpu_gfx i64 @global_atomic_nand_i64_ret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) { 2893; SI-LABEL: global_atomic_nand_i64_ret_offset_scalar: 2894; SI: ; %bb.0: 2895; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2896; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 2897; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill 2898; SI-NEXT: s_mov_b64 exec, s[34:35] 2899; SI-NEXT: s_waitcnt expcnt(0) 2900; SI-NEXT: v_writelane_b32 v6, s6, 0 2901; SI-NEXT: v_writelane_b32 v6, s7, 1 2902; SI-NEXT: s_mov_b32 s34, s7 2903; SI-NEXT: s_mov_b32 s35, s6 2904; SI-NEXT: s_mov_b32 s7, 0xf000 2905; SI-NEXT: s_mov_b32 s6, -1 2906; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 2907; SI-NEXT: s_mov_b64 s[36:37], 0 2908; SI-NEXT: .LBB57_1: ; %atomicrmw.start 2909; SI-NEXT: ; =>This Inner Loop Header: Depth=1 2910; SI-NEXT: s_waitcnt vmcnt(0) 2911; SI-NEXT: v_mov_b32_e32 v5, v1 2912; SI-NEXT: v_mov_b32_e32 v4, v0 2913; SI-NEXT: s_waitcnt expcnt(0) 2914; SI-NEXT: v_and_b32_e32 v0, s34, v5 2915; SI-NEXT: v_and_b32_e32 v1, s35, v4 2916; SI-NEXT: v_not_b32_e32 v3, v0 2917; SI-NEXT: v_not_b32_e32 v2, v1 2918; SI-NEXT: v_mov_b32_e32 v0, v2 2919; SI-NEXT: v_mov_b32_e32 v1, v3 2920; SI-NEXT: v_mov_b32_e32 v2, v4 2921; SI-NEXT: v_mov_b32_e32 v3, v5 2922; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc 2923; SI-NEXT: s_waitcnt vmcnt(0) 2924; SI-NEXT: buffer_wbinvl1 2925; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] 2926; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] 2927; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] 2928; SI-NEXT: s_cbranch_execnz .LBB57_1 2929; SI-NEXT: ; %bb.2: ; %atomicrmw.end 2930; SI-NEXT: s_or_b64 exec, exec, s[36:37] 2931; SI-NEXT: v_readlane_b32 s7, v6, 1 2932; SI-NEXT: v_readlane_b32 s6, v6, 0 2933; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 2934; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload 2935; SI-NEXT: s_mov_b64 exec, s[34:35] 2936; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 2937; SI-NEXT: s_setpc_b64 s[30:31] 2938; 2939; VI-LABEL: global_atomic_nand_i64_ret_offset_scalar: 2940; VI: ; %bb.0: 2941; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2942; VI-NEXT: s_add_u32 s34, s4, 32 2943; VI-NEXT: s_addc_u32 s35, s5, 0 2944; VI-NEXT: v_mov_b32_e32 v2, s34 2945; VI-NEXT: v_mov_b32_e32 v3, s35 2946; VI-NEXT: flat_load_dwordx2 v[0:1], v[2:3] 2947; VI-NEXT: s_mov_b64 s[34:35], 0 2948; VI-NEXT: .LBB57_1: ; %atomicrmw.start 2949; VI-NEXT: ; =>This Inner Loop Header: Depth=1 2950; VI-NEXT: s_waitcnt vmcnt(0) 2951; VI-NEXT: v_mov_b32_e32 v7, v1 2952; VI-NEXT: v_mov_b32_e32 v6, v0 2953; VI-NEXT: v_and_b32_e32 v0, s7, v7 2954; VI-NEXT: v_and_b32_e32 v1, s6, v6 2955; VI-NEXT: v_not_b32_e32 v5, v0 2956; VI-NEXT: v_not_b32_e32 v4, v1 2957; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc 2958; VI-NEXT: s_waitcnt vmcnt(0) 2959; VI-NEXT: buffer_wbinvl1_vol 2960; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] 2961; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 2962; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] 2963; VI-NEXT: s_cbranch_execnz .LBB57_1 2964; VI-NEXT: ; %bb.2: ; %atomicrmw.end 2965; VI-NEXT: s_or_b64 exec, exec, s[34:35] 2966; VI-NEXT: s_setpc_b64 s[30:31] 2967; 2968; GFX9-LABEL: global_atomic_nand_i64_ret_offset_scalar: 2969; GFX9: ; %bb.0: 2970; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2971; GFX9-NEXT: v_mov_b32_e32 v2, 0 2972; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32 2973; GFX9-NEXT: s_mov_b64 s[34:35], 0 2974; GFX9-NEXT: .LBB57_1: ; %atomicrmw.start 2975; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 2976; GFX9-NEXT: s_waitcnt vmcnt(0) 2977; GFX9-NEXT: v_mov_b32_e32 v6, v1 2978; GFX9-NEXT: v_mov_b32_e32 v5, v0 2979; GFX9-NEXT: v_and_b32_e32 v0, s7, v6 2980; GFX9-NEXT: v_and_b32_e32 v1, s6, v5 2981; GFX9-NEXT: v_not_b32_e32 v4, v0 2982; GFX9-NEXT: v_not_b32_e32 v3, v1 2983; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] offset:32 glc 2984; GFX9-NEXT: s_waitcnt vmcnt(0) 2985; GFX9-NEXT: buffer_wbinvl1_vol 2986; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] 2987; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 2988; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] 2989; GFX9-NEXT: s_cbranch_execnz .LBB57_1 2990; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 2991; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] 2992; GFX9-NEXT: s_setpc_b64 s[30:31] 2993 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 2994 %result = atomicrmw nand ptr addrspace(1) %gep, i64 %in seq_cst 2995 ret i64 %result 2996} 2997 2998define void @global_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) { 2999; SI-LABEL: global_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory: 3000; SI: ; %bb.0: 3001; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3002; SI-NEXT: s_mov_b32 s6, 0 3003; SI-NEXT: s_mov_b32 s7, 0xf000 3004; SI-NEXT: s_mov_b32 s4, s6 3005; SI-NEXT: s_mov_b32 s5, s6 3006; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 3007; SI-NEXT: s_mov_b64 s[8:9], 0 3008; SI-NEXT: .LBB58_1: ; %atomicrmw.start 3009; SI-NEXT: ; =>This Inner Loop Header: Depth=1 3010; SI-NEXT: s_waitcnt vmcnt(0) 3011; SI-NEXT: v_and_b32_e32 v4, v7, v3 3012; SI-NEXT: s_waitcnt expcnt(0) 3013; SI-NEXT: v_and_b32_e32 v8, v6, v2 3014; SI-NEXT: v_not_b32_e32 v5, v4 3015; SI-NEXT: v_not_b32_e32 v4, v8 3016; SI-NEXT: v_mov_b32_e32 v11, v7 3017; SI-NEXT: v_mov_b32_e32 v10, v6 3018; SI-NEXT: v_mov_b32_e32 v9, v5 3019; SI-NEXT: v_mov_b32_e32 v8, v4 3020; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc 3021; SI-NEXT: s_waitcnt vmcnt(0) 3022; SI-NEXT: buffer_wbinvl1 3023; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] 3024; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 3025; SI-NEXT: v_mov_b32_e32 v6, v8 3026; SI-NEXT: v_mov_b32_e32 v7, v9 3027; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] 3028; SI-NEXT: s_cbranch_execnz .LBB58_1 3029; SI-NEXT: ; %bb.2: ; %atomicrmw.end 3030; SI-NEXT: s_or_b64 exec, exec, s[8:9] 3031; SI-NEXT: s_waitcnt expcnt(0) 3032; SI-NEXT: s_setpc_b64 s[30:31] 3033; 3034; VI-LABEL: global_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory: 3035; VI: ; %bb.0: 3036; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3037; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 3038; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3039; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] 3040; VI-NEXT: s_mov_b64 s[4:5], 0 3041; VI-NEXT: .LBB58_1: ; %atomicrmw.start 3042; VI-NEXT: ; =>This Inner Loop Header: Depth=1 3043; VI-NEXT: s_waitcnt vmcnt(0) 3044; VI-NEXT: v_and_b32_e32 v4, v7, v3 3045; VI-NEXT: v_and_b32_e32 v8, v6, v2 3046; VI-NEXT: v_not_b32_e32 v5, v4 3047; VI-NEXT: v_not_b32_e32 v4, v8 3048; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc 3049; VI-NEXT: s_waitcnt vmcnt(0) 3050; VI-NEXT: buffer_wbinvl1_vol 3051; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 3052; VI-NEXT: v_mov_b32_e32 v7, v5 3053; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 3054; VI-NEXT: v_mov_b32_e32 v6, v4 3055; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] 3056; VI-NEXT: s_cbranch_execnz .LBB58_1 3057; VI-NEXT: ; %bb.2: ; %atomicrmw.end 3058; VI-NEXT: s_or_b64 exec, exec, s[4:5] 3059; VI-NEXT: s_setpc_b64 s[30:31] 3060; 3061; GFX9-LABEL: global_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory: 3062; GFX9: ; %bb.0: 3063; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3064; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 3065; GFX9-NEXT: s_mov_b64 s[4:5], 0 3066; GFX9-NEXT: .LBB58_1: ; %atomicrmw.start 3067; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 3068; GFX9-NEXT: s_waitcnt vmcnt(0) 3069; GFX9-NEXT: v_and_b32_e32 v4, v7, v3 3070; GFX9-NEXT: v_and_b32_e32 v8, v6, v2 3071; GFX9-NEXT: v_not_b32_e32 v5, v4 3072; GFX9-NEXT: v_not_b32_e32 v4, v8 3073; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc 3074; GFX9-NEXT: s_waitcnt vmcnt(0) 3075; GFX9-NEXT: buffer_wbinvl1_vol 3076; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 3077; GFX9-NEXT: v_mov_b32_e32 v7, v5 3078; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 3079; GFX9-NEXT: v_mov_b32_e32 v6, v4 3080; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 3081; GFX9-NEXT: s_cbranch_execnz .LBB58_1 3082; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 3083; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 3084; GFX9-NEXT: s_setpc_b64 s[30:31] 3085 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 3086 %tmp0 = atomicrmw nand ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 3087 ret void 3088} 3089 3090define i64 @global_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) { 3091; SI-LABEL: global_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory: 3092; SI: ; %bb.0: 3093; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3094; SI-NEXT: v_mov_b32_e32 v6, v3 3095; SI-NEXT: v_mov_b32_e32 v7, v2 3096; SI-NEXT: v_mov_b32_e32 v5, v1 3097; SI-NEXT: v_mov_b32_e32 v4, v0 3098; SI-NEXT: s_mov_b32 s6, 0 3099; SI-NEXT: s_mov_b32 s7, 0xf000 3100; SI-NEXT: s_mov_b32 s4, s6 3101; SI-NEXT: s_mov_b32 s5, s6 3102; SI-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 offset:32 3103; SI-NEXT: s_mov_b64 s[8:9], 0 3104; SI-NEXT: .LBB59_1: ; %atomicrmw.start 3105; SI-NEXT: ; =>This Inner Loop Header: Depth=1 3106; SI-NEXT: s_waitcnt vmcnt(0) 3107; SI-NEXT: v_mov_b32_e32 v11, v1 3108; SI-NEXT: v_mov_b32_e32 v10, v0 3109; SI-NEXT: s_waitcnt expcnt(0) 3110; SI-NEXT: v_and_b32_e32 v0, v11, v6 3111; SI-NEXT: v_and_b32_e32 v1, v10, v7 3112; SI-NEXT: v_not_b32_e32 v9, v0 3113; SI-NEXT: v_not_b32_e32 v8, v1 3114; SI-NEXT: v_mov_b32_e32 v0, v8 3115; SI-NEXT: v_mov_b32_e32 v1, v9 3116; SI-NEXT: v_mov_b32_e32 v2, v10 3117; SI-NEXT: v_mov_b32_e32 v3, v11 3118; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 offset:32 glc 3119; SI-NEXT: s_waitcnt vmcnt(0) 3120; SI-NEXT: buffer_wbinvl1 3121; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] 3122; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 3123; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] 3124; SI-NEXT: s_cbranch_execnz .LBB59_1 3125; SI-NEXT: ; %bb.2: ; %atomicrmw.end 3126; SI-NEXT: s_or_b64 exec, exec, s[8:9] 3127; SI-NEXT: s_waitcnt expcnt(0) 3128; SI-NEXT: s_setpc_b64 s[30:31] 3129; 3130; VI-LABEL: global_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory: 3131; VI: ; %bb.0: 3132; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3133; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0 3134; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc 3135; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] 3136; VI-NEXT: s_mov_b64 s[4:5], 0 3137; VI-NEXT: .LBB59_1: ; %atomicrmw.start 3138; VI-NEXT: ; =>This Inner Loop Header: Depth=1 3139; VI-NEXT: s_waitcnt vmcnt(0) 3140; VI-NEXT: v_mov_b32_e32 v9, v1 3141; VI-NEXT: v_mov_b32_e32 v8, v0 3142; VI-NEXT: v_and_b32_e32 v0, v9, v3 3143; VI-NEXT: v_and_b32_e32 v1, v8, v2 3144; VI-NEXT: v_not_b32_e32 v7, v0 3145; VI-NEXT: v_not_b32_e32 v6, v1 3146; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc 3147; VI-NEXT: s_waitcnt vmcnt(0) 3148; VI-NEXT: buffer_wbinvl1_vol 3149; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] 3150; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 3151; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] 3152; VI-NEXT: s_cbranch_execnz .LBB59_1 3153; VI-NEXT: ; %bb.2: ; %atomicrmw.end 3154; VI-NEXT: s_or_b64 exec, exec, s[4:5] 3155; VI-NEXT: s_setpc_b64 s[30:31] 3156; 3157; GFX9-LABEL: global_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory: 3158; GFX9: ; %bb.0: 3159; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3160; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 3161; GFX9-NEXT: s_mov_b64 s[4:5], 0 3162; GFX9-NEXT: .LBB59_1: ; %atomicrmw.start 3163; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 3164; GFX9-NEXT: s_waitcnt vmcnt(0) 3165; GFX9-NEXT: v_mov_b32_e32 v7, v5 3166; GFX9-NEXT: v_mov_b32_e32 v6, v4 3167; GFX9-NEXT: v_and_b32_e32 v4, v7, v3 3168; GFX9-NEXT: v_and_b32_e32 v8, v6, v2 3169; GFX9-NEXT: v_not_b32_e32 v5, v4 3170; GFX9-NEXT: v_not_b32_e32 v4, v8 3171; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc 3172; GFX9-NEXT: s_waitcnt vmcnt(0) 3173; GFX9-NEXT: buffer_wbinvl1_vol 3174; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 3175; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 3176; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 3177; GFX9-NEXT: s_cbranch_execnz .LBB59_1 3178; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 3179; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 3180; GFX9-NEXT: v_mov_b32_e32 v0, v4 3181; GFX9-NEXT: v_mov_b32_e32 v1, v5 3182; GFX9-NEXT: s_setpc_b64 s[30:31] 3183 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 3184 %result = atomicrmw nand ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 3185 ret i64 %result 3186} 3187 3188; --------------------------------------------------------------------- 3189; atomicrmw or 3190; --------------------------------------------------------------------- 3191 3192define void @global_atomic_or_i64_noret(ptr addrspace(1) %ptr, i64 %in) { 3193; SI-LABEL: global_atomic_or_i64_noret: 3194; SI: ; %bb.0: 3195; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3196; SI-NEXT: s_mov_b32 s6, 0 3197; SI-NEXT: s_mov_b32 s7, 0xf000 3198; SI-NEXT: s_mov_b32 s4, s6 3199; SI-NEXT: s_mov_b32 s5, s6 3200; SI-NEXT: buffer_atomic_or_x2 v[2:3], v[0:1], s[4:7], 0 addr64 3201; SI-NEXT: s_waitcnt vmcnt(0) 3202; SI-NEXT: buffer_wbinvl1 3203; SI-NEXT: s_waitcnt expcnt(0) 3204; SI-NEXT: s_setpc_b64 s[30:31] 3205; 3206; VI-LABEL: global_atomic_or_i64_noret: 3207; VI: ; %bb.0: 3208; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3209; VI-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] 3210; VI-NEXT: s_waitcnt vmcnt(0) 3211; VI-NEXT: buffer_wbinvl1_vol 3212; VI-NEXT: s_setpc_b64 s[30:31] 3213; 3214; GFX9-LABEL: global_atomic_or_i64_noret: 3215; GFX9: ; %bb.0: 3216; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3217; GFX9-NEXT: global_atomic_or_x2 v[0:1], v[2:3], off 3218; GFX9-NEXT: s_waitcnt vmcnt(0) 3219; GFX9-NEXT: buffer_wbinvl1_vol 3220; GFX9-NEXT: s_setpc_b64 s[30:31] 3221 %tmp0 = atomicrmw or ptr addrspace(1) %ptr, i64 %in seq_cst 3222 ret void 3223} 3224 3225define void @global_atomic_or_i64_noret_offset(ptr addrspace(1) %out, i64 %in) { 3226; SI-LABEL: global_atomic_or_i64_noret_offset: 3227; SI: ; %bb.0: 3228; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3229; SI-NEXT: s_mov_b32 s6, 0 3230; SI-NEXT: s_mov_b32 s7, 0xf000 3231; SI-NEXT: s_mov_b32 s4, s6 3232; SI-NEXT: s_mov_b32 s5, s6 3233; SI-NEXT: buffer_atomic_or_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 3234; SI-NEXT: s_waitcnt vmcnt(0) 3235; SI-NEXT: buffer_wbinvl1 3236; SI-NEXT: s_waitcnt expcnt(0) 3237; SI-NEXT: s_setpc_b64 s[30:31] 3238; 3239; VI-LABEL: global_atomic_or_i64_noret_offset: 3240; VI: ; %bb.0: 3241; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3242; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 3243; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3244; VI-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] 3245; VI-NEXT: s_waitcnt vmcnt(0) 3246; VI-NEXT: buffer_wbinvl1_vol 3247; VI-NEXT: s_setpc_b64 s[30:31] 3248; 3249; GFX9-LABEL: global_atomic_or_i64_noret_offset: 3250; GFX9: ; %bb.0: 3251; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3252; GFX9-NEXT: global_atomic_or_x2 v[0:1], v[2:3], off offset:32 3253; GFX9-NEXT: s_waitcnt vmcnt(0) 3254; GFX9-NEXT: buffer_wbinvl1_vol 3255; GFX9-NEXT: s_setpc_b64 s[30:31] 3256 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 3257 %tmp0 = atomicrmw or ptr addrspace(1) %gep, i64 %in seq_cst 3258 ret void 3259} 3260 3261define i64 @global_atomic_or_i64_ret(ptr addrspace(1) %ptr, i64 %in) { 3262; SI-LABEL: global_atomic_or_i64_ret: 3263; SI: ; %bb.0: 3264; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3265; SI-NEXT: s_mov_b32 s6, 0 3266; SI-NEXT: s_mov_b32 s7, 0xf000 3267; SI-NEXT: s_mov_b32 s4, s6 3268; SI-NEXT: s_mov_b32 s5, s6 3269; SI-NEXT: buffer_atomic_or_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc 3270; SI-NEXT: s_waitcnt vmcnt(0) 3271; SI-NEXT: buffer_wbinvl1 3272; SI-NEXT: v_mov_b32_e32 v0, v2 3273; SI-NEXT: v_mov_b32_e32 v1, v3 3274; SI-NEXT: s_waitcnt expcnt(0) 3275; SI-NEXT: s_setpc_b64 s[30:31] 3276; 3277; VI-LABEL: global_atomic_or_i64_ret: 3278; VI: ; %bb.0: 3279; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3280; VI-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc 3281; VI-NEXT: s_waitcnt vmcnt(0) 3282; VI-NEXT: buffer_wbinvl1_vol 3283; VI-NEXT: s_setpc_b64 s[30:31] 3284; 3285; GFX9-LABEL: global_atomic_or_i64_ret: 3286; GFX9: ; %bb.0: 3287; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3288; GFX9-NEXT: global_atomic_or_x2 v[0:1], v[0:1], v[2:3], off glc 3289; GFX9-NEXT: s_waitcnt vmcnt(0) 3290; GFX9-NEXT: buffer_wbinvl1_vol 3291; GFX9-NEXT: s_setpc_b64 s[30:31] 3292 %result = atomicrmw or ptr addrspace(1) %ptr, i64 %in seq_cst 3293 ret i64 %result 3294} 3295 3296define i64 @global_atomic_or_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { 3297; SI-LABEL: global_atomic_or_i64_ret_offset: 3298; SI: ; %bb.0: 3299; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3300; SI-NEXT: s_mov_b32 s6, 0 3301; SI-NEXT: s_mov_b32 s7, 0xf000 3302; SI-NEXT: s_mov_b32 s4, s6 3303; SI-NEXT: s_mov_b32 s5, s6 3304; SI-NEXT: buffer_atomic_or_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc 3305; SI-NEXT: s_waitcnt vmcnt(0) 3306; SI-NEXT: buffer_wbinvl1 3307; SI-NEXT: v_mov_b32_e32 v0, v2 3308; SI-NEXT: v_mov_b32_e32 v1, v3 3309; SI-NEXT: s_waitcnt expcnt(0) 3310; SI-NEXT: s_setpc_b64 s[30:31] 3311; 3312; VI-LABEL: global_atomic_or_i64_ret_offset: 3313; VI: ; %bb.0: 3314; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3315; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 3316; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3317; VI-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc 3318; VI-NEXT: s_waitcnt vmcnt(0) 3319; VI-NEXT: buffer_wbinvl1_vol 3320; VI-NEXT: s_setpc_b64 s[30:31] 3321; 3322; GFX9-LABEL: global_atomic_or_i64_ret_offset: 3323; GFX9: ; %bb.0: 3324; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3325; GFX9-NEXT: global_atomic_or_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc 3326; GFX9-NEXT: s_waitcnt vmcnt(0) 3327; GFX9-NEXT: buffer_wbinvl1_vol 3328; GFX9-NEXT: s_setpc_b64 s[30:31] 3329 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 3330 %result = atomicrmw or ptr addrspace(1) %gep, i64 %in seq_cst 3331 ret i64 %result 3332} 3333 3334define amdgpu_gfx void @global_atomic_or_i64_noret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) { 3335; SI-LABEL: global_atomic_or_i64_noret_scalar: 3336; SI: ; %bb.0: 3337; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3338; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 3339; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill 3340; SI-NEXT: s_mov_b64 exec, s[34:35] 3341; SI-NEXT: s_waitcnt expcnt(0) 3342; SI-NEXT: v_writelane_b32 v2, s6, 0 3343; SI-NEXT: v_writelane_b32 v2, s7, 1 3344; SI-NEXT: s_mov_b32 s34, s7 3345; SI-NEXT: s_mov_b32 s35, s6 3346; SI-NEXT: s_mov_b32 s7, 0xf000 3347; SI-NEXT: s_mov_b32 s6, -1 3348; SI-NEXT: v_mov_b32_e32 v0, s35 3349; SI-NEXT: v_mov_b32_e32 v1, s34 3350; SI-NEXT: s_waitcnt vmcnt(0) 3351; SI-NEXT: buffer_atomic_or_x2 v[0:1], off, s[4:7], 0 3352; SI-NEXT: s_waitcnt vmcnt(0) 3353; SI-NEXT: buffer_wbinvl1 3354; SI-NEXT: v_readlane_b32 s7, v2, 1 3355; SI-NEXT: v_readlane_b32 s6, v2, 0 3356; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 3357; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload 3358; SI-NEXT: s_mov_b64 exec, s[34:35] 3359; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 3360; SI-NEXT: s_setpc_b64 s[30:31] 3361; 3362; VI-LABEL: global_atomic_or_i64_noret_scalar: 3363; VI: ; %bb.0: 3364; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3365; VI-NEXT: v_mov_b32_e32 v0, s6 3366; VI-NEXT: v_mov_b32_e32 v1, s7 3367; VI-NEXT: v_mov_b32_e32 v2, s4 3368; VI-NEXT: v_mov_b32_e32 v3, s5 3369; VI-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] 3370; VI-NEXT: s_waitcnt vmcnt(0) 3371; VI-NEXT: buffer_wbinvl1_vol 3372; VI-NEXT: s_setpc_b64 s[30:31] 3373; 3374; GFX9-LABEL: global_atomic_or_i64_noret_scalar: 3375; GFX9: ; %bb.0: 3376; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3377; GFX9-NEXT: v_mov_b32_e32 v0, s6 3378; GFX9-NEXT: v_mov_b32_e32 v1, s7 3379; GFX9-NEXT: v_mov_b32_e32 v2, 0 3380; GFX9-NEXT: global_atomic_or_x2 v2, v[0:1], s[4:5] 3381; GFX9-NEXT: s_waitcnt vmcnt(0) 3382; GFX9-NEXT: buffer_wbinvl1_vol 3383; GFX9-NEXT: s_setpc_b64 s[30:31] 3384 %tmp0 = atomicrmw or ptr addrspace(1) %ptr, i64 %in seq_cst 3385 ret void 3386} 3387 3388define amdgpu_gfx void @global_atomic_or_i64_noret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) { 3389; SI-LABEL: global_atomic_or_i64_noret_offset_scalar: 3390; SI: ; %bb.0: 3391; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3392; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 3393; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill 3394; SI-NEXT: s_mov_b64 exec, s[34:35] 3395; SI-NEXT: s_waitcnt expcnt(0) 3396; SI-NEXT: v_writelane_b32 v2, s6, 0 3397; SI-NEXT: v_writelane_b32 v2, s7, 1 3398; SI-NEXT: v_mov_b32_e32 v0, s6 3399; SI-NEXT: v_mov_b32_e32 v1, s7 3400; SI-NEXT: s_mov_b32 s7, 0xf000 3401; SI-NEXT: s_mov_b32 s6, -1 3402; SI-NEXT: s_waitcnt vmcnt(0) 3403; SI-NEXT: buffer_atomic_or_x2 v[0:1], off, s[4:7], 0 offset:32 3404; SI-NEXT: s_waitcnt vmcnt(0) 3405; SI-NEXT: buffer_wbinvl1 3406; SI-NEXT: v_readlane_b32 s7, v2, 1 3407; SI-NEXT: v_readlane_b32 s6, v2, 0 3408; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 3409; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload 3410; SI-NEXT: s_mov_b64 exec, s[34:35] 3411; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 3412; SI-NEXT: s_setpc_b64 s[30:31] 3413; 3414; VI-LABEL: global_atomic_or_i64_noret_offset_scalar: 3415; VI: ; %bb.0: 3416; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3417; VI-NEXT: s_add_u32 s34, s4, 32 3418; VI-NEXT: s_addc_u32 s35, s5, 0 3419; VI-NEXT: v_mov_b32_e32 v2, s34 3420; VI-NEXT: v_mov_b32_e32 v0, s6 3421; VI-NEXT: v_mov_b32_e32 v1, s7 3422; VI-NEXT: v_mov_b32_e32 v3, s35 3423; VI-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] 3424; VI-NEXT: s_waitcnt vmcnt(0) 3425; VI-NEXT: buffer_wbinvl1_vol 3426; VI-NEXT: s_setpc_b64 s[30:31] 3427; 3428; GFX9-LABEL: global_atomic_or_i64_noret_offset_scalar: 3429; GFX9: ; %bb.0: 3430; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3431; GFX9-NEXT: v_mov_b32_e32 v0, s6 3432; GFX9-NEXT: v_mov_b32_e32 v1, s7 3433; GFX9-NEXT: v_mov_b32_e32 v2, 0 3434; GFX9-NEXT: global_atomic_or_x2 v2, v[0:1], s[4:5] offset:32 3435; GFX9-NEXT: s_waitcnt vmcnt(0) 3436; GFX9-NEXT: buffer_wbinvl1_vol 3437; GFX9-NEXT: s_setpc_b64 s[30:31] 3438 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 3439 %tmp0 = atomicrmw or ptr addrspace(1) %gep, i64 %in seq_cst 3440 ret void 3441} 3442 3443define amdgpu_gfx i64 @global_atomic_or_i64_ret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) { 3444; SI-LABEL: global_atomic_or_i64_ret_scalar: 3445; SI: ; %bb.0: 3446; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3447; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 3448; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill 3449; SI-NEXT: s_mov_b64 exec, s[34:35] 3450; SI-NEXT: s_waitcnt expcnt(0) 3451; SI-NEXT: v_writelane_b32 v2, s6, 0 3452; SI-NEXT: v_writelane_b32 v2, s7, 1 3453; SI-NEXT: s_mov_b32 s34, s7 3454; SI-NEXT: s_mov_b32 s35, s6 3455; SI-NEXT: s_mov_b32 s7, 0xf000 3456; SI-NEXT: s_mov_b32 s6, -1 3457; SI-NEXT: v_mov_b32_e32 v0, s35 3458; SI-NEXT: v_mov_b32_e32 v1, s34 3459; SI-NEXT: s_waitcnt vmcnt(0) 3460; SI-NEXT: buffer_atomic_or_x2 v[0:1], off, s[4:7], 0 glc 3461; SI-NEXT: s_waitcnt vmcnt(0) 3462; SI-NEXT: buffer_wbinvl1 3463; SI-NEXT: v_readlane_b32 s7, v2, 1 3464; SI-NEXT: v_readlane_b32 s6, v2, 0 3465; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 3466; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload 3467; SI-NEXT: s_mov_b64 exec, s[34:35] 3468; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 3469; SI-NEXT: s_setpc_b64 s[30:31] 3470; 3471; VI-LABEL: global_atomic_or_i64_ret_scalar: 3472; VI: ; %bb.0: 3473; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3474; VI-NEXT: v_mov_b32_e32 v0, s6 3475; VI-NEXT: v_mov_b32_e32 v1, s7 3476; VI-NEXT: v_mov_b32_e32 v2, s4 3477; VI-NEXT: v_mov_b32_e32 v3, s5 3478; VI-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc 3479; VI-NEXT: s_waitcnt vmcnt(0) 3480; VI-NEXT: buffer_wbinvl1_vol 3481; VI-NEXT: s_setpc_b64 s[30:31] 3482; 3483; GFX9-LABEL: global_atomic_or_i64_ret_scalar: 3484; GFX9: ; %bb.0: 3485; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3486; GFX9-NEXT: v_mov_b32_e32 v0, s6 3487; GFX9-NEXT: v_mov_b32_e32 v1, s7 3488; GFX9-NEXT: v_mov_b32_e32 v2, 0 3489; GFX9-NEXT: global_atomic_or_x2 v[0:1], v2, v[0:1], s[4:5] glc 3490; GFX9-NEXT: s_waitcnt vmcnt(0) 3491; GFX9-NEXT: buffer_wbinvl1_vol 3492; GFX9-NEXT: s_setpc_b64 s[30:31] 3493 %result = atomicrmw or ptr addrspace(1) %ptr, i64 %in seq_cst 3494 ret i64 %result 3495} 3496 3497define amdgpu_gfx i64 @global_atomic_or_i64_ret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) { 3498; SI-LABEL: global_atomic_or_i64_ret_offset_scalar: 3499; SI: ; %bb.0: 3500; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3501; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 3502; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill 3503; SI-NEXT: s_mov_b64 exec, s[34:35] 3504; SI-NEXT: s_waitcnt expcnt(0) 3505; SI-NEXT: v_writelane_b32 v2, s6, 0 3506; SI-NEXT: v_writelane_b32 v2, s7, 1 3507; SI-NEXT: v_mov_b32_e32 v0, s6 3508; SI-NEXT: v_mov_b32_e32 v1, s7 3509; SI-NEXT: s_mov_b32 s7, 0xf000 3510; SI-NEXT: s_mov_b32 s6, -1 3511; SI-NEXT: s_waitcnt vmcnt(0) 3512; SI-NEXT: buffer_atomic_or_x2 v[0:1], off, s[4:7], 0 offset:32 glc 3513; SI-NEXT: s_waitcnt vmcnt(0) 3514; SI-NEXT: buffer_wbinvl1 3515; SI-NEXT: v_readlane_b32 s7, v2, 1 3516; SI-NEXT: v_readlane_b32 s6, v2, 0 3517; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 3518; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload 3519; SI-NEXT: s_mov_b64 exec, s[34:35] 3520; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 3521; SI-NEXT: s_setpc_b64 s[30:31] 3522; 3523; VI-LABEL: global_atomic_or_i64_ret_offset_scalar: 3524; VI: ; %bb.0: 3525; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3526; VI-NEXT: s_add_u32 s34, s4, 32 3527; VI-NEXT: s_addc_u32 s35, s5, 0 3528; VI-NEXT: v_mov_b32_e32 v2, s34 3529; VI-NEXT: v_mov_b32_e32 v0, s6 3530; VI-NEXT: v_mov_b32_e32 v1, s7 3531; VI-NEXT: v_mov_b32_e32 v3, s35 3532; VI-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc 3533; VI-NEXT: s_waitcnt vmcnt(0) 3534; VI-NEXT: buffer_wbinvl1_vol 3535; VI-NEXT: s_setpc_b64 s[30:31] 3536; 3537; GFX9-LABEL: global_atomic_or_i64_ret_offset_scalar: 3538; GFX9: ; %bb.0: 3539; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3540; GFX9-NEXT: v_mov_b32_e32 v0, s6 3541; GFX9-NEXT: v_mov_b32_e32 v1, s7 3542; GFX9-NEXT: v_mov_b32_e32 v2, 0 3543; GFX9-NEXT: global_atomic_or_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc 3544; GFX9-NEXT: s_waitcnt vmcnt(0) 3545; GFX9-NEXT: buffer_wbinvl1_vol 3546; GFX9-NEXT: s_setpc_b64 s[30:31] 3547 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 3548 %result = atomicrmw or ptr addrspace(1) %gep, i64 %in seq_cst 3549 ret i64 %result 3550} 3551 3552define void @global_atomic_or_i64_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) { 3553; SI-LABEL: global_atomic_or_i64_noret_offset__amdgpu_no_remote_memory: 3554; SI: ; %bb.0: 3555; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3556; SI-NEXT: s_mov_b32 s6, 0 3557; SI-NEXT: s_mov_b32 s7, 0xf000 3558; SI-NEXT: s_mov_b32 s4, s6 3559; SI-NEXT: s_mov_b32 s5, s6 3560; SI-NEXT: buffer_atomic_or_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 3561; SI-NEXT: s_waitcnt vmcnt(0) 3562; SI-NEXT: buffer_wbinvl1 3563; SI-NEXT: s_waitcnt expcnt(0) 3564; SI-NEXT: s_setpc_b64 s[30:31] 3565; 3566; VI-LABEL: global_atomic_or_i64_noret_offset__amdgpu_no_remote_memory: 3567; VI: ; %bb.0: 3568; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3569; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 3570; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3571; VI-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] 3572; VI-NEXT: s_waitcnt vmcnt(0) 3573; VI-NEXT: buffer_wbinvl1_vol 3574; VI-NEXT: s_setpc_b64 s[30:31] 3575; 3576; GFX9-LABEL: global_atomic_or_i64_noret_offset__amdgpu_no_remote_memory: 3577; GFX9: ; %bb.0: 3578; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3579; GFX9-NEXT: global_atomic_or_x2 v[0:1], v[2:3], off offset:32 3580; GFX9-NEXT: s_waitcnt vmcnt(0) 3581; GFX9-NEXT: buffer_wbinvl1_vol 3582; GFX9-NEXT: s_setpc_b64 s[30:31] 3583 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 3584 %tmp0 = atomicrmw or ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 3585 ret void 3586} 3587 3588define i64 @global_atomic_or_i64_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) { 3589; SI-LABEL: global_atomic_or_i64_ret_offset__amdgpu_no_remote_memory: 3590; SI: ; %bb.0: 3591; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3592; SI-NEXT: s_mov_b32 s6, 0 3593; SI-NEXT: s_mov_b32 s7, 0xf000 3594; SI-NEXT: s_mov_b32 s4, s6 3595; SI-NEXT: s_mov_b32 s5, s6 3596; SI-NEXT: buffer_atomic_or_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc 3597; SI-NEXT: s_waitcnt vmcnt(0) 3598; SI-NEXT: buffer_wbinvl1 3599; SI-NEXT: v_mov_b32_e32 v0, v2 3600; SI-NEXT: v_mov_b32_e32 v1, v3 3601; SI-NEXT: s_waitcnt expcnt(0) 3602; SI-NEXT: s_setpc_b64 s[30:31] 3603; 3604; VI-LABEL: global_atomic_or_i64_ret_offset__amdgpu_no_remote_memory: 3605; VI: ; %bb.0: 3606; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3607; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 3608; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3609; VI-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc 3610; VI-NEXT: s_waitcnt vmcnt(0) 3611; VI-NEXT: buffer_wbinvl1_vol 3612; VI-NEXT: s_setpc_b64 s[30:31] 3613; 3614; GFX9-LABEL: global_atomic_or_i64_ret_offset__amdgpu_no_remote_memory: 3615; GFX9: ; %bb.0: 3616; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3617; GFX9-NEXT: global_atomic_or_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc 3618; GFX9-NEXT: s_waitcnt vmcnt(0) 3619; GFX9-NEXT: buffer_wbinvl1_vol 3620; GFX9-NEXT: s_setpc_b64 s[30:31] 3621 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 3622 %result = atomicrmw or ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 3623 ret i64 %result 3624} 3625 3626; --------------------------------------------------------------------- 3627; atomicrmw xor 3628; --------------------------------------------------------------------- 3629 3630define void @global_atomic_xor_i64_noret(ptr addrspace(1) %ptr, i64 %in) { 3631; SI-LABEL: global_atomic_xor_i64_noret: 3632; SI: ; %bb.0: 3633; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3634; SI-NEXT: s_mov_b32 s6, 0 3635; SI-NEXT: s_mov_b32 s7, 0xf000 3636; SI-NEXT: s_mov_b32 s4, s6 3637; SI-NEXT: s_mov_b32 s5, s6 3638; SI-NEXT: buffer_atomic_xor_x2 v[2:3], v[0:1], s[4:7], 0 addr64 3639; SI-NEXT: s_waitcnt vmcnt(0) 3640; SI-NEXT: buffer_wbinvl1 3641; SI-NEXT: s_waitcnt expcnt(0) 3642; SI-NEXT: s_setpc_b64 s[30:31] 3643; 3644; VI-LABEL: global_atomic_xor_i64_noret: 3645; VI: ; %bb.0: 3646; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3647; VI-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] 3648; VI-NEXT: s_waitcnt vmcnt(0) 3649; VI-NEXT: buffer_wbinvl1_vol 3650; VI-NEXT: s_setpc_b64 s[30:31] 3651; 3652; GFX9-LABEL: global_atomic_xor_i64_noret: 3653; GFX9: ; %bb.0: 3654; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3655; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v[2:3], off 3656; GFX9-NEXT: s_waitcnt vmcnt(0) 3657; GFX9-NEXT: buffer_wbinvl1_vol 3658; GFX9-NEXT: s_setpc_b64 s[30:31] 3659 %tmp0 = atomicrmw xor ptr addrspace(1) %ptr, i64 %in seq_cst 3660 ret void 3661} 3662 3663define void @global_atomic_xor_i64_noret_offset(ptr addrspace(1) %out, i64 %in) { 3664; SI-LABEL: global_atomic_xor_i64_noret_offset: 3665; SI: ; %bb.0: 3666; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3667; SI-NEXT: s_mov_b32 s6, 0 3668; SI-NEXT: s_mov_b32 s7, 0xf000 3669; SI-NEXT: s_mov_b32 s4, s6 3670; SI-NEXT: s_mov_b32 s5, s6 3671; SI-NEXT: buffer_atomic_xor_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 3672; SI-NEXT: s_waitcnt vmcnt(0) 3673; SI-NEXT: buffer_wbinvl1 3674; SI-NEXT: s_waitcnt expcnt(0) 3675; SI-NEXT: s_setpc_b64 s[30:31] 3676; 3677; VI-LABEL: global_atomic_xor_i64_noret_offset: 3678; VI: ; %bb.0: 3679; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3680; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 3681; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3682; VI-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] 3683; VI-NEXT: s_waitcnt vmcnt(0) 3684; VI-NEXT: buffer_wbinvl1_vol 3685; VI-NEXT: s_setpc_b64 s[30:31] 3686; 3687; GFX9-LABEL: global_atomic_xor_i64_noret_offset: 3688; GFX9: ; %bb.0: 3689; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3690; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v[2:3], off offset:32 3691; GFX9-NEXT: s_waitcnt vmcnt(0) 3692; GFX9-NEXT: buffer_wbinvl1_vol 3693; GFX9-NEXT: s_setpc_b64 s[30:31] 3694 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 3695 %tmp0 = atomicrmw xor ptr addrspace(1) %gep, i64 %in seq_cst 3696 ret void 3697} 3698 3699define i64 @global_atomic_xor_i64_ret(ptr addrspace(1) %ptr, i64 %in) { 3700; SI-LABEL: global_atomic_xor_i64_ret: 3701; SI: ; %bb.0: 3702; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3703; SI-NEXT: s_mov_b32 s6, 0 3704; SI-NEXT: s_mov_b32 s7, 0xf000 3705; SI-NEXT: s_mov_b32 s4, s6 3706; SI-NEXT: s_mov_b32 s5, s6 3707; SI-NEXT: buffer_atomic_xor_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc 3708; SI-NEXT: s_waitcnt vmcnt(0) 3709; SI-NEXT: buffer_wbinvl1 3710; SI-NEXT: v_mov_b32_e32 v0, v2 3711; SI-NEXT: v_mov_b32_e32 v1, v3 3712; SI-NEXT: s_waitcnt expcnt(0) 3713; SI-NEXT: s_setpc_b64 s[30:31] 3714; 3715; VI-LABEL: global_atomic_xor_i64_ret: 3716; VI: ; %bb.0: 3717; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3718; VI-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc 3719; VI-NEXT: s_waitcnt vmcnt(0) 3720; VI-NEXT: buffer_wbinvl1_vol 3721; VI-NEXT: s_setpc_b64 s[30:31] 3722; 3723; GFX9-LABEL: global_atomic_xor_i64_ret: 3724; GFX9: ; %bb.0: 3725; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3726; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v[0:1], v[2:3], off glc 3727; GFX9-NEXT: s_waitcnt vmcnt(0) 3728; GFX9-NEXT: buffer_wbinvl1_vol 3729; GFX9-NEXT: s_setpc_b64 s[30:31] 3730 %result = atomicrmw xor ptr addrspace(1) %ptr, i64 %in seq_cst 3731 ret i64 %result 3732} 3733 3734define i64 @global_atomic_xor_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { 3735; SI-LABEL: global_atomic_xor_i64_ret_offset: 3736; SI: ; %bb.0: 3737; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3738; SI-NEXT: s_mov_b32 s6, 0 3739; SI-NEXT: s_mov_b32 s7, 0xf000 3740; SI-NEXT: s_mov_b32 s4, s6 3741; SI-NEXT: s_mov_b32 s5, s6 3742; SI-NEXT: buffer_atomic_xor_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc 3743; SI-NEXT: s_waitcnt vmcnt(0) 3744; SI-NEXT: buffer_wbinvl1 3745; SI-NEXT: v_mov_b32_e32 v0, v2 3746; SI-NEXT: v_mov_b32_e32 v1, v3 3747; SI-NEXT: s_waitcnt expcnt(0) 3748; SI-NEXT: s_setpc_b64 s[30:31] 3749; 3750; VI-LABEL: global_atomic_xor_i64_ret_offset: 3751; VI: ; %bb.0: 3752; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3753; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 3754; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3755; VI-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc 3756; VI-NEXT: s_waitcnt vmcnt(0) 3757; VI-NEXT: buffer_wbinvl1_vol 3758; VI-NEXT: s_setpc_b64 s[30:31] 3759; 3760; GFX9-LABEL: global_atomic_xor_i64_ret_offset: 3761; GFX9: ; %bb.0: 3762; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3763; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc 3764; GFX9-NEXT: s_waitcnt vmcnt(0) 3765; GFX9-NEXT: buffer_wbinvl1_vol 3766; GFX9-NEXT: s_setpc_b64 s[30:31] 3767 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 3768 %result = atomicrmw xor ptr addrspace(1) %gep, i64 %in seq_cst 3769 ret i64 %result 3770} 3771 3772define amdgpu_gfx void @global_atomic_xor_i64_noret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) { 3773; SI-LABEL: global_atomic_xor_i64_noret_scalar: 3774; SI: ; %bb.0: 3775; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3776; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 3777; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill 3778; SI-NEXT: s_mov_b64 exec, s[34:35] 3779; SI-NEXT: s_waitcnt expcnt(0) 3780; SI-NEXT: v_writelane_b32 v2, s6, 0 3781; SI-NEXT: v_writelane_b32 v2, s7, 1 3782; SI-NEXT: s_mov_b32 s34, s7 3783; SI-NEXT: s_mov_b32 s35, s6 3784; SI-NEXT: s_mov_b32 s7, 0xf000 3785; SI-NEXT: s_mov_b32 s6, -1 3786; SI-NEXT: v_mov_b32_e32 v0, s35 3787; SI-NEXT: v_mov_b32_e32 v1, s34 3788; SI-NEXT: s_waitcnt vmcnt(0) 3789; SI-NEXT: buffer_atomic_xor_x2 v[0:1], off, s[4:7], 0 3790; SI-NEXT: s_waitcnt vmcnt(0) 3791; SI-NEXT: buffer_wbinvl1 3792; SI-NEXT: v_readlane_b32 s7, v2, 1 3793; SI-NEXT: v_readlane_b32 s6, v2, 0 3794; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 3795; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload 3796; SI-NEXT: s_mov_b64 exec, s[34:35] 3797; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 3798; SI-NEXT: s_setpc_b64 s[30:31] 3799; 3800; VI-LABEL: global_atomic_xor_i64_noret_scalar: 3801; VI: ; %bb.0: 3802; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3803; VI-NEXT: v_mov_b32_e32 v0, s6 3804; VI-NEXT: v_mov_b32_e32 v1, s7 3805; VI-NEXT: v_mov_b32_e32 v2, s4 3806; VI-NEXT: v_mov_b32_e32 v3, s5 3807; VI-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] 3808; VI-NEXT: s_waitcnt vmcnt(0) 3809; VI-NEXT: buffer_wbinvl1_vol 3810; VI-NEXT: s_setpc_b64 s[30:31] 3811; 3812; GFX9-LABEL: global_atomic_xor_i64_noret_scalar: 3813; GFX9: ; %bb.0: 3814; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3815; GFX9-NEXT: v_mov_b32_e32 v0, s6 3816; GFX9-NEXT: v_mov_b32_e32 v1, s7 3817; GFX9-NEXT: v_mov_b32_e32 v2, 0 3818; GFX9-NEXT: global_atomic_xor_x2 v2, v[0:1], s[4:5] 3819; GFX9-NEXT: s_waitcnt vmcnt(0) 3820; GFX9-NEXT: buffer_wbinvl1_vol 3821; GFX9-NEXT: s_setpc_b64 s[30:31] 3822 %tmp0 = atomicrmw xor ptr addrspace(1) %ptr, i64 %in seq_cst 3823 ret void 3824} 3825 3826define amdgpu_gfx void @global_atomic_xor_i64_noret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) { 3827; SI-LABEL: global_atomic_xor_i64_noret_offset_scalar: 3828; SI: ; %bb.0: 3829; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3830; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 3831; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill 3832; SI-NEXT: s_mov_b64 exec, s[34:35] 3833; SI-NEXT: s_waitcnt expcnt(0) 3834; SI-NEXT: v_writelane_b32 v2, s6, 0 3835; SI-NEXT: v_writelane_b32 v2, s7, 1 3836; SI-NEXT: v_mov_b32_e32 v0, s6 3837; SI-NEXT: v_mov_b32_e32 v1, s7 3838; SI-NEXT: s_mov_b32 s7, 0xf000 3839; SI-NEXT: s_mov_b32 s6, -1 3840; SI-NEXT: s_waitcnt vmcnt(0) 3841; SI-NEXT: buffer_atomic_xor_x2 v[0:1], off, s[4:7], 0 offset:32 3842; SI-NEXT: s_waitcnt vmcnt(0) 3843; SI-NEXT: buffer_wbinvl1 3844; SI-NEXT: v_readlane_b32 s7, v2, 1 3845; SI-NEXT: v_readlane_b32 s6, v2, 0 3846; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 3847; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload 3848; SI-NEXT: s_mov_b64 exec, s[34:35] 3849; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 3850; SI-NEXT: s_setpc_b64 s[30:31] 3851; 3852; VI-LABEL: global_atomic_xor_i64_noret_offset_scalar: 3853; VI: ; %bb.0: 3854; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3855; VI-NEXT: s_add_u32 s34, s4, 32 3856; VI-NEXT: s_addc_u32 s35, s5, 0 3857; VI-NEXT: v_mov_b32_e32 v2, s34 3858; VI-NEXT: v_mov_b32_e32 v0, s6 3859; VI-NEXT: v_mov_b32_e32 v1, s7 3860; VI-NEXT: v_mov_b32_e32 v3, s35 3861; VI-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] 3862; VI-NEXT: s_waitcnt vmcnt(0) 3863; VI-NEXT: buffer_wbinvl1_vol 3864; VI-NEXT: s_setpc_b64 s[30:31] 3865; 3866; GFX9-LABEL: global_atomic_xor_i64_noret_offset_scalar: 3867; GFX9: ; %bb.0: 3868; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3869; GFX9-NEXT: v_mov_b32_e32 v0, s6 3870; GFX9-NEXT: v_mov_b32_e32 v1, s7 3871; GFX9-NEXT: v_mov_b32_e32 v2, 0 3872; GFX9-NEXT: global_atomic_xor_x2 v2, v[0:1], s[4:5] offset:32 3873; GFX9-NEXT: s_waitcnt vmcnt(0) 3874; GFX9-NEXT: buffer_wbinvl1_vol 3875; GFX9-NEXT: s_setpc_b64 s[30:31] 3876 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 3877 %tmp0 = atomicrmw xor ptr addrspace(1) %gep, i64 %in seq_cst 3878 ret void 3879} 3880 3881define amdgpu_gfx i64 @global_atomic_xor_i64_ret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) { 3882; SI-LABEL: global_atomic_xor_i64_ret_scalar: 3883; SI: ; %bb.0: 3884; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3885; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 3886; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill 3887; SI-NEXT: s_mov_b64 exec, s[34:35] 3888; SI-NEXT: s_waitcnt expcnt(0) 3889; SI-NEXT: v_writelane_b32 v2, s6, 0 3890; SI-NEXT: v_writelane_b32 v2, s7, 1 3891; SI-NEXT: s_mov_b32 s34, s7 3892; SI-NEXT: s_mov_b32 s35, s6 3893; SI-NEXT: s_mov_b32 s7, 0xf000 3894; SI-NEXT: s_mov_b32 s6, -1 3895; SI-NEXT: v_mov_b32_e32 v0, s35 3896; SI-NEXT: v_mov_b32_e32 v1, s34 3897; SI-NEXT: s_waitcnt vmcnt(0) 3898; SI-NEXT: buffer_atomic_xor_x2 v[0:1], off, s[4:7], 0 glc 3899; SI-NEXT: s_waitcnt vmcnt(0) 3900; SI-NEXT: buffer_wbinvl1 3901; SI-NEXT: v_readlane_b32 s7, v2, 1 3902; SI-NEXT: v_readlane_b32 s6, v2, 0 3903; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 3904; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload 3905; SI-NEXT: s_mov_b64 exec, s[34:35] 3906; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 3907; SI-NEXT: s_setpc_b64 s[30:31] 3908; 3909; VI-LABEL: global_atomic_xor_i64_ret_scalar: 3910; VI: ; %bb.0: 3911; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3912; VI-NEXT: v_mov_b32_e32 v0, s6 3913; VI-NEXT: v_mov_b32_e32 v1, s7 3914; VI-NEXT: v_mov_b32_e32 v2, s4 3915; VI-NEXT: v_mov_b32_e32 v3, s5 3916; VI-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc 3917; VI-NEXT: s_waitcnt vmcnt(0) 3918; VI-NEXT: buffer_wbinvl1_vol 3919; VI-NEXT: s_setpc_b64 s[30:31] 3920; 3921; GFX9-LABEL: global_atomic_xor_i64_ret_scalar: 3922; GFX9: ; %bb.0: 3923; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3924; GFX9-NEXT: v_mov_b32_e32 v0, s6 3925; GFX9-NEXT: v_mov_b32_e32 v1, s7 3926; GFX9-NEXT: v_mov_b32_e32 v2, 0 3927; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v2, v[0:1], s[4:5] glc 3928; GFX9-NEXT: s_waitcnt vmcnt(0) 3929; GFX9-NEXT: buffer_wbinvl1_vol 3930; GFX9-NEXT: s_setpc_b64 s[30:31] 3931 %result = atomicrmw xor ptr addrspace(1) %ptr, i64 %in seq_cst 3932 ret i64 %result 3933} 3934 3935define amdgpu_gfx i64 @global_atomic_xor_i64_ret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) { 3936; SI-LABEL: global_atomic_xor_i64_ret_offset_scalar: 3937; SI: ; %bb.0: 3938; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3939; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 3940; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill 3941; SI-NEXT: s_mov_b64 exec, s[34:35] 3942; SI-NEXT: s_waitcnt expcnt(0) 3943; SI-NEXT: v_writelane_b32 v2, s6, 0 3944; SI-NEXT: v_writelane_b32 v2, s7, 1 3945; SI-NEXT: v_mov_b32_e32 v0, s6 3946; SI-NEXT: v_mov_b32_e32 v1, s7 3947; SI-NEXT: s_mov_b32 s7, 0xf000 3948; SI-NEXT: s_mov_b32 s6, -1 3949; SI-NEXT: s_waitcnt vmcnt(0) 3950; SI-NEXT: buffer_atomic_xor_x2 v[0:1], off, s[4:7], 0 offset:32 glc 3951; SI-NEXT: s_waitcnt vmcnt(0) 3952; SI-NEXT: buffer_wbinvl1 3953; SI-NEXT: v_readlane_b32 s7, v2, 1 3954; SI-NEXT: v_readlane_b32 s6, v2, 0 3955; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 3956; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload 3957; SI-NEXT: s_mov_b64 exec, s[34:35] 3958; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 3959; SI-NEXT: s_setpc_b64 s[30:31] 3960; 3961; VI-LABEL: global_atomic_xor_i64_ret_offset_scalar: 3962; VI: ; %bb.0: 3963; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3964; VI-NEXT: s_add_u32 s34, s4, 32 3965; VI-NEXT: s_addc_u32 s35, s5, 0 3966; VI-NEXT: v_mov_b32_e32 v2, s34 3967; VI-NEXT: v_mov_b32_e32 v0, s6 3968; VI-NEXT: v_mov_b32_e32 v1, s7 3969; VI-NEXT: v_mov_b32_e32 v3, s35 3970; VI-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc 3971; VI-NEXT: s_waitcnt vmcnt(0) 3972; VI-NEXT: buffer_wbinvl1_vol 3973; VI-NEXT: s_setpc_b64 s[30:31] 3974; 3975; GFX9-LABEL: global_atomic_xor_i64_ret_offset_scalar: 3976; GFX9: ; %bb.0: 3977; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3978; GFX9-NEXT: v_mov_b32_e32 v0, s6 3979; GFX9-NEXT: v_mov_b32_e32 v1, s7 3980; GFX9-NEXT: v_mov_b32_e32 v2, 0 3981; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc 3982; GFX9-NEXT: s_waitcnt vmcnt(0) 3983; GFX9-NEXT: buffer_wbinvl1_vol 3984; GFX9-NEXT: s_setpc_b64 s[30:31] 3985 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 3986 %result = atomicrmw xor ptr addrspace(1) %gep, i64 %in seq_cst 3987 ret i64 %result 3988} 3989 3990define void @global_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) { 3991; SI-LABEL: global_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory: 3992; SI: ; %bb.0: 3993; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3994; SI-NEXT: s_mov_b32 s6, 0 3995; SI-NEXT: s_mov_b32 s7, 0xf000 3996; SI-NEXT: s_mov_b32 s4, s6 3997; SI-NEXT: s_mov_b32 s5, s6 3998; SI-NEXT: buffer_atomic_xor_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 3999; SI-NEXT: s_waitcnt vmcnt(0) 4000; SI-NEXT: buffer_wbinvl1 4001; SI-NEXT: s_waitcnt expcnt(0) 4002; SI-NEXT: s_setpc_b64 s[30:31] 4003; 4004; VI-LABEL: global_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory: 4005; VI: ; %bb.0: 4006; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4007; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 4008; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 4009; VI-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] 4010; VI-NEXT: s_waitcnt vmcnt(0) 4011; VI-NEXT: buffer_wbinvl1_vol 4012; VI-NEXT: s_setpc_b64 s[30:31] 4013; 4014; GFX9-LABEL: global_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory: 4015; GFX9: ; %bb.0: 4016; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4017; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v[2:3], off offset:32 4018; GFX9-NEXT: s_waitcnt vmcnt(0) 4019; GFX9-NEXT: buffer_wbinvl1_vol 4020; GFX9-NEXT: s_setpc_b64 s[30:31] 4021 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 4022 %tmp0 = atomicrmw xor ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 4023 ret void 4024} 4025 4026define i64 @global_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) { 4027; SI-LABEL: global_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory: 4028; SI: ; %bb.0: 4029; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4030; SI-NEXT: s_mov_b32 s6, 0 4031; SI-NEXT: s_mov_b32 s7, 0xf000 4032; SI-NEXT: s_mov_b32 s4, s6 4033; SI-NEXT: s_mov_b32 s5, s6 4034; SI-NEXT: buffer_atomic_xor_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc 4035; SI-NEXT: s_waitcnt vmcnt(0) 4036; SI-NEXT: buffer_wbinvl1 4037; SI-NEXT: v_mov_b32_e32 v0, v2 4038; SI-NEXT: v_mov_b32_e32 v1, v3 4039; SI-NEXT: s_waitcnt expcnt(0) 4040; SI-NEXT: s_setpc_b64 s[30:31] 4041; 4042; VI-LABEL: global_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory: 4043; VI: ; %bb.0: 4044; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4045; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 4046; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 4047; VI-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc 4048; VI-NEXT: s_waitcnt vmcnt(0) 4049; VI-NEXT: buffer_wbinvl1_vol 4050; VI-NEXT: s_setpc_b64 s[30:31] 4051; 4052; GFX9-LABEL: global_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory: 4053; GFX9: ; %bb.0: 4054; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4055; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc 4056; GFX9-NEXT: s_waitcnt vmcnt(0) 4057; GFX9-NEXT: buffer_wbinvl1_vol 4058; GFX9-NEXT: s_setpc_b64 s[30:31] 4059 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 4060 %result = atomicrmw xor ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 4061 ret i64 %result 4062} 4063 4064; --------------------------------------------------------------------- 4065; atomicrmw max 4066; --------------------------------------------------------------------- 4067 4068define void @global_atomic_max_i64_noret(ptr addrspace(1) %ptr, i64 %in) { 4069; SI-LABEL: global_atomic_max_i64_noret: 4070; SI: ; %bb.0: 4071; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4072; SI-NEXT: s_mov_b32 s6, 0 4073; SI-NEXT: s_mov_b32 s7, 0xf000 4074; SI-NEXT: s_mov_b32 s4, s6 4075; SI-NEXT: s_mov_b32 s5, s6 4076; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 4077; SI-NEXT: s_mov_b64 s[8:9], 0 4078; SI-NEXT: .LBB80_1: ; %atomicrmw.start 4079; SI-NEXT: ; =>This Inner Loop Header: Depth=1 4080; SI-NEXT: s_waitcnt vmcnt(0) 4081; SI-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] 4082; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc 4083; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 4084; SI-NEXT: s_waitcnt expcnt(0) 4085; SI-NEXT: v_mov_b32_e32 v11, v7 4086; SI-NEXT: v_mov_b32_e32 v10, v6 4087; SI-NEXT: v_mov_b32_e32 v9, v5 4088; SI-NEXT: v_mov_b32_e32 v8, v4 4089; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc 4090; SI-NEXT: s_waitcnt vmcnt(0) 4091; SI-NEXT: buffer_wbinvl1 4092; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] 4093; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 4094; SI-NEXT: v_mov_b32_e32 v6, v8 4095; SI-NEXT: v_mov_b32_e32 v7, v9 4096; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] 4097; SI-NEXT: s_cbranch_execnz .LBB80_1 4098; SI-NEXT: ; %bb.2: ; %atomicrmw.end 4099; SI-NEXT: s_or_b64 exec, exec, s[8:9] 4100; SI-NEXT: s_waitcnt expcnt(0) 4101; SI-NEXT: s_setpc_b64 s[30:31] 4102; 4103; VI-LABEL: global_atomic_max_i64_noret: 4104; VI: ; %bb.0: 4105; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4106; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] 4107; VI-NEXT: s_mov_b64 s[4:5], 0 4108; VI-NEXT: .LBB80_1: ; %atomicrmw.start 4109; VI-NEXT: ; =>This Inner Loop Header: Depth=1 4110; VI-NEXT: s_waitcnt vmcnt(0) 4111; VI-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] 4112; VI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc 4113; VI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 4114; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc 4115; VI-NEXT: s_waitcnt vmcnt(0) 4116; VI-NEXT: buffer_wbinvl1_vol 4117; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 4118; VI-NEXT: v_mov_b32_e32 v7, v5 4119; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 4120; VI-NEXT: v_mov_b32_e32 v6, v4 4121; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] 4122; VI-NEXT: s_cbranch_execnz .LBB80_1 4123; VI-NEXT: ; %bb.2: ; %atomicrmw.end 4124; VI-NEXT: s_or_b64 exec, exec, s[4:5] 4125; VI-NEXT: s_setpc_b64 s[30:31] 4126; 4127; GFX9-LABEL: global_atomic_max_i64_noret: 4128; GFX9: ; %bb.0: 4129; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4130; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off 4131; GFX9-NEXT: s_mov_b64 s[4:5], 0 4132; GFX9-NEXT: .LBB80_1: ; %atomicrmw.start 4133; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 4134; GFX9-NEXT: s_waitcnt vmcnt(0) 4135; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] 4136; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc 4137; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 4138; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc 4139; GFX9-NEXT: s_waitcnt vmcnt(0) 4140; GFX9-NEXT: buffer_wbinvl1_vol 4141; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 4142; GFX9-NEXT: v_mov_b32_e32 v7, v5 4143; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 4144; GFX9-NEXT: v_mov_b32_e32 v6, v4 4145; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 4146; GFX9-NEXT: s_cbranch_execnz .LBB80_1 4147; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 4148; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 4149; GFX9-NEXT: s_setpc_b64 s[30:31] 4150 %tmp0 = atomicrmw max ptr addrspace(1) %ptr, i64 %in seq_cst 4151 ret void 4152} 4153 4154define void @global_atomic_max_i64_noret_offset(ptr addrspace(1) %out, i64 %in) { 4155; SI-LABEL: global_atomic_max_i64_noret_offset: 4156; SI: ; %bb.0: 4157; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4158; SI-NEXT: s_mov_b32 s6, 0 4159; SI-NEXT: s_mov_b32 s7, 0xf000 4160; SI-NEXT: s_mov_b32 s4, s6 4161; SI-NEXT: s_mov_b32 s5, s6 4162; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 4163; SI-NEXT: s_mov_b64 s[8:9], 0 4164; SI-NEXT: .LBB81_1: ; %atomicrmw.start 4165; SI-NEXT: ; =>This Inner Loop Header: Depth=1 4166; SI-NEXT: s_waitcnt vmcnt(0) 4167; SI-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] 4168; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc 4169; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 4170; SI-NEXT: s_waitcnt expcnt(0) 4171; SI-NEXT: v_mov_b32_e32 v11, v7 4172; SI-NEXT: v_mov_b32_e32 v10, v6 4173; SI-NEXT: v_mov_b32_e32 v9, v5 4174; SI-NEXT: v_mov_b32_e32 v8, v4 4175; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc 4176; SI-NEXT: s_waitcnt vmcnt(0) 4177; SI-NEXT: buffer_wbinvl1 4178; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] 4179; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 4180; SI-NEXT: v_mov_b32_e32 v6, v8 4181; SI-NEXT: v_mov_b32_e32 v7, v9 4182; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] 4183; SI-NEXT: s_cbranch_execnz .LBB81_1 4184; SI-NEXT: ; %bb.2: ; %atomicrmw.end 4185; SI-NEXT: s_or_b64 exec, exec, s[8:9] 4186; SI-NEXT: s_waitcnt expcnt(0) 4187; SI-NEXT: s_setpc_b64 s[30:31] 4188; 4189; VI-LABEL: global_atomic_max_i64_noret_offset: 4190; VI: ; %bb.0: 4191; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4192; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 4193; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 4194; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] 4195; VI-NEXT: s_mov_b64 s[4:5], 0 4196; VI-NEXT: .LBB81_1: ; %atomicrmw.start 4197; VI-NEXT: ; =>This Inner Loop Header: Depth=1 4198; VI-NEXT: s_waitcnt vmcnt(0) 4199; VI-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] 4200; VI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc 4201; VI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 4202; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc 4203; VI-NEXT: s_waitcnt vmcnt(0) 4204; VI-NEXT: buffer_wbinvl1_vol 4205; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 4206; VI-NEXT: v_mov_b32_e32 v7, v5 4207; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 4208; VI-NEXT: v_mov_b32_e32 v6, v4 4209; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] 4210; VI-NEXT: s_cbranch_execnz .LBB81_1 4211; VI-NEXT: ; %bb.2: ; %atomicrmw.end 4212; VI-NEXT: s_or_b64 exec, exec, s[4:5] 4213; VI-NEXT: s_setpc_b64 s[30:31] 4214; 4215; GFX9-LABEL: global_atomic_max_i64_noret_offset: 4216; GFX9: ; %bb.0: 4217; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4218; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 4219; GFX9-NEXT: s_mov_b64 s[4:5], 0 4220; GFX9-NEXT: .LBB81_1: ; %atomicrmw.start 4221; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 4222; GFX9-NEXT: s_waitcnt vmcnt(0) 4223; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] 4224; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc 4225; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 4226; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc 4227; GFX9-NEXT: s_waitcnt vmcnt(0) 4228; GFX9-NEXT: buffer_wbinvl1_vol 4229; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 4230; GFX9-NEXT: v_mov_b32_e32 v7, v5 4231; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 4232; GFX9-NEXT: v_mov_b32_e32 v6, v4 4233; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 4234; GFX9-NEXT: s_cbranch_execnz .LBB81_1 4235; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 4236; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 4237; GFX9-NEXT: s_setpc_b64 s[30:31] 4238 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 4239 %tmp0 = atomicrmw max ptr addrspace(1) %gep, i64 %in seq_cst 4240 ret void 4241} 4242 4243define i64 @global_atomic_max_i64_ret(ptr addrspace(1) %ptr, i64 %in) { 4244; SI-LABEL: global_atomic_max_i64_ret: 4245; SI: ; %bb.0: 4246; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4247; SI-NEXT: v_mov_b32_e32 v5, v3 4248; SI-NEXT: v_mov_b32_e32 v4, v2 4249; SI-NEXT: v_mov_b32_e32 v7, v1 4250; SI-NEXT: v_mov_b32_e32 v6, v0 4251; SI-NEXT: s_mov_b32 s6, 0 4252; SI-NEXT: s_mov_b32 s7, 0xf000 4253; SI-NEXT: s_mov_b32 s4, s6 4254; SI-NEXT: s_mov_b32 s5, s6 4255; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 4256; SI-NEXT: s_mov_b64 s[8:9], 0 4257; SI-NEXT: .LBB82_1: ; %atomicrmw.start 4258; SI-NEXT: ; =>This Inner Loop Header: Depth=1 4259; SI-NEXT: s_waitcnt vmcnt(0) 4260; SI-NEXT: v_mov_b32_e32 v11, v1 4261; SI-NEXT: v_mov_b32_e32 v10, v0 4262; SI-NEXT: v_cmp_gt_i64_e32 vcc, v[10:11], v[4:5] 4263; SI-NEXT: v_cndmask_b32_e32 v9, v5, v11, vcc 4264; SI-NEXT: v_cndmask_b32_e32 v8, v4, v10, vcc 4265; SI-NEXT: s_waitcnt expcnt(0) 4266; SI-NEXT: v_mov_b32_e32 v0, v8 4267; SI-NEXT: v_mov_b32_e32 v1, v9 4268; SI-NEXT: v_mov_b32_e32 v2, v10 4269; SI-NEXT: v_mov_b32_e32 v3, v11 4270; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc 4271; SI-NEXT: s_waitcnt vmcnt(0) 4272; SI-NEXT: buffer_wbinvl1 4273; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] 4274; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 4275; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] 4276; SI-NEXT: s_cbranch_execnz .LBB82_1 4277; SI-NEXT: ; %bb.2: ; %atomicrmw.end 4278; SI-NEXT: s_or_b64 exec, exec, s[8:9] 4279; SI-NEXT: s_waitcnt expcnt(0) 4280; SI-NEXT: s_setpc_b64 s[30:31] 4281; 4282; VI-LABEL: global_atomic_max_i64_ret: 4283; VI: ; %bb.0: 4284; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4285; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] 4286; VI-NEXT: s_mov_b64 s[4:5], 0 4287; VI-NEXT: .LBB82_1: ; %atomicrmw.start 4288; VI-NEXT: ; =>This Inner Loop Header: Depth=1 4289; VI-NEXT: s_waitcnt vmcnt(0) 4290; VI-NEXT: v_mov_b32_e32 v7, v5 4291; VI-NEXT: v_mov_b32_e32 v6, v4 4292; VI-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] 4293; VI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc 4294; VI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 4295; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc 4296; VI-NEXT: s_waitcnt vmcnt(0) 4297; VI-NEXT: buffer_wbinvl1_vol 4298; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 4299; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 4300; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] 4301; VI-NEXT: s_cbranch_execnz .LBB82_1 4302; VI-NEXT: ; %bb.2: ; %atomicrmw.end 4303; VI-NEXT: s_or_b64 exec, exec, s[4:5] 4304; VI-NEXT: v_mov_b32_e32 v0, v4 4305; VI-NEXT: v_mov_b32_e32 v1, v5 4306; VI-NEXT: s_setpc_b64 s[30:31] 4307; 4308; GFX9-LABEL: global_atomic_max_i64_ret: 4309; GFX9: ; %bb.0: 4310; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4311; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off 4312; GFX9-NEXT: s_mov_b64 s[4:5], 0 4313; GFX9-NEXT: .LBB82_1: ; %atomicrmw.start 4314; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 4315; GFX9-NEXT: s_waitcnt vmcnt(0) 4316; GFX9-NEXT: v_mov_b32_e32 v7, v5 4317; GFX9-NEXT: v_mov_b32_e32 v6, v4 4318; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] 4319; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc 4320; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 4321; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc 4322; GFX9-NEXT: s_waitcnt vmcnt(0) 4323; GFX9-NEXT: buffer_wbinvl1_vol 4324; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 4325; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 4326; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 4327; GFX9-NEXT: s_cbranch_execnz .LBB82_1 4328; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 4329; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 4330; GFX9-NEXT: v_mov_b32_e32 v0, v4 4331; GFX9-NEXT: v_mov_b32_e32 v1, v5 4332; GFX9-NEXT: s_setpc_b64 s[30:31] 4333 %result = atomicrmw max ptr addrspace(1) %ptr, i64 %in seq_cst 4334 ret i64 %result 4335} 4336 4337define i64 @global_atomic_max_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { 4338; SI-LABEL: global_atomic_max_i64_ret_offset: 4339; SI: ; %bb.0: 4340; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4341; SI-NEXT: v_mov_b32_e32 v5, v3 4342; SI-NEXT: v_mov_b32_e32 v4, v2 4343; SI-NEXT: v_mov_b32_e32 v7, v1 4344; SI-NEXT: v_mov_b32_e32 v6, v0 4345; SI-NEXT: s_mov_b32 s6, 0 4346; SI-NEXT: s_mov_b32 s7, 0xf000 4347; SI-NEXT: s_mov_b32 s4, s6 4348; SI-NEXT: s_mov_b32 s5, s6 4349; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32 4350; SI-NEXT: s_mov_b64 s[8:9], 0 4351; SI-NEXT: .LBB83_1: ; %atomicrmw.start 4352; SI-NEXT: ; =>This Inner Loop Header: Depth=1 4353; SI-NEXT: s_waitcnt vmcnt(0) 4354; SI-NEXT: v_mov_b32_e32 v11, v1 4355; SI-NEXT: v_mov_b32_e32 v10, v0 4356; SI-NEXT: v_cmp_gt_i64_e32 vcc, v[10:11], v[4:5] 4357; SI-NEXT: v_cndmask_b32_e32 v9, v5, v11, vcc 4358; SI-NEXT: v_cndmask_b32_e32 v8, v4, v10, vcc 4359; SI-NEXT: s_waitcnt expcnt(0) 4360; SI-NEXT: v_mov_b32_e32 v0, v8 4361; SI-NEXT: v_mov_b32_e32 v1, v9 4362; SI-NEXT: v_mov_b32_e32 v2, v10 4363; SI-NEXT: v_mov_b32_e32 v3, v11 4364; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:32 glc 4365; SI-NEXT: s_waitcnt vmcnt(0) 4366; SI-NEXT: buffer_wbinvl1 4367; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] 4368; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 4369; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] 4370; SI-NEXT: s_cbranch_execnz .LBB83_1 4371; SI-NEXT: ; %bb.2: ; %atomicrmw.end 4372; SI-NEXT: s_or_b64 exec, exec, s[8:9] 4373; SI-NEXT: s_waitcnt expcnt(0) 4374; SI-NEXT: s_setpc_b64 s[30:31] 4375; 4376; VI-LABEL: global_atomic_max_i64_ret_offset: 4377; VI: ; %bb.0: 4378; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4379; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0 4380; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc 4381; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] 4382; VI-NEXT: s_mov_b64 s[4:5], 0 4383; VI-NEXT: .LBB83_1: ; %atomicrmw.start 4384; VI-NEXT: ; =>This Inner Loop Header: Depth=1 4385; VI-NEXT: s_waitcnt vmcnt(0) 4386; VI-NEXT: v_mov_b32_e32 v9, v1 4387; VI-NEXT: v_mov_b32_e32 v8, v0 4388; VI-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3] 4389; VI-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc 4390; VI-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc 4391; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc 4392; VI-NEXT: s_waitcnt vmcnt(0) 4393; VI-NEXT: buffer_wbinvl1_vol 4394; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] 4395; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 4396; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] 4397; VI-NEXT: s_cbranch_execnz .LBB83_1 4398; VI-NEXT: ; %bb.2: ; %atomicrmw.end 4399; VI-NEXT: s_or_b64 exec, exec, s[4:5] 4400; VI-NEXT: s_setpc_b64 s[30:31] 4401; 4402; GFX9-LABEL: global_atomic_max_i64_ret_offset: 4403; GFX9: ; %bb.0: 4404; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4405; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 4406; GFX9-NEXT: s_mov_b64 s[4:5], 0 4407; GFX9-NEXT: .LBB83_1: ; %atomicrmw.start 4408; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 4409; GFX9-NEXT: s_waitcnt vmcnt(0) 4410; GFX9-NEXT: v_mov_b32_e32 v7, v5 4411; GFX9-NEXT: v_mov_b32_e32 v6, v4 4412; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] 4413; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc 4414; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 4415; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc 4416; GFX9-NEXT: s_waitcnt vmcnt(0) 4417; GFX9-NEXT: buffer_wbinvl1_vol 4418; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 4419; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 4420; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 4421; GFX9-NEXT: s_cbranch_execnz .LBB83_1 4422; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 4423; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 4424; GFX9-NEXT: v_mov_b32_e32 v0, v4 4425; GFX9-NEXT: v_mov_b32_e32 v1, v5 4426; GFX9-NEXT: s_setpc_b64 s[30:31] 4427 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 4428 %result = atomicrmw max ptr addrspace(1) %gep, i64 %in seq_cst 4429 ret i64 %result 4430} 4431 4432define amdgpu_gfx void @global_atomic_max_i64_noret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) { 4433; SI-LABEL: global_atomic_max_i64_noret_scalar: 4434; SI: ; %bb.0: 4435; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4436; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 4437; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill 4438; SI-NEXT: s_mov_b64 exec, s[34:35] 4439; SI-NEXT: s_waitcnt expcnt(0) 4440; SI-NEXT: v_writelane_b32 v10, s6, 0 4441; SI-NEXT: v_writelane_b32 v10, s7, 1 4442; SI-NEXT: s_mov_b32 s35, s7 4443; SI-NEXT: s_mov_b32 s34, s6 4444; SI-NEXT: s_mov_b32 s7, 0xf000 4445; SI-NEXT: s_mov_b32 s6, -1 4446; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 4447; SI-NEXT: s_mov_b64 s[36:37], 0 4448; SI-NEXT: v_mov_b32_e32 v4, s35 4449; SI-NEXT: v_mov_b32_e32 v5, s34 4450; SI-NEXT: .LBB84_1: ; %atomicrmw.start 4451; SI-NEXT: ; =>This Inner Loop Header: Depth=1 4452; SI-NEXT: s_waitcnt vmcnt(0) 4453; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[2:3] 4454; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc 4455; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc 4456; SI-NEXT: s_waitcnt expcnt(0) 4457; SI-NEXT: v_mov_b32_e32 v9, v3 4458; SI-NEXT: v_mov_b32_e32 v8, v2 4459; SI-NEXT: v_mov_b32_e32 v7, v1 4460; SI-NEXT: v_mov_b32_e32 v6, v0 4461; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc 4462; SI-NEXT: s_waitcnt vmcnt(0) 4463; SI-NEXT: buffer_wbinvl1 4464; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] 4465; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] 4466; SI-NEXT: v_mov_b32_e32 v2, v6 4467; SI-NEXT: v_mov_b32_e32 v3, v7 4468; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] 4469; SI-NEXT: s_cbranch_execnz .LBB84_1 4470; SI-NEXT: ; %bb.2: ; %atomicrmw.end 4471; SI-NEXT: s_or_b64 exec, exec, s[36:37] 4472; SI-NEXT: v_readlane_b32 s7, v10, 1 4473; SI-NEXT: v_readlane_b32 s6, v10, 0 4474; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 4475; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload 4476; SI-NEXT: s_mov_b64 exec, s[34:35] 4477; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 4478; SI-NEXT: s_setpc_b64 s[30:31] 4479; 4480; VI-LABEL: global_atomic_max_i64_noret_scalar: 4481; VI: ; %bb.0: 4482; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4483; VI-NEXT: v_mov_b32_e32 v0, s4 4484; VI-NEXT: v_mov_b32_e32 v1, s5 4485; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] 4486; VI-NEXT: v_mov_b32_e32 v4, s4 4487; VI-NEXT: s_mov_b64 s[34:35], 0 4488; VI-NEXT: v_mov_b32_e32 v6, s7 4489; VI-NEXT: v_mov_b32_e32 v7, s6 4490; VI-NEXT: v_mov_b32_e32 v5, s5 4491; VI-NEXT: .LBB84_1: ; %atomicrmw.start 4492; VI-NEXT: ; =>This Inner Loop Header: Depth=1 4493; VI-NEXT: s_waitcnt vmcnt(0) 4494; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] 4495; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc 4496; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc 4497; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc 4498; VI-NEXT: s_waitcnt vmcnt(0) 4499; VI-NEXT: buffer_wbinvl1_vol 4500; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] 4501; VI-NEXT: v_mov_b32_e32 v3, v1 4502; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 4503; VI-NEXT: v_mov_b32_e32 v2, v0 4504; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] 4505; VI-NEXT: s_cbranch_execnz .LBB84_1 4506; VI-NEXT: ; %bb.2: ; %atomicrmw.end 4507; VI-NEXT: s_or_b64 exec, exec, s[34:35] 4508; VI-NEXT: s_setpc_b64 s[30:31] 4509; 4510; GFX9-LABEL: global_atomic_max_i64_noret_scalar: 4511; GFX9: ; %bb.0: 4512; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4513; GFX9-NEXT: v_mov_b32_e32 v4, 0 4514; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] 4515; GFX9-NEXT: s_mov_b64 s[34:35], 0 4516; GFX9-NEXT: v_mov_b32_e32 v5, s7 4517; GFX9-NEXT: v_mov_b32_e32 v6, s6 4518; GFX9-NEXT: .LBB84_1: ; %atomicrmw.start 4519; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 4520; GFX9-NEXT: s_waitcnt vmcnt(0) 4521; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] 4522; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc 4523; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc 4524; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc 4525; GFX9-NEXT: s_waitcnt vmcnt(0) 4526; GFX9-NEXT: buffer_wbinvl1_vol 4527; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] 4528; GFX9-NEXT: v_mov_b32_e32 v3, v1 4529; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 4530; GFX9-NEXT: v_mov_b32_e32 v2, v0 4531; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] 4532; GFX9-NEXT: s_cbranch_execnz .LBB84_1 4533; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 4534; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] 4535; GFX9-NEXT: s_setpc_b64 s[30:31] 4536 %tmp0 = atomicrmw max ptr addrspace(1) %ptr, i64 %in seq_cst 4537 ret void 4538} 4539 4540define amdgpu_gfx void @global_atomic_max_i64_noret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) { 4541; SI-LABEL: global_atomic_max_i64_noret_offset_scalar: 4542; SI: ; %bb.0: 4543; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4544; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 4545; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill 4546; SI-NEXT: s_mov_b64 exec, s[34:35] 4547; SI-NEXT: s_waitcnt expcnt(0) 4548; SI-NEXT: v_writelane_b32 v10, s6, 0 4549; SI-NEXT: v_writelane_b32 v10, s7, 1 4550; SI-NEXT: s_mov_b32 s35, s7 4551; SI-NEXT: s_mov_b32 s34, s6 4552; SI-NEXT: s_mov_b32 s7, 0xf000 4553; SI-NEXT: s_mov_b32 s6, -1 4554; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32 4555; SI-NEXT: s_mov_b64 s[36:37], 0 4556; SI-NEXT: v_mov_b32_e32 v4, s35 4557; SI-NEXT: v_mov_b32_e32 v5, s34 4558; SI-NEXT: .LBB85_1: ; %atomicrmw.start 4559; SI-NEXT: ; =>This Inner Loop Header: Depth=1 4560; SI-NEXT: s_waitcnt vmcnt(0) 4561; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[2:3] 4562; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc 4563; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc 4564; SI-NEXT: s_waitcnt expcnt(0) 4565; SI-NEXT: v_mov_b32_e32 v9, v3 4566; SI-NEXT: v_mov_b32_e32 v8, v2 4567; SI-NEXT: v_mov_b32_e32 v7, v1 4568; SI-NEXT: v_mov_b32_e32 v6, v0 4569; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc 4570; SI-NEXT: s_waitcnt vmcnt(0) 4571; SI-NEXT: buffer_wbinvl1 4572; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] 4573; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] 4574; SI-NEXT: v_mov_b32_e32 v2, v6 4575; SI-NEXT: v_mov_b32_e32 v3, v7 4576; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] 4577; SI-NEXT: s_cbranch_execnz .LBB85_1 4578; SI-NEXT: ; %bb.2: ; %atomicrmw.end 4579; SI-NEXT: s_or_b64 exec, exec, s[36:37] 4580; SI-NEXT: v_readlane_b32 s7, v10, 1 4581; SI-NEXT: v_readlane_b32 s6, v10, 0 4582; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 4583; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload 4584; SI-NEXT: s_mov_b64 exec, s[34:35] 4585; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 4586; SI-NEXT: s_setpc_b64 s[30:31] 4587; 4588; VI-LABEL: global_atomic_max_i64_noret_offset_scalar: 4589; VI: ; %bb.0: 4590; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4591; VI-NEXT: s_add_u32 s34, s4, 32 4592; VI-NEXT: s_addc_u32 s35, s5, 0 4593; VI-NEXT: v_mov_b32_e32 v4, s34 4594; VI-NEXT: v_mov_b32_e32 v5, s35 4595; VI-NEXT: flat_load_dwordx2 v[2:3], v[4:5] 4596; VI-NEXT: s_mov_b64 s[34:35], 0 4597; VI-NEXT: v_mov_b32_e32 v6, s7 4598; VI-NEXT: v_mov_b32_e32 v7, s6 4599; VI-NEXT: .LBB85_1: ; %atomicrmw.start 4600; VI-NEXT: ; =>This Inner Loop Header: Depth=1 4601; VI-NEXT: s_waitcnt vmcnt(0) 4602; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] 4603; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc 4604; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc 4605; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc 4606; VI-NEXT: s_waitcnt vmcnt(0) 4607; VI-NEXT: buffer_wbinvl1_vol 4608; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] 4609; VI-NEXT: v_mov_b32_e32 v3, v1 4610; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 4611; VI-NEXT: v_mov_b32_e32 v2, v0 4612; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] 4613; VI-NEXT: s_cbranch_execnz .LBB85_1 4614; VI-NEXT: ; %bb.2: ; %atomicrmw.end 4615; VI-NEXT: s_or_b64 exec, exec, s[34:35] 4616; VI-NEXT: s_setpc_b64 s[30:31] 4617; 4618; GFX9-LABEL: global_atomic_max_i64_noret_offset_scalar: 4619; GFX9: ; %bb.0: 4620; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4621; GFX9-NEXT: v_mov_b32_e32 v4, 0 4622; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32 4623; GFX9-NEXT: s_mov_b64 s[34:35], 0 4624; GFX9-NEXT: v_mov_b32_e32 v5, s7 4625; GFX9-NEXT: v_mov_b32_e32 v6, s6 4626; GFX9-NEXT: .LBB85_1: ; %atomicrmw.start 4627; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 4628; GFX9-NEXT: s_waitcnt vmcnt(0) 4629; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] 4630; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc 4631; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc 4632; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc 4633; GFX9-NEXT: s_waitcnt vmcnt(0) 4634; GFX9-NEXT: buffer_wbinvl1_vol 4635; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] 4636; GFX9-NEXT: v_mov_b32_e32 v3, v1 4637; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 4638; GFX9-NEXT: v_mov_b32_e32 v2, v0 4639; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] 4640; GFX9-NEXT: s_cbranch_execnz .LBB85_1 4641; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 4642; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] 4643; GFX9-NEXT: s_setpc_b64 s[30:31] 4644 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 4645 %tmp0 = atomicrmw max ptr addrspace(1) %gep, i64 %in seq_cst 4646 ret void 4647} 4648 4649define amdgpu_gfx i64 @global_atomic_max_i64_ret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) { 4650; SI-LABEL: global_atomic_max_i64_ret_scalar: 4651; SI: ; %bb.0: 4652; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4653; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 4654; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill 4655; SI-NEXT: s_mov_b64 exec, s[34:35] 4656; SI-NEXT: s_waitcnt expcnt(0) 4657; SI-NEXT: v_writelane_b32 v10, s6, 0 4658; SI-NEXT: v_writelane_b32 v10, s7, 1 4659; SI-NEXT: s_mov_b32 s35, s7 4660; SI-NEXT: s_mov_b32 s34, s6 4661; SI-NEXT: s_mov_b32 s7, 0xf000 4662; SI-NEXT: s_mov_b32 s6, -1 4663; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 4664; SI-NEXT: s_mov_b64 s[36:37], 0 4665; SI-NEXT: v_mov_b32_e32 v4, s35 4666; SI-NEXT: v_mov_b32_e32 v5, s34 4667; SI-NEXT: .LBB86_1: ; %atomicrmw.start 4668; SI-NEXT: ; =>This Inner Loop Header: Depth=1 4669; SI-NEXT: s_waitcnt vmcnt(0) 4670; SI-NEXT: v_mov_b32_e32 v9, v1 4671; SI-NEXT: v_mov_b32_e32 v8, v0 4672; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[8:9] 4673; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc 4674; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc 4675; SI-NEXT: s_waitcnt expcnt(0) 4676; SI-NEXT: v_mov_b32_e32 v0, v6 4677; SI-NEXT: v_mov_b32_e32 v1, v7 4678; SI-NEXT: v_mov_b32_e32 v2, v8 4679; SI-NEXT: v_mov_b32_e32 v3, v9 4680; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc 4681; SI-NEXT: s_waitcnt vmcnt(0) 4682; SI-NEXT: buffer_wbinvl1 4683; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] 4684; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] 4685; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] 4686; SI-NEXT: s_cbranch_execnz .LBB86_1 4687; SI-NEXT: ; %bb.2: ; %atomicrmw.end 4688; SI-NEXT: s_or_b64 exec, exec, s[36:37] 4689; SI-NEXT: v_readlane_b32 s7, v10, 1 4690; SI-NEXT: v_readlane_b32 s6, v10, 0 4691; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 4692; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload 4693; SI-NEXT: s_mov_b64 exec, s[34:35] 4694; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 4695; SI-NEXT: s_setpc_b64 s[30:31] 4696; 4697; VI-LABEL: global_atomic_max_i64_ret_scalar: 4698; VI: ; %bb.0: 4699; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4700; VI-NEXT: v_mov_b32_e32 v0, s4 4701; VI-NEXT: v_mov_b32_e32 v1, s5 4702; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 4703; VI-NEXT: v_mov_b32_e32 v2, s4 4704; VI-NEXT: s_mov_b64 s[34:35], 0 4705; VI-NEXT: v_mov_b32_e32 v4, s7 4706; VI-NEXT: v_mov_b32_e32 v5, s6 4707; VI-NEXT: v_mov_b32_e32 v3, s5 4708; VI-NEXT: .LBB86_1: ; %atomicrmw.start 4709; VI-NEXT: ; =>This Inner Loop Header: Depth=1 4710; VI-NEXT: s_waitcnt vmcnt(0) 4711; VI-NEXT: v_mov_b32_e32 v9, v1 4712; VI-NEXT: v_mov_b32_e32 v8, v0 4713; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] 4714; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc 4715; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc 4716; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc 4717; VI-NEXT: s_waitcnt vmcnt(0) 4718; VI-NEXT: buffer_wbinvl1_vol 4719; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] 4720; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 4721; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] 4722; VI-NEXT: s_cbranch_execnz .LBB86_1 4723; VI-NEXT: ; %bb.2: ; %atomicrmw.end 4724; VI-NEXT: s_or_b64 exec, exec, s[34:35] 4725; VI-NEXT: s_setpc_b64 s[30:31] 4726; 4727; GFX9-LABEL: global_atomic_max_i64_ret_scalar: 4728; GFX9: ; %bb.0: 4729; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4730; GFX9-NEXT: v_mov_b32_e32 v2, 0 4731; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] 4732; GFX9-NEXT: s_mov_b64 s[34:35], 0 4733; GFX9-NEXT: v_mov_b32_e32 v3, s7 4734; GFX9-NEXT: v_mov_b32_e32 v4, s6 4735; GFX9-NEXT: .LBB86_1: ; %atomicrmw.start 4736; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 4737; GFX9-NEXT: s_waitcnt vmcnt(0) 4738; GFX9-NEXT: v_mov_b32_e32 v8, v1 4739; GFX9-NEXT: v_mov_b32_e32 v7, v0 4740; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[7:8] 4741; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v8, vcc 4742; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v7, vcc 4743; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[5:8], s[4:5] glc 4744; GFX9-NEXT: s_waitcnt vmcnt(0) 4745; GFX9-NEXT: buffer_wbinvl1_vol 4746; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] 4747; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 4748; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] 4749; GFX9-NEXT: s_cbranch_execnz .LBB86_1 4750; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 4751; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] 4752; GFX9-NEXT: s_setpc_b64 s[30:31] 4753 %result = atomicrmw max ptr addrspace(1) %ptr, i64 %in seq_cst 4754 ret i64 %result 4755} 4756 4757define amdgpu_gfx i64 @global_atomic_max_i64_ret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) { 4758; SI-LABEL: global_atomic_max_i64_ret_offset_scalar: 4759; SI: ; %bb.0: 4760; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4761; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 4762; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill 4763; SI-NEXT: s_mov_b64 exec, s[34:35] 4764; SI-NEXT: s_waitcnt expcnt(0) 4765; SI-NEXT: v_writelane_b32 v10, s6, 0 4766; SI-NEXT: v_writelane_b32 v10, s7, 1 4767; SI-NEXT: s_mov_b32 s35, s7 4768; SI-NEXT: s_mov_b32 s34, s6 4769; SI-NEXT: s_mov_b32 s7, 0xf000 4770; SI-NEXT: s_mov_b32 s6, -1 4771; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 4772; SI-NEXT: s_mov_b64 s[36:37], 0 4773; SI-NEXT: v_mov_b32_e32 v4, s35 4774; SI-NEXT: v_mov_b32_e32 v5, s34 4775; SI-NEXT: .LBB87_1: ; %atomicrmw.start 4776; SI-NEXT: ; =>This Inner Loop Header: Depth=1 4777; SI-NEXT: s_waitcnt vmcnt(0) 4778; SI-NEXT: v_mov_b32_e32 v9, v1 4779; SI-NEXT: v_mov_b32_e32 v8, v0 4780; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[8:9] 4781; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc 4782; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc 4783; SI-NEXT: s_waitcnt expcnt(0) 4784; SI-NEXT: v_mov_b32_e32 v0, v6 4785; SI-NEXT: v_mov_b32_e32 v1, v7 4786; SI-NEXT: v_mov_b32_e32 v2, v8 4787; SI-NEXT: v_mov_b32_e32 v3, v9 4788; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc 4789; SI-NEXT: s_waitcnt vmcnt(0) 4790; SI-NEXT: buffer_wbinvl1 4791; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] 4792; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] 4793; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] 4794; SI-NEXT: s_cbranch_execnz .LBB87_1 4795; SI-NEXT: ; %bb.2: ; %atomicrmw.end 4796; SI-NEXT: s_or_b64 exec, exec, s[36:37] 4797; SI-NEXT: v_readlane_b32 s7, v10, 1 4798; SI-NEXT: v_readlane_b32 s6, v10, 0 4799; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 4800; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload 4801; SI-NEXT: s_mov_b64 exec, s[34:35] 4802; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 4803; SI-NEXT: s_setpc_b64 s[30:31] 4804; 4805; VI-LABEL: global_atomic_max_i64_ret_offset_scalar: 4806; VI: ; %bb.0: 4807; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4808; VI-NEXT: s_add_u32 s34, s4, 32 4809; VI-NEXT: s_addc_u32 s35, s5, 0 4810; VI-NEXT: v_mov_b32_e32 v2, s34 4811; VI-NEXT: v_mov_b32_e32 v3, s35 4812; VI-NEXT: flat_load_dwordx2 v[0:1], v[2:3] 4813; VI-NEXT: s_mov_b64 s[34:35], 0 4814; VI-NEXT: v_mov_b32_e32 v4, s7 4815; VI-NEXT: v_mov_b32_e32 v5, s6 4816; VI-NEXT: .LBB87_1: ; %atomicrmw.start 4817; VI-NEXT: ; =>This Inner Loop Header: Depth=1 4818; VI-NEXT: s_waitcnt vmcnt(0) 4819; VI-NEXT: v_mov_b32_e32 v9, v1 4820; VI-NEXT: v_mov_b32_e32 v8, v0 4821; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] 4822; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc 4823; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc 4824; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc 4825; VI-NEXT: s_waitcnt vmcnt(0) 4826; VI-NEXT: buffer_wbinvl1_vol 4827; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] 4828; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 4829; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] 4830; VI-NEXT: s_cbranch_execnz .LBB87_1 4831; VI-NEXT: ; %bb.2: ; %atomicrmw.end 4832; VI-NEXT: s_or_b64 exec, exec, s[34:35] 4833; VI-NEXT: s_setpc_b64 s[30:31] 4834; 4835; GFX9-LABEL: global_atomic_max_i64_ret_offset_scalar: 4836; GFX9: ; %bb.0: 4837; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4838; GFX9-NEXT: v_mov_b32_e32 v2, 0 4839; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32 4840; GFX9-NEXT: s_mov_b64 s[34:35], 0 4841; GFX9-NEXT: v_mov_b32_e32 v3, s7 4842; GFX9-NEXT: v_mov_b32_e32 v4, s6 4843; GFX9-NEXT: .LBB87_1: ; %atomicrmw.start 4844; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 4845; GFX9-NEXT: s_waitcnt vmcnt(0) 4846; GFX9-NEXT: v_mov_b32_e32 v8, v1 4847; GFX9-NEXT: v_mov_b32_e32 v7, v0 4848; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[7:8] 4849; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v8, vcc 4850; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v7, vcc 4851; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[5:8], s[4:5] offset:32 glc 4852; GFX9-NEXT: s_waitcnt vmcnt(0) 4853; GFX9-NEXT: buffer_wbinvl1_vol 4854; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] 4855; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 4856; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] 4857; GFX9-NEXT: s_cbranch_execnz .LBB87_1 4858; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 4859; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] 4860; GFX9-NEXT: s_setpc_b64 s[30:31] 4861 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 4862 %result = atomicrmw max ptr addrspace(1) %gep, i64 %in seq_cst 4863 ret i64 %result 4864} 4865 4866define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { 4867; SI-LABEL: atomic_max_i64_addr64_offset: 4868; SI: ; %bb.0: ; %entry 4869; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd 4870; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 4871; SI-NEXT: s_waitcnt lgkmcnt(0) 4872; SI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 4873; SI-NEXT: s_add_u32 s4, s0, s4 4874; SI-NEXT: s_addc_u32 s5, s1, s5 4875; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 4876; SI-NEXT: s_mov_b64 s[0:1], 0 4877; SI-NEXT: s_mov_b32 s7, 0xf000 4878; SI-NEXT: v_mov_b32_e32 v4, s3 4879; SI-NEXT: v_mov_b32_e32 v5, s2 4880; SI-NEXT: s_waitcnt lgkmcnt(0) 4881; SI-NEXT: v_mov_b32_e32 v2, s8 4882; SI-NEXT: v_mov_b32_e32 v3, s9 4883; SI-NEXT: s_mov_b32 s6, -1 4884; SI-NEXT: .LBB88_1: ; %atomicrmw.start 4885; SI-NEXT: ; =>This Inner Loop Header: Depth=1 4886; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] 4887; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc 4888; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc 4889; SI-NEXT: s_waitcnt expcnt(0) 4890; SI-NEXT: v_mov_b32_e32 v9, v3 4891; SI-NEXT: v_mov_b32_e32 v8, v2 4892; SI-NEXT: v_mov_b32_e32 v7, v1 4893; SI-NEXT: v_mov_b32_e32 v6, v0 4894; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc 4895; SI-NEXT: s_waitcnt vmcnt(0) 4896; SI-NEXT: buffer_wbinvl1 4897; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] 4898; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 4899; SI-NEXT: v_mov_b32_e32 v2, v6 4900; SI-NEXT: v_mov_b32_e32 v3, v7 4901; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] 4902; SI-NEXT: s_cbranch_execnz .LBB88_1 4903; SI-NEXT: ; %bb.2: ; %atomicrmw.end 4904; SI-NEXT: s_endpgm 4905; 4906; VI-LABEL: atomic_max_i64_addr64_offset: 4907; VI: ; %bb.0: ; %entry 4908; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 4909; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 4910; VI-NEXT: s_mov_b64 s[4:5], 0 4911; VI-NEXT: s_waitcnt lgkmcnt(0) 4912; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 4913; VI-NEXT: s_add_u32 s0, s0, s6 4914; VI-NEXT: s_addc_u32 s1, s1, s7 4915; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 4916; VI-NEXT: s_add_u32 s0, s0, 32 4917; VI-NEXT: s_addc_u32 s1, s1, 0 4918; VI-NEXT: v_mov_b32_e32 v5, s1 4919; VI-NEXT: v_mov_b32_e32 v6, s3 4920; VI-NEXT: s_waitcnt lgkmcnt(0) 4921; VI-NEXT: v_mov_b32_e32 v2, s6 4922; VI-NEXT: v_mov_b32_e32 v7, s2 4923; VI-NEXT: v_mov_b32_e32 v3, s7 4924; VI-NEXT: v_mov_b32_e32 v4, s0 4925; VI-NEXT: .LBB88_1: ; %atomicrmw.start 4926; VI-NEXT: ; =>This Inner Loop Header: Depth=1 4927; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] 4928; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc 4929; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc 4930; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc 4931; VI-NEXT: s_waitcnt vmcnt(0) 4932; VI-NEXT: buffer_wbinvl1_vol 4933; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] 4934; VI-NEXT: v_mov_b32_e32 v3, v1 4935; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 4936; VI-NEXT: v_mov_b32_e32 v2, v0 4937; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] 4938; VI-NEXT: s_cbranch_execnz .LBB88_1 4939; VI-NEXT: ; %bb.2: ; %atomicrmw.end 4940; VI-NEXT: s_endpgm 4941; 4942; GFX9-LABEL: atomic_max_i64_addr64_offset: 4943; GFX9: ; %bb.0: ; %entry 4944; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 4945; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 4946; GFX9-NEXT: v_mov_b32_e32 v6, 0 4947; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4948; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 4949; GFX9-NEXT: s_add_u32 s0, s0, s4 4950; GFX9-NEXT: s_addc_u32 s1, s1, s5 4951; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 4952; GFX9-NEXT: s_mov_b64 s[4:5], 0 4953; GFX9-NEXT: v_mov_b32_e32 v4, s3 4954; GFX9-NEXT: v_mov_b32_e32 v5, s2 4955; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4956; GFX9-NEXT: v_mov_b32_e32 v2, s6 4957; GFX9-NEXT: v_mov_b32_e32 v3, s7 4958; GFX9-NEXT: .LBB88_1: ; %atomicrmw.start 4959; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 4960; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] 4961; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc 4962; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc 4963; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:32 glc 4964; GFX9-NEXT: s_waitcnt vmcnt(0) 4965; GFX9-NEXT: buffer_wbinvl1_vol 4966; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] 4967; GFX9-NEXT: v_mov_b32_e32 v3, v1 4968; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 4969; GFX9-NEXT: v_mov_b32_e32 v2, v0 4970; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 4971; GFX9-NEXT: s_cbranch_execnz .LBB88_1 4972; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 4973; GFX9-NEXT: s_endpgm 4974entry: 4975 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index 4976 %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4 4977 %tmp0 = atomicrmw max ptr addrspace(1) %gep, i64 %in seq_cst 4978 ret void 4979} 4980 4981define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { 4982; SI-LABEL: atomic_max_i64_ret_addr64_offset: 4983; SI: ; %bb.0: ; %entry 4984; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 4985; SI-NEXT: s_waitcnt lgkmcnt(0) 4986; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 4987; SI-NEXT: s_add_u32 s8, s0, s6 4988; SI-NEXT: s_addc_u32 s9, s1, s7 4989; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 4990; SI-NEXT: s_mov_b64 s[0:1], 0 4991; SI-NEXT: s_mov_b32 s11, 0xf000 4992; SI-NEXT: v_mov_b32_e32 v8, s5 4993; SI-NEXT: v_mov_b32_e32 v9, s4 4994; SI-NEXT: s_waitcnt lgkmcnt(0) 4995; SI-NEXT: v_mov_b32_e32 v2, s6 4996; SI-NEXT: v_mov_b32_e32 v3, s7 4997; SI-NEXT: s_mov_b32 s10, -1 4998; SI-NEXT: .LBB89_1: ; %atomicrmw.start 4999; SI-NEXT: ; =>This Inner Loop Header: Depth=1 5000; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3] 5001; SI-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc 5002; SI-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc 5003; SI-NEXT: s_waitcnt expcnt(0) 5004; SI-NEXT: v_mov_b32_e32 v7, v3 5005; SI-NEXT: v_mov_b32_e32 v6, v2 5006; SI-NEXT: v_mov_b32_e32 v5, v1 5007; SI-NEXT: v_mov_b32_e32 v4, v0 5008; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[8:11], 0 offset:32 glc 5009; SI-NEXT: s_waitcnt vmcnt(0) 5010; SI-NEXT: buffer_wbinvl1 5011; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] 5012; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 5013; SI-NEXT: v_mov_b32_e32 v2, v4 5014; SI-NEXT: v_mov_b32_e32 v3, v5 5015; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] 5016; SI-NEXT: s_cbranch_execnz .LBB89_1 5017; SI-NEXT: ; %bb.2: ; %atomicrmw.end 5018; SI-NEXT: s_or_b64 exec, exec, s[0:1] 5019; SI-NEXT: s_mov_b32 s7, 0xf000 5020; SI-NEXT: s_mov_b32 s6, -1 5021; SI-NEXT: s_mov_b32 s4, s2 5022; SI-NEXT: s_mov_b32 s5, s3 5023; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0 5024; SI-NEXT: s_endpgm 5025; 5026; VI-LABEL: atomic_max_i64_ret_addr64_offset: 5027; VI: ; %bb.0: ; %entry 5028; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 5029; VI-NEXT: s_mov_b64 s[8:9], 0 5030; VI-NEXT: s_waitcnt lgkmcnt(0) 5031; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 5032; VI-NEXT: s_add_u32 s0, s0, s6 5033; VI-NEXT: s_addc_u32 s1, s1, s7 5034; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 5035; VI-NEXT: s_add_u32 s0, s0, 32 5036; VI-NEXT: s_addc_u32 s1, s1, 0 5037; VI-NEXT: v_mov_b32_e32 v0, s0 5038; VI-NEXT: v_mov_b32_e32 v4, s5 5039; VI-NEXT: s_waitcnt lgkmcnt(0) 5040; VI-NEXT: v_mov_b32_e32 v2, s6 5041; VI-NEXT: v_mov_b32_e32 v5, s4 5042; VI-NEXT: v_mov_b32_e32 v3, s7 5043; VI-NEXT: v_mov_b32_e32 v1, s1 5044; VI-NEXT: .LBB89_1: ; %atomicrmw.start 5045; VI-NEXT: ; =>This Inner Loop Header: Depth=1 5046; VI-NEXT: v_mov_b32_e32 v9, v3 5047; VI-NEXT: v_mov_b32_e32 v8, v2 5048; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9] 5049; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc 5050; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc 5051; VI-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc 5052; VI-NEXT: s_waitcnt vmcnt(0) 5053; VI-NEXT: buffer_wbinvl1_vol 5054; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] 5055; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 5056; VI-NEXT: s_andn2_b64 exec, exec, s[8:9] 5057; VI-NEXT: s_cbranch_execnz .LBB89_1 5058; VI-NEXT: ; %bb.2: ; %atomicrmw.end 5059; VI-NEXT: s_or_b64 exec, exec, s[8:9] 5060; VI-NEXT: v_mov_b32_e32 v0, s2 5061; VI-NEXT: v_mov_b32_e32 v1, s3 5062; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 5063; VI-NEXT: s_endpgm 5064; 5065; GFX9-LABEL: atomic_max_i64_ret_addr64_offset: 5066; GFX9: ; %bb.0: ; %entry 5067; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 5068; GFX9-NEXT: s_mov_b64 s[2:3], 0 5069; GFX9-NEXT: v_mov_b32_e32 v4, 0 5070; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5071; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 5072; GFX9-NEXT: s_add_u32 s0, s8, s0 5073; GFX9-NEXT: s_addc_u32 s1, s9, s1 5074; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x20 5075; GFX9-NEXT: v_mov_b32_e32 v2, s13 5076; GFX9-NEXT: v_mov_b32_e32 v3, s12 5077; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5078; GFX9-NEXT: v_mov_b32_e32 v0, s4 5079; GFX9-NEXT: v_mov_b32_e32 v1, s5 5080; GFX9-NEXT: .LBB89_1: ; %atomicrmw.start 5081; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 5082; GFX9-NEXT: v_mov_b32_e32 v8, v1 5083; GFX9-NEXT: v_mov_b32_e32 v7, v0 5084; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[7:8] 5085; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc 5086; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc 5087; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] offset:32 glc 5088; GFX9-NEXT: s_waitcnt vmcnt(0) 5089; GFX9-NEXT: buffer_wbinvl1_vol 5090; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] 5091; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] 5092; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] 5093; GFX9-NEXT: s_cbranch_execnz .LBB89_1 5094; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 5095; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 5096; GFX9-NEXT: v_mov_b32_e32 v2, 0 5097; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[10:11] 5098; GFX9-NEXT: s_endpgm 5099entry: 5100 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index 5101 %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4 5102 %tmp0 = atomicrmw max ptr addrspace(1) %gep, i64 %in seq_cst 5103 store i64 %tmp0, ptr addrspace(1) %out2 5104 ret void 5105} 5106 5107define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) { 5108; SI-LABEL: atomic_max_i64_addr64: 5109; SI: ; %bb.0: ; %entry 5110; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd 5111; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 5112; SI-NEXT: s_waitcnt lgkmcnt(0) 5113; SI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 5114; SI-NEXT: s_add_u32 s4, s0, s4 5115; SI-NEXT: s_addc_u32 s5, s1, s5 5116; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 5117; SI-NEXT: s_mov_b64 s[0:1], 0 5118; SI-NEXT: s_mov_b32 s7, 0xf000 5119; SI-NEXT: v_mov_b32_e32 v4, s3 5120; SI-NEXT: v_mov_b32_e32 v5, s2 5121; SI-NEXT: s_waitcnt lgkmcnt(0) 5122; SI-NEXT: v_mov_b32_e32 v2, s8 5123; SI-NEXT: v_mov_b32_e32 v3, s9 5124; SI-NEXT: s_mov_b32 s6, -1 5125; SI-NEXT: .LBB90_1: ; %atomicrmw.start 5126; SI-NEXT: ; =>This Inner Loop Header: Depth=1 5127; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] 5128; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc 5129; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc 5130; SI-NEXT: s_waitcnt expcnt(0) 5131; SI-NEXT: v_mov_b32_e32 v9, v3 5132; SI-NEXT: v_mov_b32_e32 v8, v2 5133; SI-NEXT: v_mov_b32_e32 v7, v1 5134; SI-NEXT: v_mov_b32_e32 v6, v0 5135; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc 5136; SI-NEXT: s_waitcnt vmcnt(0) 5137; SI-NEXT: buffer_wbinvl1 5138; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] 5139; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 5140; SI-NEXT: v_mov_b32_e32 v2, v6 5141; SI-NEXT: v_mov_b32_e32 v3, v7 5142; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] 5143; SI-NEXT: s_cbranch_execnz .LBB90_1 5144; SI-NEXT: ; %bb.2: ; %atomicrmw.end 5145; SI-NEXT: s_endpgm 5146; 5147; VI-LABEL: atomic_max_i64_addr64: 5148; VI: ; %bb.0: ; %entry 5149; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 5150; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 5151; VI-NEXT: s_waitcnt lgkmcnt(0) 5152; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 5153; VI-NEXT: s_add_u32 s4, s0, s4 5154; VI-NEXT: s_addc_u32 s5, s1, s5 5155; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 5156; VI-NEXT: v_mov_b32_e32 v4, s4 5157; VI-NEXT: s_mov_b64 s[0:1], 0 5158; VI-NEXT: v_mov_b32_e32 v6, s3 5159; VI-NEXT: v_mov_b32_e32 v7, s2 5160; VI-NEXT: s_waitcnt lgkmcnt(0) 5161; VI-NEXT: v_mov_b32_e32 v2, s6 5162; VI-NEXT: v_mov_b32_e32 v3, s7 5163; VI-NEXT: v_mov_b32_e32 v5, s5 5164; VI-NEXT: .LBB90_1: ; %atomicrmw.start 5165; VI-NEXT: ; =>This Inner Loop Header: Depth=1 5166; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] 5167; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc 5168; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc 5169; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc 5170; VI-NEXT: s_waitcnt vmcnt(0) 5171; VI-NEXT: buffer_wbinvl1_vol 5172; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] 5173; VI-NEXT: v_mov_b32_e32 v3, v1 5174; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 5175; VI-NEXT: v_mov_b32_e32 v2, v0 5176; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] 5177; VI-NEXT: s_cbranch_execnz .LBB90_1 5178; VI-NEXT: ; %bb.2: ; %atomicrmw.end 5179; VI-NEXT: s_endpgm 5180; 5181; GFX9-LABEL: atomic_max_i64_addr64: 5182; GFX9: ; %bb.0: ; %entry 5183; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 5184; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 5185; GFX9-NEXT: v_mov_b32_e32 v6, 0 5186; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5187; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 5188; GFX9-NEXT: s_add_u32 s0, s0, s4 5189; GFX9-NEXT: s_addc_u32 s1, s1, s5 5190; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 5191; GFX9-NEXT: s_mov_b64 s[4:5], 0 5192; GFX9-NEXT: v_mov_b32_e32 v4, s3 5193; GFX9-NEXT: v_mov_b32_e32 v5, s2 5194; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5195; GFX9-NEXT: v_mov_b32_e32 v2, s6 5196; GFX9-NEXT: v_mov_b32_e32 v3, s7 5197; GFX9-NEXT: .LBB90_1: ; %atomicrmw.start 5198; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 5199; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] 5200; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc 5201; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc 5202; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc 5203; GFX9-NEXT: s_waitcnt vmcnt(0) 5204; GFX9-NEXT: buffer_wbinvl1_vol 5205; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] 5206; GFX9-NEXT: v_mov_b32_e32 v3, v1 5207; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5208; GFX9-NEXT: v_mov_b32_e32 v2, v0 5209; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 5210; GFX9-NEXT: s_cbranch_execnz .LBB90_1 5211; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 5212; GFX9-NEXT: s_endpgm 5213entry: 5214 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index 5215 %tmp0 = atomicrmw max ptr addrspace(1) %ptr, i64 %in seq_cst 5216 ret void 5217} 5218 5219define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { 5220; SI-LABEL: atomic_max_i64_ret_addr64: 5221; SI: ; %bb.0: ; %entry 5222; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 5223; SI-NEXT: s_waitcnt lgkmcnt(0) 5224; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 5225; SI-NEXT: s_add_u32 s8, s0, s6 5226; SI-NEXT: s_addc_u32 s9, s1, s7 5227; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 5228; SI-NEXT: s_mov_b64 s[0:1], 0 5229; SI-NEXT: s_mov_b32 s11, 0xf000 5230; SI-NEXT: v_mov_b32_e32 v8, s5 5231; SI-NEXT: v_mov_b32_e32 v9, s4 5232; SI-NEXT: s_waitcnt lgkmcnt(0) 5233; SI-NEXT: v_mov_b32_e32 v2, s6 5234; SI-NEXT: v_mov_b32_e32 v3, s7 5235; SI-NEXT: s_mov_b32 s10, -1 5236; SI-NEXT: .LBB91_1: ; %atomicrmw.start 5237; SI-NEXT: ; =>This Inner Loop Header: Depth=1 5238; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3] 5239; SI-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc 5240; SI-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc 5241; SI-NEXT: s_waitcnt expcnt(0) 5242; SI-NEXT: v_mov_b32_e32 v7, v3 5243; SI-NEXT: v_mov_b32_e32 v6, v2 5244; SI-NEXT: v_mov_b32_e32 v5, v1 5245; SI-NEXT: v_mov_b32_e32 v4, v0 5246; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[8:11], 0 glc 5247; SI-NEXT: s_waitcnt vmcnt(0) 5248; SI-NEXT: buffer_wbinvl1 5249; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] 5250; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 5251; SI-NEXT: v_mov_b32_e32 v2, v4 5252; SI-NEXT: v_mov_b32_e32 v3, v5 5253; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] 5254; SI-NEXT: s_cbranch_execnz .LBB91_1 5255; SI-NEXT: ; %bb.2: ; %atomicrmw.end 5256; SI-NEXT: s_or_b64 exec, exec, s[0:1] 5257; SI-NEXT: s_mov_b32 s7, 0xf000 5258; SI-NEXT: s_mov_b32 s6, -1 5259; SI-NEXT: s_mov_b32 s4, s2 5260; SI-NEXT: s_mov_b32 s5, s3 5261; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0 5262; SI-NEXT: s_endpgm 5263; 5264; VI-LABEL: atomic_max_i64_ret_addr64: 5265; VI: ; %bb.0: ; %entry 5266; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 5267; VI-NEXT: s_waitcnt lgkmcnt(0) 5268; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 5269; VI-NEXT: s_add_u32 s6, s0, s6 5270; VI-NEXT: s_addc_u32 s7, s1, s7 5271; VI-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 5272; VI-NEXT: v_mov_b32_e32 v0, s6 5273; VI-NEXT: s_mov_b64 s[0:1], 0 5274; VI-NEXT: v_mov_b32_e32 v4, s5 5275; VI-NEXT: v_mov_b32_e32 v5, s4 5276; VI-NEXT: s_waitcnt lgkmcnt(0) 5277; VI-NEXT: v_mov_b32_e32 v2, s8 5278; VI-NEXT: v_mov_b32_e32 v3, s9 5279; VI-NEXT: v_mov_b32_e32 v1, s7 5280; VI-NEXT: .LBB91_1: ; %atomicrmw.start 5281; VI-NEXT: ; =>This Inner Loop Header: Depth=1 5282; VI-NEXT: v_mov_b32_e32 v9, v3 5283; VI-NEXT: v_mov_b32_e32 v8, v2 5284; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9] 5285; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc 5286; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc 5287; VI-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc 5288; VI-NEXT: s_waitcnt vmcnt(0) 5289; VI-NEXT: buffer_wbinvl1_vol 5290; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] 5291; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 5292; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] 5293; VI-NEXT: s_cbranch_execnz .LBB91_1 5294; VI-NEXT: ; %bb.2: ; %atomicrmw.end 5295; VI-NEXT: s_or_b64 exec, exec, s[0:1] 5296; VI-NEXT: v_mov_b32_e32 v0, s2 5297; VI-NEXT: v_mov_b32_e32 v1, s3 5298; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 5299; VI-NEXT: s_endpgm 5300; 5301; GFX9-LABEL: atomic_max_i64_ret_addr64: 5302; GFX9: ; %bb.0: ; %entry 5303; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 5304; GFX9-NEXT: s_mov_b64 s[2:3], 0 5305; GFX9-NEXT: v_mov_b32_e32 v4, 0 5306; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5307; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 5308; GFX9-NEXT: s_add_u32 s0, s8, s0 5309; GFX9-NEXT: s_addc_u32 s1, s9, s1 5310; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 5311; GFX9-NEXT: v_mov_b32_e32 v2, s13 5312; GFX9-NEXT: v_mov_b32_e32 v3, s12 5313; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5314; GFX9-NEXT: v_mov_b32_e32 v0, s4 5315; GFX9-NEXT: v_mov_b32_e32 v1, s5 5316; GFX9-NEXT: .LBB91_1: ; %atomicrmw.start 5317; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 5318; GFX9-NEXT: v_mov_b32_e32 v8, v1 5319; GFX9-NEXT: v_mov_b32_e32 v7, v0 5320; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[7:8] 5321; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc 5322; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc 5323; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] glc 5324; GFX9-NEXT: s_waitcnt vmcnt(0) 5325; GFX9-NEXT: buffer_wbinvl1_vol 5326; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] 5327; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] 5328; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] 5329; GFX9-NEXT: s_cbranch_execnz .LBB91_1 5330; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 5331; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 5332; GFX9-NEXT: v_mov_b32_e32 v2, 0 5333; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[10:11] 5334; GFX9-NEXT: s_endpgm 5335entry: 5336 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index 5337 %tmp0 = atomicrmw max ptr addrspace(1) %ptr, i64 %in seq_cst 5338 store i64 %tmp0, ptr addrspace(1) %out2 5339 ret void 5340} 5341 5342define void @global_atomic_max_i64_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) { 5343; SI-LABEL: global_atomic_max_i64_noret_offset__amdgpu_no_remote_memory: 5344; SI: ; %bb.0: 5345; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5346; SI-NEXT: s_mov_b32 s6, 0 5347; SI-NEXT: s_mov_b32 s7, 0xf000 5348; SI-NEXT: s_mov_b32 s4, s6 5349; SI-NEXT: s_mov_b32 s5, s6 5350; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 5351; SI-NEXT: s_mov_b64 s[8:9], 0 5352; SI-NEXT: .LBB92_1: ; %atomicrmw.start 5353; SI-NEXT: ; =>This Inner Loop Header: Depth=1 5354; SI-NEXT: s_waitcnt vmcnt(0) 5355; SI-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] 5356; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc 5357; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 5358; SI-NEXT: s_waitcnt expcnt(0) 5359; SI-NEXT: v_mov_b32_e32 v11, v7 5360; SI-NEXT: v_mov_b32_e32 v10, v6 5361; SI-NEXT: v_mov_b32_e32 v9, v5 5362; SI-NEXT: v_mov_b32_e32 v8, v4 5363; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc 5364; SI-NEXT: s_waitcnt vmcnt(0) 5365; SI-NEXT: buffer_wbinvl1 5366; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] 5367; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 5368; SI-NEXT: v_mov_b32_e32 v6, v8 5369; SI-NEXT: v_mov_b32_e32 v7, v9 5370; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] 5371; SI-NEXT: s_cbranch_execnz .LBB92_1 5372; SI-NEXT: ; %bb.2: ; %atomicrmw.end 5373; SI-NEXT: s_or_b64 exec, exec, s[8:9] 5374; SI-NEXT: s_waitcnt expcnt(0) 5375; SI-NEXT: s_setpc_b64 s[30:31] 5376; 5377; VI-LABEL: global_atomic_max_i64_noret_offset__amdgpu_no_remote_memory: 5378; VI: ; %bb.0: 5379; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5380; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 5381; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 5382; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] 5383; VI-NEXT: s_mov_b64 s[4:5], 0 5384; VI-NEXT: .LBB92_1: ; %atomicrmw.start 5385; VI-NEXT: ; =>This Inner Loop Header: Depth=1 5386; VI-NEXT: s_waitcnt vmcnt(0) 5387; VI-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] 5388; VI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc 5389; VI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 5390; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc 5391; VI-NEXT: s_waitcnt vmcnt(0) 5392; VI-NEXT: buffer_wbinvl1_vol 5393; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 5394; VI-NEXT: v_mov_b32_e32 v7, v5 5395; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5396; VI-NEXT: v_mov_b32_e32 v6, v4 5397; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] 5398; VI-NEXT: s_cbranch_execnz .LBB92_1 5399; VI-NEXT: ; %bb.2: ; %atomicrmw.end 5400; VI-NEXT: s_or_b64 exec, exec, s[4:5] 5401; VI-NEXT: s_setpc_b64 s[30:31] 5402; 5403; GFX9-LABEL: global_atomic_max_i64_noret_offset__amdgpu_no_remote_memory: 5404; GFX9: ; %bb.0: 5405; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5406; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 5407; GFX9-NEXT: s_mov_b64 s[4:5], 0 5408; GFX9-NEXT: .LBB92_1: ; %atomicrmw.start 5409; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 5410; GFX9-NEXT: s_waitcnt vmcnt(0) 5411; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] 5412; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc 5413; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 5414; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc 5415; GFX9-NEXT: s_waitcnt vmcnt(0) 5416; GFX9-NEXT: buffer_wbinvl1_vol 5417; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 5418; GFX9-NEXT: v_mov_b32_e32 v7, v5 5419; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5420; GFX9-NEXT: v_mov_b32_e32 v6, v4 5421; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 5422; GFX9-NEXT: s_cbranch_execnz .LBB92_1 5423; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 5424; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 5425; GFX9-NEXT: s_setpc_b64 s[30:31] 5426 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 5427 %tmp0 = atomicrmw max ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 5428 ret void 5429} 5430 5431define i64 @global_atomic_max_i64_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) { 5432; SI-LABEL: global_atomic_max_i64_ret_offset__amdgpu_no_remote_memory: 5433; SI: ; %bb.0: 5434; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5435; SI-NEXT: v_mov_b32_e32 v5, v3 5436; SI-NEXT: v_mov_b32_e32 v4, v2 5437; SI-NEXT: v_mov_b32_e32 v7, v1 5438; SI-NEXT: v_mov_b32_e32 v6, v0 5439; SI-NEXT: s_mov_b32 s6, 0 5440; SI-NEXT: s_mov_b32 s7, 0xf000 5441; SI-NEXT: s_mov_b32 s4, s6 5442; SI-NEXT: s_mov_b32 s5, s6 5443; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32 5444; SI-NEXT: s_mov_b64 s[8:9], 0 5445; SI-NEXT: .LBB93_1: ; %atomicrmw.start 5446; SI-NEXT: ; =>This Inner Loop Header: Depth=1 5447; SI-NEXT: s_waitcnt vmcnt(0) 5448; SI-NEXT: v_mov_b32_e32 v11, v1 5449; SI-NEXT: v_mov_b32_e32 v10, v0 5450; SI-NEXT: v_cmp_gt_i64_e32 vcc, v[10:11], v[4:5] 5451; SI-NEXT: v_cndmask_b32_e32 v9, v5, v11, vcc 5452; SI-NEXT: v_cndmask_b32_e32 v8, v4, v10, vcc 5453; SI-NEXT: s_waitcnt expcnt(0) 5454; SI-NEXT: v_mov_b32_e32 v0, v8 5455; SI-NEXT: v_mov_b32_e32 v1, v9 5456; SI-NEXT: v_mov_b32_e32 v2, v10 5457; SI-NEXT: v_mov_b32_e32 v3, v11 5458; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:32 glc 5459; SI-NEXT: s_waitcnt vmcnt(0) 5460; SI-NEXT: buffer_wbinvl1 5461; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] 5462; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 5463; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] 5464; SI-NEXT: s_cbranch_execnz .LBB93_1 5465; SI-NEXT: ; %bb.2: ; %atomicrmw.end 5466; SI-NEXT: s_or_b64 exec, exec, s[8:9] 5467; SI-NEXT: s_waitcnt expcnt(0) 5468; SI-NEXT: s_setpc_b64 s[30:31] 5469; 5470; VI-LABEL: global_atomic_max_i64_ret_offset__amdgpu_no_remote_memory: 5471; VI: ; %bb.0: 5472; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5473; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0 5474; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc 5475; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] 5476; VI-NEXT: s_mov_b64 s[4:5], 0 5477; VI-NEXT: .LBB93_1: ; %atomicrmw.start 5478; VI-NEXT: ; =>This Inner Loop Header: Depth=1 5479; VI-NEXT: s_waitcnt vmcnt(0) 5480; VI-NEXT: v_mov_b32_e32 v9, v1 5481; VI-NEXT: v_mov_b32_e32 v8, v0 5482; VI-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3] 5483; VI-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc 5484; VI-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc 5485; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc 5486; VI-NEXT: s_waitcnt vmcnt(0) 5487; VI-NEXT: buffer_wbinvl1_vol 5488; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] 5489; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5490; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] 5491; VI-NEXT: s_cbranch_execnz .LBB93_1 5492; VI-NEXT: ; %bb.2: ; %atomicrmw.end 5493; VI-NEXT: s_or_b64 exec, exec, s[4:5] 5494; VI-NEXT: s_setpc_b64 s[30:31] 5495; 5496; GFX9-LABEL: global_atomic_max_i64_ret_offset__amdgpu_no_remote_memory: 5497; GFX9: ; %bb.0: 5498; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5499; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 5500; GFX9-NEXT: s_mov_b64 s[4:5], 0 5501; GFX9-NEXT: .LBB93_1: ; %atomicrmw.start 5502; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 5503; GFX9-NEXT: s_waitcnt vmcnt(0) 5504; GFX9-NEXT: v_mov_b32_e32 v7, v5 5505; GFX9-NEXT: v_mov_b32_e32 v6, v4 5506; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] 5507; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc 5508; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 5509; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc 5510; GFX9-NEXT: s_waitcnt vmcnt(0) 5511; GFX9-NEXT: buffer_wbinvl1_vol 5512; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 5513; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5514; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 5515; GFX9-NEXT: s_cbranch_execnz .LBB93_1 5516; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 5517; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 5518; GFX9-NEXT: v_mov_b32_e32 v0, v4 5519; GFX9-NEXT: v_mov_b32_e32 v1, v5 5520; GFX9-NEXT: s_setpc_b64 s[30:31] 5521 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 5522 %result = atomicrmw max ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 5523 ret i64 %result 5524} 5525 5526; --------------------------------------------------------------------- 5527; atomicrmw umax 5528; --------------------------------------------------------------------- 5529 5530define void @global_atomic_umax_i64_noret(ptr addrspace(1) %ptr, i64 %in) { 5531; SI-LABEL: global_atomic_umax_i64_noret: 5532; SI: ; %bb.0: 5533; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5534; SI-NEXT: s_mov_b32 s6, 0 5535; SI-NEXT: s_mov_b32 s7, 0xf000 5536; SI-NEXT: s_mov_b32 s4, s6 5537; SI-NEXT: s_mov_b32 s5, s6 5538; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 5539; SI-NEXT: s_mov_b64 s[8:9], 0 5540; SI-NEXT: .LBB94_1: ; %atomicrmw.start 5541; SI-NEXT: ; =>This Inner Loop Header: Depth=1 5542; SI-NEXT: s_waitcnt vmcnt(0) 5543; SI-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] 5544; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc 5545; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 5546; SI-NEXT: s_waitcnt expcnt(0) 5547; SI-NEXT: v_mov_b32_e32 v11, v7 5548; SI-NEXT: v_mov_b32_e32 v10, v6 5549; SI-NEXT: v_mov_b32_e32 v9, v5 5550; SI-NEXT: v_mov_b32_e32 v8, v4 5551; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc 5552; SI-NEXT: s_waitcnt vmcnt(0) 5553; SI-NEXT: buffer_wbinvl1 5554; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] 5555; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 5556; SI-NEXT: v_mov_b32_e32 v6, v8 5557; SI-NEXT: v_mov_b32_e32 v7, v9 5558; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] 5559; SI-NEXT: s_cbranch_execnz .LBB94_1 5560; SI-NEXT: ; %bb.2: ; %atomicrmw.end 5561; SI-NEXT: s_or_b64 exec, exec, s[8:9] 5562; SI-NEXT: s_waitcnt expcnt(0) 5563; SI-NEXT: s_setpc_b64 s[30:31] 5564; 5565; VI-LABEL: global_atomic_umax_i64_noret: 5566; VI: ; %bb.0: 5567; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5568; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] 5569; VI-NEXT: s_mov_b64 s[4:5], 0 5570; VI-NEXT: .LBB94_1: ; %atomicrmw.start 5571; VI-NEXT: ; =>This Inner Loop Header: Depth=1 5572; VI-NEXT: s_waitcnt vmcnt(0) 5573; VI-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] 5574; VI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc 5575; VI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 5576; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc 5577; VI-NEXT: s_waitcnt vmcnt(0) 5578; VI-NEXT: buffer_wbinvl1_vol 5579; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 5580; VI-NEXT: v_mov_b32_e32 v7, v5 5581; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5582; VI-NEXT: v_mov_b32_e32 v6, v4 5583; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] 5584; VI-NEXT: s_cbranch_execnz .LBB94_1 5585; VI-NEXT: ; %bb.2: ; %atomicrmw.end 5586; VI-NEXT: s_or_b64 exec, exec, s[4:5] 5587; VI-NEXT: s_setpc_b64 s[30:31] 5588; 5589; GFX9-LABEL: global_atomic_umax_i64_noret: 5590; GFX9: ; %bb.0: 5591; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5592; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off 5593; GFX9-NEXT: s_mov_b64 s[4:5], 0 5594; GFX9-NEXT: .LBB94_1: ; %atomicrmw.start 5595; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 5596; GFX9-NEXT: s_waitcnt vmcnt(0) 5597; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] 5598; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc 5599; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 5600; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc 5601; GFX9-NEXT: s_waitcnt vmcnt(0) 5602; GFX9-NEXT: buffer_wbinvl1_vol 5603; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 5604; GFX9-NEXT: v_mov_b32_e32 v7, v5 5605; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5606; GFX9-NEXT: v_mov_b32_e32 v6, v4 5607; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 5608; GFX9-NEXT: s_cbranch_execnz .LBB94_1 5609; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 5610; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 5611; GFX9-NEXT: s_setpc_b64 s[30:31] 5612 %tmp0 = atomicrmw umax ptr addrspace(1) %ptr, i64 %in seq_cst 5613 ret void 5614} 5615 5616define void @global_atomic_umax_i64_noret_offset(ptr addrspace(1) %out, i64 %in) { 5617; SI-LABEL: global_atomic_umax_i64_noret_offset: 5618; SI: ; %bb.0: 5619; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5620; SI-NEXT: s_mov_b32 s6, 0 5621; SI-NEXT: s_mov_b32 s7, 0xf000 5622; SI-NEXT: s_mov_b32 s4, s6 5623; SI-NEXT: s_mov_b32 s5, s6 5624; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 5625; SI-NEXT: s_mov_b64 s[8:9], 0 5626; SI-NEXT: .LBB95_1: ; %atomicrmw.start 5627; SI-NEXT: ; =>This Inner Loop Header: Depth=1 5628; SI-NEXT: s_waitcnt vmcnt(0) 5629; SI-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] 5630; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc 5631; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 5632; SI-NEXT: s_waitcnt expcnt(0) 5633; SI-NEXT: v_mov_b32_e32 v11, v7 5634; SI-NEXT: v_mov_b32_e32 v10, v6 5635; SI-NEXT: v_mov_b32_e32 v9, v5 5636; SI-NEXT: v_mov_b32_e32 v8, v4 5637; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc 5638; SI-NEXT: s_waitcnt vmcnt(0) 5639; SI-NEXT: buffer_wbinvl1 5640; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] 5641; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 5642; SI-NEXT: v_mov_b32_e32 v6, v8 5643; SI-NEXT: v_mov_b32_e32 v7, v9 5644; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] 5645; SI-NEXT: s_cbranch_execnz .LBB95_1 5646; SI-NEXT: ; %bb.2: ; %atomicrmw.end 5647; SI-NEXT: s_or_b64 exec, exec, s[8:9] 5648; SI-NEXT: s_waitcnt expcnt(0) 5649; SI-NEXT: s_setpc_b64 s[30:31] 5650; 5651; VI-LABEL: global_atomic_umax_i64_noret_offset: 5652; VI: ; %bb.0: 5653; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5654; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 5655; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 5656; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] 5657; VI-NEXT: s_mov_b64 s[4:5], 0 5658; VI-NEXT: .LBB95_1: ; %atomicrmw.start 5659; VI-NEXT: ; =>This Inner Loop Header: Depth=1 5660; VI-NEXT: s_waitcnt vmcnt(0) 5661; VI-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] 5662; VI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc 5663; VI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 5664; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc 5665; VI-NEXT: s_waitcnt vmcnt(0) 5666; VI-NEXT: buffer_wbinvl1_vol 5667; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 5668; VI-NEXT: v_mov_b32_e32 v7, v5 5669; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5670; VI-NEXT: v_mov_b32_e32 v6, v4 5671; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] 5672; VI-NEXT: s_cbranch_execnz .LBB95_1 5673; VI-NEXT: ; %bb.2: ; %atomicrmw.end 5674; VI-NEXT: s_or_b64 exec, exec, s[4:5] 5675; VI-NEXT: s_setpc_b64 s[30:31] 5676; 5677; GFX9-LABEL: global_atomic_umax_i64_noret_offset: 5678; GFX9: ; %bb.0: 5679; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5680; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 5681; GFX9-NEXT: s_mov_b64 s[4:5], 0 5682; GFX9-NEXT: .LBB95_1: ; %atomicrmw.start 5683; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 5684; GFX9-NEXT: s_waitcnt vmcnt(0) 5685; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] 5686; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc 5687; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 5688; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc 5689; GFX9-NEXT: s_waitcnt vmcnt(0) 5690; GFX9-NEXT: buffer_wbinvl1_vol 5691; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 5692; GFX9-NEXT: v_mov_b32_e32 v7, v5 5693; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5694; GFX9-NEXT: v_mov_b32_e32 v6, v4 5695; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 5696; GFX9-NEXT: s_cbranch_execnz .LBB95_1 5697; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 5698; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 5699; GFX9-NEXT: s_setpc_b64 s[30:31] 5700 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 5701 %tmp0 = atomicrmw umax ptr addrspace(1) %gep, i64 %in seq_cst 5702 ret void 5703} 5704 5705define i64 @global_atomic_umax_i64_ret(ptr addrspace(1) %ptr, i64 %in) { 5706; SI-LABEL: global_atomic_umax_i64_ret: 5707; SI: ; %bb.0: 5708; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5709; SI-NEXT: v_mov_b32_e32 v5, v3 5710; SI-NEXT: v_mov_b32_e32 v4, v2 5711; SI-NEXT: v_mov_b32_e32 v7, v1 5712; SI-NEXT: v_mov_b32_e32 v6, v0 5713; SI-NEXT: s_mov_b32 s6, 0 5714; SI-NEXT: s_mov_b32 s7, 0xf000 5715; SI-NEXT: s_mov_b32 s4, s6 5716; SI-NEXT: s_mov_b32 s5, s6 5717; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 5718; SI-NEXT: s_mov_b64 s[8:9], 0 5719; SI-NEXT: .LBB96_1: ; %atomicrmw.start 5720; SI-NEXT: ; =>This Inner Loop Header: Depth=1 5721; SI-NEXT: s_waitcnt vmcnt(0) 5722; SI-NEXT: v_mov_b32_e32 v11, v1 5723; SI-NEXT: v_mov_b32_e32 v10, v0 5724; SI-NEXT: v_cmp_gt_u64_e32 vcc, v[10:11], v[4:5] 5725; SI-NEXT: v_cndmask_b32_e32 v9, v5, v11, vcc 5726; SI-NEXT: v_cndmask_b32_e32 v8, v4, v10, vcc 5727; SI-NEXT: s_waitcnt expcnt(0) 5728; SI-NEXT: v_mov_b32_e32 v0, v8 5729; SI-NEXT: v_mov_b32_e32 v1, v9 5730; SI-NEXT: v_mov_b32_e32 v2, v10 5731; SI-NEXT: v_mov_b32_e32 v3, v11 5732; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc 5733; SI-NEXT: s_waitcnt vmcnt(0) 5734; SI-NEXT: buffer_wbinvl1 5735; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] 5736; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 5737; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] 5738; SI-NEXT: s_cbranch_execnz .LBB96_1 5739; SI-NEXT: ; %bb.2: ; %atomicrmw.end 5740; SI-NEXT: s_or_b64 exec, exec, s[8:9] 5741; SI-NEXT: s_waitcnt expcnt(0) 5742; SI-NEXT: s_setpc_b64 s[30:31] 5743; 5744; VI-LABEL: global_atomic_umax_i64_ret: 5745; VI: ; %bb.0: 5746; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5747; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] 5748; VI-NEXT: s_mov_b64 s[4:5], 0 5749; VI-NEXT: .LBB96_1: ; %atomicrmw.start 5750; VI-NEXT: ; =>This Inner Loop Header: Depth=1 5751; VI-NEXT: s_waitcnt vmcnt(0) 5752; VI-NEXT: v_mov_b32_e32 v7, v5 5753; VI-NEXT: v_mov_b32_e32 v6, v4 5754; VI-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] 5755; VI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc 5756; VI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 5757; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc 5758; VI-NEXT: s_waitcnt vmcnt(0) 5759; VI-NEXT: buffer_wbinvl1_vol 5760; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 5761; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5762; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] 5763; VI-NEXT: s_cbranch_execnz .LBB96_1 5764; VI-NEXT: ; %bb.2: ; %atomicrmw.end 5765; VI-NEXT: s_or_b64 exec, exec, s[4:5] 5766; VI-NEXT: v_mov_b32_e32 v0, v4 5767; VI-NEXT: v_mov_b32_e32 v1, v5 5768; VI-NEXT: s_setpc_b64 s[30:31] 5769; 5770; GFX9-LABEL: global_atomic_umax_i64_ret: 5771; GFX9: ; %bb.0: 5772; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5773; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off 5774; GFX9-NEXT: s_mov_b64 s[4:5], 0 5775; GFX9-NEXT: .LBB96_1: ; %atomicrmw.start 5776; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 5777; GFX9-NEXT: s_waitcnt vmcnt(0) 5778; GFX9-NEXT: v_mov_b32_e32 v7, v5 5779; GFX9-NEXT: v_mov_b32_e32 v6, v4 5780; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] 5781; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc 5782; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 5783; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc 5784; GFX9-NEXT: s_waitcnt vmcnt(0) 5785; GFX9-NEXT: buffer_wbinvl1_vol 5786; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 5787; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5788; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 5789; GFX9-NEXT: s_cbranch_execnz .LBB96_1 5790; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 5791; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 5792; GFX9-NEXT: v_mov_b32_e32 v0, v4 5793; GFX9-NEXT: v_mov_b32_e32 v1, v5 5794; GFX9-NEXT: s_setpc_b64 s[30:31] 5795 %result = atomicrmw umax ptr addrspace(1) %ptr, i64 %in seq_cst 5796 ret i64 %result 5797} 5798 5799define i64 @global_atomic_umax_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { 5800; SI-LABEL: global_atomic_umax_i64_ret_offset: 5801; SI: ; %bb.0: 5802; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5803; SI-NEXT: v_mov_b32_e32 v5, v3 5804; SI-NEXT: v_mov_b32_e32 v4, v2 5805; SI-NEXT: v_mov_b32_e32 v7, v1 5806; SI-NEXT: v_mov_b32_e32 v6, v0 5807; SI-NEXT: s_mov_b32 s6, 0 5808; SI-NEXT: s_mov_b32 s7, 0xf000 5809; SI-NEXT: s_mov_b32 s4, s6 5810; SI-NEXT: s_mov_b32 s5, s6 5811; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32 5812; SI-NEXT: s_mov_b64 s[8:9], 0 5813; SI-NEXT: .LBB97_1: ; %atomicrmw.start 5814; SI-NEXT: ; =>This Inner Loop Header: Depth=1 5815; SI-NEXT: s_waitcnt vmcnt(0) 5816; SI-NEXT: v_mov_b32_e32 v11, v1 5817; SI-NEXT: v_mov_b32_e32 v10, v0 5818; SI-NEXT: v_cmp_gt_u64_e32 vcc, v[10:11], v[4:5] 5819; SI-NEXT: v_cndmask_b32_e32 v9, v5, v11, vcc 5820; SI-NEXT: v_cndmask_b32_e32 v8, v4, v10, vcc 5821; SI-NEXT: s_waitcnt expcnt(0) 5822; SI-NEXT: v_mov_b32_e32 v0, v8 5823; SI-NEXT: v_mov_b32_e32 v1, v9 5824; SI-NEXT: v_mov_b32_e32 v2, v10 5825; SI-NEXT: v_mov_b32_e32 v3, v11 5826; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:32 glc 5827; SI-NEXT: s_waitcnt vmcnt(0) 5828; SI-NEXT: buffer_wbinvl1 5829; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] 5830; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 5831; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] 5832; SI-NEXT: s_cbranch_execnz .LBB97_1 5833; SI-NEXT: ; %bb.2: ; %atomicrmw.end 5834; SI-NEXT: s_or_b64 exec, exec, s[8:9] 5835; SI-NEXT: s_waitcnt expcnt(0) 5836; SI-NEXT: s_setpc_b64 s[30:31] 5837; 5838; VI-LABEL: global_atomic_umax_i64_ret_offset: 5839; VI: ; %bb.0: 5840; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5841; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0 5842; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc 5843; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] 5844; VI-NEXT: s_mov_b64 s[4:5], 0 5845; VI-NEXT: .LBB97_1: ; %atomicrmw.start 5846; VI-NEXT: ; =>This Inner Loop Header: Depth=1 5847; VI-NEXT: s_waitcnt vmcnt(0) 5848; VI-NEXT: v_mov_b32_e32 v9, v1 5849; VI-NEXT: v_mov_b32_e32 v8, v0 5850; VI-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3] 5851; VI-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc 5852; VI-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc 5853; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc 5854; VI-NEXT: s_waitcnt vmcnt(0) 5855; VI-NEXT: buffer_wbinvl1_vol 5856; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] 5857; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5858; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] 5859; VI-NEXT: s_cbranch_execnz .LBB97_1 5860; VI-NEXT: ; %bb.2: ; %atomicrmw.end 5861; VI-NEXT: s_or_b64 exec, exec, s[4:5] 5862; VI-NEXT: s_setpc_b64 s[30:31] 5863; 5864; GFX9-LABEL: global_atomic_umax_i64_ret_offset: 5865; GFX9: ; %bb.0: 5866; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5867; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 5868; GFX9-NEXT: s_mov_b64 s[4:5], 0 5869; GFX9-NEXT: .LBB97_1: ; %atomicrmw.start 5870; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 5871; GFX9-NEXT: s_waitcnt vmcnt(0) 5872; GFX9-NEXT: v_mov_b32_e32 v7, v5 5873; GFX9-NEXT: v_mov_b32_e32 v6, v4 5874; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] 5875; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc 5876; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 5877; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc 5878; GFX9-NEXT: s_waitcnt vmcnt(0) 5879; GFX9-NEXT: buffer_wbinvl1_vol 5880; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 5881; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5882; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 5883; GFX9-NEXT: s_cbranch_execnz .LBB97_1 5884; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 5885; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 5886; GFX9-NEXT: v_mov_b32_e32 v0, v4 5887; GFX9-NEXT: v_mov_b32_e32 v1, v5 5888; GFX9-NEXT: s_setpc_b64 s[30:31] 5889 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 5890 %result = atomicrmw umax ptr addrspace(1) %gep, i64 %in seq_cst 5891 ret i64 %result 5892} 5893 5894define amdgpu_gfx void @global_atomic_umax_i64_noret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) { 5895; SI-LABEL: global_atomic_umax_i64_noret_scalar: 5896; SI: ; %bb.0: 5897; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5898; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 5899; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill 5900; SI-NEXT: s_mov_b64 exec, s[34:35] 5901; SI-NEXT: s_waitcnt expcnt(0) 5902; SI-NEXT: v_writelane_b32 v10, s6, 0 5903; SI-NEXT: v_writelane_b32 v10, s7, 1 5904; SI-NEXT: s_mov_b32 s35, s7 5905; SI-NEXT: s_mov_b32 s34, s6 5906; SI-NEXT: s_mov_b32 s7, 0xf000 5907; SI-NEXT: s_mov_b32 s6, -1 5908; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 5909; SI-NEXT: s_mov_b64 s[36:37], 0 5910; SI-NEXT: v_mov_b32_e32 v4, s35 5911; SI-NEXT: v_mov_b32_e32 v5, s34 5912; SI-NEXT: .LBB98_1: ; %atomicrmw.start 5913; SI-NEXT: ; =>This Inner Loop Header: Depth=1 5914; SI-NEXT: s_waitcnt vmcnt(0) 5915; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[2:3] 5916; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc 5917; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc 5918; SI-NEXT: s_waitcnt expcnt(0) 5919; SI-NEXT: v_mov_b32_e32 v9, v3 5920; SI-NEXT: v_mov_b32_e32 v8, v2 5921; SI-NEXT: v_mov_b32_e32 v7, v1 5922; SI-NEXT: v_mov_b32_e32 v6, v0 5923; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc 5924; SI-NEXT: s_waitcnt vmcnt(0) 5925; SI-NEXT: buffer_wbinvl1 5926; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] 5927; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] 5928; SI-NEXT: v_mov_b32_e32 v2, v6 5929; SI-NEXT: v_mov_b32_e32 v3, v7 5930; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] 5931; SI-NEXT: s_cbranch_execnz .LBB98_1 5932; SI-NEXT: ; %bb.2: ; %atomicrmw.end 5933; SI-NEXT: s_or_b64 exec, exec, s[36:37] 5934; SI-NEXT: v_readlane_b32 s7, v10, 1 5935; SI-NEXT: v_readlane_b32 s6, v10, 0 5936; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 5937; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload 5938; SI-NEXT: s_mov_b64 exec, s[34:35] 5939; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 5940; SI-NEXT: s_setpc_b64 s[30:31] 5941; 5942; VI-LABEL: global_atomic_umax_i64_noret_scalar: 5943; VI: ; %bb.0: 5944; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5945; VI-NEXT: v_mov_b32_e32 v0, s4 5946; VI-NEXT: v_mov_b32_e32 v1, s5 5947; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] 5948; VI-NEXT: v_mov_b32_e32 v4, s4 5949; VI-NEXT: s_mov_b64 s[34:35], 0 5950; VI-NEXT: v_mov_b32_e32 v6, s7 5951; VI-NEXT: v_mov_b32_e32 v7, s6 5952; VI-NEXT: v_mov_b32_e32 v5, s5 5953; VI-NEXT: .LBB98_1: ; %atomicrmw.start 5954; VI-NEXT: ; =>This Inner Loop Header: Depth=1 5955; VI-NEXT: s_waitcnt vmcnt(0) 5956; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] 5957; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc 5958; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc 5959; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc 5960; VI-NEXT: s_waitcnt vmcnt(0) 5961; VI-NEXT: buffer_wbinvl1_vol 5962; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] 5963; VI-NEXT: v_mov_b32_e32 v3, v1 5964; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 5965; VI-NEXT: v_mov_b32_e32 v2, v0 5966; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] 5967; VI-NEXT: s_cbranch_execnz .LBB98_1 5968; VI-NEXT: ; %bb.2: ; %atomicrmw.end 5969; VI-NEXT: s_or_b64 exec, exec, s[34:35] 5970; VI-NEXT: s_setpc_b64 s[30:31] 5971; 5972; GFX9-LABEL: global_atomic_umax_i64_noret_scalar: 5973; GFX9: ; %bb.0: 5974; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5975; GFX9-NEXT: v_mov_b32_e32 v4, 0 5976; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] 5977; GFX9-NEXT: s_mov_b64 s[34:35], 0 5978; GFX9-NEXT: v_mov_b32_e32 v5, s7 5979; GFX9-NEXT: v_mov_b32_e32 v6, s6 5980; GFX9-NEXT: .LBB98_1: ; %atomicrmw.start 5981; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 5982; GFX9-NEXT: s_waitcnt vmcnt(0) 5983; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] 5984; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc 5985; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc 5986; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc 5987; GFX9-NEXT: s_waitcnt vmcnt(0) 5988; GFX9-NEXT: buffer_wbinvl1_vol 5989; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] 5990; GFX9-NEXT: v_mov_b32_e32 v3, v1 5991; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 5992; GFX9-NEXT: v_mov_b32_e32 v2, v0 5993; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] 5994; GFX9-NEXT: s_cbranch_execnz .LBB98_1 5995; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 5996; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] 5997; GFX9-NEXT: s_setpc_b64 s[30:31] 5998 %tmp0 = atomicrmw umax ptr addrspace(1) %ptr, i64 %in seq_cst 5999 ret void 6000} 6001 6002define amdgpu_gfx void @global_atomic_umax_i64_noret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) { 6003; SI-LABEL: global_atomic_umax_i64_noret_offset_scalar: 6004; SI: ; %bb.0: 6005; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6006; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 6007; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill 6008; SI-NEXT: s_mov_b64 exec, s[34:35] 6009; SI-NEXT: s_waitcnt expcnt(0) 6010; SI-NEXT: v_writelane_b32 v10, s6, 0 6011; SI-NEXT: v_writelane_b32 v10, s7, 1 6012; SI-NEXT: s_mov_b32 s35, s7 6013; SI-NEXT: s_mov_b32 s34, s6 6014; SI-NEXT: s_mov_b32 s7, 0xf000 6015; SI-NEXT: s_mov_b32 s6, -1 6016; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32 6017; SI-NEXT: s_mov_b64 s[36:37], 0 6018; SI-NEXT: v_mov_b32_e32 v4, s35 6019; SI-NEXT: v_mov_b32_e32 v5, s34 6020; SI-NEXT: .LBB99_1: ; %atomicrmw.start 6021; SI-NEXT: ; =>This Inner Loop Header: Depth=1 6022; SI-NEXT: s_waitcnt vmcnt(0) 6023; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[2:3] 6024; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc 6025; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc 6026; SI-NEXT: s_waitcnt expcnt(0) 6027; SI-NEXT: v_mov_b32_e32 v9, v3 6028; SI-NEXT: v_mov_b32_e32 v8, v2 6029; SI-NEXT: v_mov_b32_e32 v7, v1 6030; SI-NEXT: v_mov_b32_e32 v6, v0 6031; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc 6032; SI-NEXT: s_waitcnt vmcnt(0) 6033; SI-NEXT: buffer_wbinvl1 6034; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] 6035; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] 6036; SI-NEXT: v_mov_b32_e32 v2, v6 6037; SI-NEXT: v_mov_b32_e32 v3, v7 6038; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] 6039; SI-NEXT: s_cbranch_execnz .LBB99_1 6040; SI-NEXT: ; %bb.2: ; %atomicrmw.end 6041; SI-NEXT: s_or_b64 exec, exec, s[36:37] 6042; SI-NEXT: v_readlane_b32 s7, v10, 1 6043; SI-NEXT: v_readlane_b32 s6, v10, 0 6044; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 6045; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload 6046; SI-NEXT: s_mov_b64 exec, s[34:35] 6047; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 6048; SI-NEXT: s_setpc_b64 s[30:31] 6049; 6050; VI-LABEL: global_atomic_umax_i64_noret_offset_scalar: 6051; VI: ; %bb.0: 6052; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6053; VI-NEXT: s_add_u32 s34, s4, 32 6054; VI-NEXT: s_addc_u32 s35, s5, 0 6055; VI-NEXT: v_mov_b32_e32 v4, s34 6056; VI-NEXT: v_mov_b32_e32 v5, s35 6057; VI-NEXT: flat_load_dwordx2 v[2:3], v[4:5] 6058; VI-NEXT: s_mov_b64 s[34:35], 0 6059; VI-NEXT: v_mov_b32_e32 v6, s7 6060; VI-NEXT: v_mov_b32_e32 v7, s6 6061; VI-NEXT: .LBB99_1: ; %atomicrmw.start 6062; VI-NEXT: ; =>This Inner Loop Header: Depth=1 6063; VI-NEXT: s_waitcnt vmcnt(0) 6064; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] 6065; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc 6066; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc 6067; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc 6068; VI-NEXT: s_waitcnt vmcnt(0) 6069; VI-NEXT: buffer_wbinvl1_vol 6070; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] 6071; VI-NEXT: v_mov_b32_e32 v3, v1 6072; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 6073; VI-NEXT: v_mov_b32_e32 v2, v0 6074; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] 6075; VI-NEXT: s_cbranch_execnz .LBB99_1 6076; VI-NEXT: ; %bb.2: ; %atomicrmw.end 6077; VI-NEXT: s_or_b64 exec, exec, s[34:35] 6078; VI-NEXT: s_setpc_b64 s[30:31] 6079; 6080; GFX9-LABEL: global_atomic_umax_i64_noret_offset_scalar: 6081; GFX9: ; %bb.0: 6082; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6083; GFX9-NEXT: v_mov_b32_e32 v4, 0 6084; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32 6085; GFX9-NEXT: s_mov_b64 s[34:35], 0 6086; GFX9-NEXT: v_mov_b32_e32 v5, s7 6087; GFX9-NEXT: v_mov_b32_e32 v6, s6 6088; GFX9-NEXT: .LBB99_1: ; %atomicrmw.start 6089; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 6090; GFX9-NEXT: s_waitcnt vmcnt(0) 6091; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] 6092; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc 6093; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc 6094; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc 6095; GFX9-NEXT: s_waitcnt vmcnt(0) 6096; GFX9-NEXT: buffer_wbinvl1_vol 6097; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] 6098; GFX9-NEXT: v_mov_b32_e32 v3, v1 6099; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 6100; GFX9-NEXT: v_mov_b32_e32 v2, v0 6101; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] 6102; GFX9-NEXT: s_cbranch_execnz .LBB99_1 6103; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 6104; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] 6105; GFX9-NEXT: s_setpc_b64 s[30:31] 6106 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 6107 %tmp0 = atomicrmw umax ptr addrspace(1) %gep, i64 %in seq_cst 6108 ret void 6109} 6110 6111define amdgpu_gfx i64 @global_atomic_umax_i64_ret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) { 6112; SI-LABEL: global_atomic_umax_i64_ret_scalar: 6113; SI: ; %bb.0: 6114; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6115; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 6116; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill 6117; SI-NEXT: s_mov_b64 exec, s[34:35] 6118; SI-NEXT: s_waitcnt expcnt(0) 6119; SI-NEXT: v_writelane_b32 v10, s6, 0 6120; SI-NEXT: v_writelane_b32 v10, s7, 1 6121; SI-NEXT: s_mov_b32 s35, s7 6122; SI-NEXT: s_mov_b32 s34, s6 6123; SI-NEXT: s_mov_b32 s7, 0xf000 6124; SI-NEXT: s_mov_b32 s6, -1 6125; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 6126; SI-NEXT: s_mov_b64 s[36:37], 0 6127; SI-NEXT: v_mov_b32_e32 v4, s35 6128; SI-NEXT: v_mov_b32_e32 v5, s34 6129; SI-NEXT: .LBB100_1: ; %atomicrmw.start 6130; SI-NEXT: ; =>This Inner Loop Header: Depth=1 6131; SI-NEXT: s_waitcnt vmcnt(0) 6132; SI-NEXT: v_mov_b32_e32 v9, v1 6133; SI-NEXT: v_mov_b32_e32 v8, v0 6134; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[8:9] 6135; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc 6136; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc 6137; SI-NEXT: s_waitcnt expcnt(0) 6138; SI-NEXT: v_mov_b32_e32 v0, v6 6139; SI-NEXT: v_mov_b32_e32 v1, v7 6140; SI-NEXT: v_mov_b32_e32 v2, v8 6141; SI-NEXT: v_mov_b32_e32 v3, v9 6142; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc 6143; SI-NEXT: s_waitcnt vmcnt(0) 6144; SI-NEXT: buffer_wbinvl1 6145; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] 6146; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] 6147; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] 6148; SI-NEXT: s_cbranch_execnz .LBB100_1 6149; SI-NEXT: ; %bb.2: ; %atomicrmw.end 6150; SI-NEXT: s_or_b64 exec, exec, s[36:37] 6151; SI-NEXT: v_readlane_b32 s7, v10, 1 6152; SI-NEXT: v_readlane_b32 s6, v10, 0 6153; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 6154; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload 6155; SI-NEXT: s_mov_b64 exec, s[34:35] 6156; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 6157; SI-NEXT: s_setpc_b64 s[30:31] 6158; 6159; VI-LABEL: global_atomic_umax_i64_ret_scalar: 6160; VI: ; %bb.0: 6161; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6162; VI-NEXT: v_mov_b32_e32 v0, s4 6163; VI-NEXT: v_mov_b32_e32 v1, s5 6164; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 6165; VI-NEXT: v_mov_b32_e32 v2, s4 6166; VI-NEXT: s_mov_b64 s[34:35], 0 6167; VI-NEXT: v_mov_b32_e32 v4, s7 6168; VI-NEXT: v_mov_b32_e32 v5, s6 6169; VI-NEXT: v_mov_b32_e32 v3, s5 6170; VI-NEXT: .LBB100_1: ; %atomicrmw.start 6171; VI-NEXT: ; =>This Inner Loop Header: Depth=1 6172; VI-NEXT: s_waitcnt vmcnt(0) 6173; VI-NEXT: v_mov_b32_e32 v9, v1 6174; VI-NEXT: v_mov_b32_e32 v8, v0 6175; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] 6176; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc 6177; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc 6178; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc 6179; VI-NEXT: s_waitcnt vmcnt(0) 6180; VI-NEXT: buffer_wbinvl1_vol 6181; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] 6182; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 6183; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] 6184; VI-NEXT: s_cbranch_execnz .LBB100_1 6185; VI-NEXT: ; %bb.2: ; %atomicrmw.end 6186; VI-NEXT: s_or_b64 exec, exec, s[34:35] 6187; VI-NEXT: s_setpc_b64 s[30:31] 6188; 6189; GFX9-LABEL: global_atomic_umax_i64_ret_scalar: 6190; GFX9: ; %bb.0: 6191; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6192; GFX9-NEXT: v_mov_b32_e32 v2, 0 6193; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] 6194; GFX9-NEXT: s_mov_b64 s[34:35], 0 6195; GFX9-NEXT: v_mov_b32_e32 v3, s7 6196; GFX9-NEXT: v_mov_b32_e32 v4, s6 6197; GFX9-NEXT: .LBB100_1: ; %atomicrmw.start 6198; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 6199; GFX9-NEXT: s_waitcnt vmcnt(0) 6200; GFX9-NEXT: v_mov_b32_e32 v8, v1 6201; GFX9-NEXT: v_mov_b32_e32 v7, v0 6202; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[7:8] 6203; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v8, vcc 6204; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v7, vcc 6205; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[5:8], s[4:5] glc 6206; GFX9-NEXT: s_waitcnt vmcnt(0) 6207; GFX9-NEXT: buffer_wbinvl1_vol 6208; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] 6209; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 6210; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] 6211; GFX9-NEXT: s_cbranch_execnz .LBB100_1 6212; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 6213; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] 6214; GFX9-NEXT: s_setpc_b64 s[30:31] 6215 %result = atomicrmw umax ptr addrspace(1) %ptr, i64 %in seq_cst 6216 ret i64 %result 6217} 6218 6219define amdgpu_gfx i64 @global_atomic_umax_i64_ret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) { 6220; SI-LABEL: global_atomic_umax_i64_ret_offset_scalar: 6221; SI: ; %bb.0: 6222; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6223; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 6224; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill 6225; SI-NEXT: s_mov_b64 exec, s[34:35] 6226; SI-NEXT: s_waitcnt expcnt(0) 6227; SI-NEXT: v_writelane_b32 v10, s6, 0 6228; SI-NEXT: v_writelane_b32 v10, s7, 1 6229; SI-NEXT: s_mov_b32 s35, s7 6230; SI-NEXT: s_mov_b32 s34, s6 6231; SI-NEXT: s_mov_b32 s7, 0xf000 6232; SI-NEXT: s_mov_b32 s6, -1 6233; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 6234; SI-NEXT: s_mov_b64 s[36:37], 0 6235; SI-NEXT: v_mov_b32_e32 v4, s35 6236; SI-NEXT: v_mov_b32_e32 v5, s34 6237; SI-NEXT: .LBB101_1: ; %atomicrmw.start 6238; SI-NEXT: ; =>This Inner Loop Header: Depth=1 6239; SI-NEXT: s_waitcnt vmcnt(0) 6240; SI-NEXT: v_mov_b32_e32 v9, v1 6241; SI-NEXT: v_mov_b32_e32 v8, v0 6242; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[8:9] 6243; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc 6244; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc 6245; SI-NEXT: s_waitcnt expcnt(0) 6246; SI-NEXT: v_mov_b32_e32 v0, v6 6247; SI-NEXT: v_mov_b32_e32 v1, v7 6248; SI-NEXT: v_mov_b32_e32 v2, v8 6249; SI-NEXT: v_mov_b32_e32 v3, v9 6250; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc 6251; SI-NEXT: s_waitcnt vmcnt(0) 6252; SI-NEXT: buffer_wbinvl1 6253; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] 6254; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] 6255; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] 6256; SI-NEXT: s_cbranch_execnz .LBB101_1 6257; SI-NEXT: ; %bb.2: ; %atomicrmw.end 6258; SI-NEXT: s_or_b64 exec, exec, s[36:37] 6259; SI-NEXT: v_readlane_b32 s7, v10, 1 6260; SI-NEXT: v_readlane_b32 s6, v10, 0 6261; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 6262; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload 6263; SI-NEXT: s_mov_b64 exec, s[34:35] 6264; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 6265; SI-NEXT: s_setpc_b64 s[30:31] 6266; 6267; VI-LABEL: global_atomic_umax_i64_ret_offset_scalar: 6268; VI: ; %bb.0: 6269; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6270; VI-NEXT: s_add_u32 s34, s4, 32 6271; VI-NEXT: s_addc_u32 s35, s5, 0 6272; VI-NEXT: v_mov_b32_e32 v2, s34 6273; VI-NEXT: v_mov_b32_e32 v3, s35 6274; VI-NEXT: flat_load_dwordx2 v[0:1], v[2:3] 6275; VI-NEXT: s_mov_b64 s[34:35], 0 6276; VI-NEXT: v_mov_b32_e32 v4, s7 6277; VI-NEXT: v_mov_b32_e32 v5, s6 6278; VI-NEXT: .LBB101_1: ; %atomicrmw.start 6279; VI-NEXT: ; =>This Inner Loop Header: Depth=1 6280; VI-NEXT: s_waitcnt vmcnt(0) 6281; VI-NEXT: v_mov_b32_e32 v9, v1 6282; VI-NEXT: v_mov_b32_e32 v8, v0 6283; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] 6284; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc 6285; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc 6286; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc 6287; VI-NEXT: s_waitcnt vmcnt(0) 6288; VI-NEXT: buffer_wbinvl1_vol 6289; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] 6290; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 6291; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] 6292; VI-NEXT: s_cbranch_execnz .LBB101_1 6293; VI-NEXT: ; %bb.2: ; %atomicrmw.end 6294; VI-NEXT: s_or_b64 exec, exec, s[34:35] 6295; VI-NEXT: s_setpc_b64 s[30:31] 6296; 6297; GFX9-LABEL: global_atomic_umax_i64_ret_offset_scalar: 6298; GFX9: ; %bb.0: 6299; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6300; GFX9-NEXT: v_mov_b32_e32 v2, 0 6301; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32 6302; GFX9-NEXT: s_mov_b64 s[34:35], 0 6303; GFX9-NEXT: v_mov_b32_e32 v3, s7 6304; GFX9-NEXT: v_mov_b32_e32 v4, s6 6305; GFX9-NEXT: .LBB101_1: ; %atomicrmw.start 6306; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 6307; GFX9-NEXT: s_waitcnt vmcnt(0) 6308; GFX9-NEXT: v_mov_b32_e32 v8, v1 6309; GFX9-NEXT: v_mov_b32_e32 v7, v0 6310; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[7:8] 6311; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v8, vcc 6312; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v7, vcc 6313; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[5:8], s[4:5] offset:32 glc 6314; GFX9-NEXT: s_waitcnt vmcnt(0) 6315; GFX9-NEXT: buffer_wbinvl1_vol 6316; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] 6317; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 6318; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] 6319; GFX9-NEXT: s_cbranch_execnz .LBB101_1 6320; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 6321; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] 6322; GFX9-NEXT: s_setpc_b64 s[30:31] 6323 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 6324 %result = atomicrmw umax ptr addrspace(1) %gep, i64 %in seq_cst 6325 ret i64 %result 6326} 6327 6328define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { 6329; SI-LABEL: atomic_umax_i64_addr64_offset: 6330; SI: ; %bb.0: ; %entry 6331; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd 6332; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 6333; SI-NEXT: s_waitcnt lgkmcnt(0) 6334; SI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 6335; SI-NEXT: s_add_u32 s4, s0, s4 6336; SI-NEXT: s_addc_u32 s5, s1, s5 6337; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 6338; SI-NEXT: s_mov_b64 s[0:1], 0 6339; SI-NEXT: s_mov_b32 s7, 0xf000 6340; SI-NEXT: v_mov_b32_e32 v4, s3 6341; SI-NEXT: v_mov_b32_e32 v5, s2 6342; SI-NEXT: s_waitcnt lgkmcnt(0) 6343; SI-NEXT: v_mov_b32_e32 v2, s8 6344; SI-NEXT: v_mov_b32_e32 v3, s9 6345; SI-NEXT: s_mov_b32 s6, -1 6346; SI-NEXT: .LBB102_1: ; %atomicrmw.start 6347; SI-NEXT: ; =>This Inner Loop Header: Depth=1 6348; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] 6349; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc 6350; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc 6351; SI-NEXT: s_waitcnt expcnt(0) 6352; SI-NEXT: v_mov_b32_e32 v9, v3 6353; SI-NEXT: v_mov_b32_e32 v8, v2 6354; SI-NEXT: v_mov_b32_e32 v7, v1 6355; SI-NEXT: v_mov_b32_e32 v6, v0 6356; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc 6357; SI-NEXT: s_waitcnt vmcnt(0) 6358; SI-NEXT: buffer_wbinvl1 6359; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] 6360; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 6361; SI-NEXT: v_mov_b32_e32 v2, v6 6362; SI-NEXT: v_mov_b32_e32 v3, v7 6363; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] 6364; SI-NEXT: s_cbranch_execnz .LBB102_1 6365; SI-NEXT: ; %bb.2: ; %atomicrmw.end 6366; SI-NEXT: s_endpgm 6367; 6368; VI-LABEL: atomic_umax_i64_addr64_offset: 6369; VI: ; %bb.0: ; %entry 6370; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 6371; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 6372; VI-NEXT: s_mov_b64 s[4:5], 0 6373; VI-NEXT: s_waitcnt lgkmcnt(0) 6374; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 6375; VI-NEXT: s_add_u32 s0, s0, s6 6376; VI-NEXT: s_addc_u32 s1, s1, s7 6377; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 6378; VI-NEXT: s_add_u32 s0, s0, 32 6379; VI-NEXT: s_addc_u32 s1, s1, 0 6380; VI-NEXT: v_mov_b32_e32 v5, s1 6381; VI-NEXT: v_mov_b32_e32 v6, s3 6382; VI-NEXT: s_waitcnt lgkmcnt(0) 6383; VI-NEXT: v_mov_b32_e32 v2, s6 6384; VI-NEXT: v_mov_b32_e32 v7, s2 6385; VI-NEXT: v_mov_b32_e32 v3, s7 6386; VI-NEXT: v_mov_b32_e32 v4, s0 6387; VI-NEXT: .LBB102_1: ; %atomicrmw.start 6388; VI-NEXT: ; =>This Inner Loop Header: Depth=1 6389; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] 6390; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc 6391; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc 6392; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc 6393; VI-NEXT: s_waitcnt vmcnt(0) 6394; VI-NEXT: buffer_wbinvl1_vol 6395; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] 6396; VI-NEXT: v_mov_b32_e32 v3, v1 6397; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6398; VI-NEXT: v_mov_b32_e32 v2, v0 6399; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] 6400; VI-NEXT: s_cbranch_execnz .LBB102_1 6401; VI-NEXT: ; %bb.2: ; %atomicrmw.end 6402; VI-NEXT: s_endpgm 6403; 6404; GFX9-LABEL: atomic_umax_i64_addr64_offset: 6405; GFX9: ; %bb.0: ; %entry 6406; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 6407; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 6408; GFX9-NEXT: v_mov_b32_e32 v6, 0 6409; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6410; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 6411; GFX9-NEXT: s_add_u32 s0, s0, s4 6412; GFX9-NEXT: s_addc_u32 s1, s1, s5 6413; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 6414; GFX9-NEXT: s_mov_b64 s[4:5], 0 6415; GFX9-NEXT: v_mov_b32_e32 v4, s3 6416; GFX9-NEXT: v_mov_b32_e32 v5, s2 6417; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6418; GFX9-NEXT: v_mov_b32_e32 v2, s6 6419; GFX9-NEXT: v_mov_b32_e32 v3, s7 6420; GFX9-NEXT: .LBB102_1: ; %atomicrmw.start 6421; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 6422; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] 6423; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc 6424; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc 6425; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:32 glc 6426; GFX9-NEXT: s_waitcnt vmcnt(0) 6427; GFX9-NEXT: buffer_wbinvl1_vol 6428; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] 6429; GFX9-NEXT: v_mov_b32_e32 v3, v1 6430; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6431; GFX9-NEXT: v_mov_b32_e32 v2, v0 6432; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 6433; GFX9-NEXT: s_cbranch_execnz .LBB102_1 6434; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 6435; GFX9-NEXT: s_endpgm 6436entry: 6437 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index 6438 %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4 6439 %tmp0 = atomicrmw umax ptr addrspace(1) %gep, i64 %in seq_cst 6440 ret void 6441} 6442 6443define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { 6444; SI-LABEL: atomic_umax_i64_ret_addr64_offset: 6445; SI: ; %bb.0: ; %entry 6446; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 6447; SI-NEXT: s_waitcnt lgkmcnt(0) 6448; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 6449; SI-NEXT: s_add_u32 s8, s0, s6 6450; SI-NEXT: s_addc_u32 s9, s1, s7 6451; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 6452; SI-NEXT: s_mov_b64 s[0:1], 0 6453; SI-NEXT: s_mov_b32 s11, 0xf000 6454; SI-NEXT: v_mov_b32_e32 v8, s5 6455; SI-NEXT: v_mov_b32_e32 v9, s4 6456; SI-NEXT: s_waitcnt lgkmcnt(0) 6457; SI-NEXT: v_mov_b32_e32 v2, s6 6458; SI-NEXT: v_mov_b32_e32 v3, s7 6459; SI-NEXT: s_mov_b32 s10, -1 6460; SI-NEXT: .LBB103_1: ; %atomicrmw.start 6461; SI-NEXT: ; =>This Inner Loop Header: Depth=1 6462; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] 6463; SI-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc 6464; SI-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc 6465; SI-NEXT: s_waitcnt expcnt(0) 6466; SI-NEXT: v_mov_b32_e32 v7, v3 6467; SI-NEXT: v_mov_b32_e32 v6, v2 6468; SI-NEXT: v_mov_b32_e32 v5, v1 6469; SI-NEXT: v_mov_b32_e32 v4, v0 6470; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[8:11], 0 offset:32 glc 6471; SI-NEXT: s_waitcnt vmcnt(0) 6472; SI-NEXT: buffer_wbinvl1 6473; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] 6474; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 6475; SI-NEXT: v_mov_b32_e32 v2, v4 6476; SI-NEXT: v_mov_b32_e32 v3, v5 6477; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] 6478; SI-NEXT: s_cbranch_execnz .LBB103_1 6479; SI-NEXT: ; %bb.2: ; %atomicrmw.end 6480; SI-NEXT: s_or_b64 exec, exec, s[0:1] 6481; SI-NEXT: s_mov_b32 s7, 0xf000 6482; SI-NEXT: s_mov_b32 s6, -1 6483; SI-NEXT: s_mov_b32 s4, s2 6484; SI-NEXT: s_mov_b32 s5, s3 6485; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0 6486; SI-NEXT: s_endpgm 6487; 6488; VI-LABEL: atomic_umax_i64_ret_addr64_offset: 6489; VI: ; %bb.0: ; %entry 6490; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 6491; VI-NEXT: s_mov_b64 s[8:9], 0 6492; VI-NEXT: s_waitcnt lgkmcnt(0) 6493; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 6494; VI-NEXT: s_add_u32 s0, s0, s6 6495; VI-NEXT: s_addc_u32 s1, s1, s7 6496; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 6497; VI-NEXT: s_add_u32 s0, s0, 32 6498; VI-NEXT: s_addc_u32 s1, s1, 0 6499; VI-NEXT: v_mov_b32_e32 v0, s0 6500; VI-NEXT: v_mov_b32_e32 v4, s5 6501; VI-NEXT: s_waitcnt lgkmcnt(0) 6502; VI-NEXT: v_mov_b32_e32 v2, s6 6503; VI-NEXT: v_mov_b32_e32 v5, s4 6504; VI-NEXT: v_mov_b32_e32 v3, s7 6505; VI-NEXT: v_mov_b32_e32 v1, s1 6506; VI-NEXT: .LBB103_1: ; %atomicrmw.start 6507; VI-NEXT: ; =>This Inner Loop Header: Depth=1 6508; VI-NEXT: v_mov_b32_e32 v9, v3 6509; VI-NEXT: v_mov_b32_e32 v8, v2 6510; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9] 6511; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc 6512; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc 6513; VI-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc 6514; VI-NEXT: s_waitcnt vmcnt(0) 6515; VI-NEXT: buffer_wbinvl1_vol 6516; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] 6517; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 6518; VI-NEXT: s_andn2_b64 exec, exec, s[8:9] 6519; VI-NEXT: s_cbranch_execnz .LBB103_1 6520; VI-NEXT: ; %bb.2: ; %atomicrmw.end 6521; VI-NEXT: s_or_b64 exec, exec, s[8:9] 6522; VI-NEXT: v_mov_b32_e32 v0, s2 6523; VI-NEXT: v_mov_b32_e32 v1, s3 6524; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 6525; VI-NEXT: s_endpgm 6526; 6527; GFX9-LABEL: atomic_umax_i64_ret_addr64_offset: 6528; GFX9: ; %bb.0: ; %entry 6529; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 6530; GFX9-NEXT: s_mov_b64 s[2:3], 0 6531; GFX9-NEXT: v_mov_b32_e32 v4, 0 6532; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6533; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 6534; GFX9-NEXT: s_add_u32 s0, s8, s0 6535; GFX9-NEXT: s_addc_u32 s1, s9, s1 6536; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x20 6537; GFX9-NEXT: v_mov_b32_e32 v2, s13 6538; GFX9-NEXT: v_mov_b32_e32 v3, s12 6539; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6540; GFX9-NEXT: v_mov_b32_e32 v0, s4 6541; GFX9-NEXT: v_mov_b32_e32 v1, s5 6542; GFX9-NEXT: .LBB103_1: ; %atomicrmw.start 6543; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 6544; GFX9-NEXT: v_mov_b32_e32 v8, v1 6545; GFX9-NEXT: v_mov_b32_e32 v7, v0 6546; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[7:8] 6547; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc 6548; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc 6549; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] offset:32 glc 6550; GFX9-NEXT: s_waitcnt vmcnt(0) 6551; GFX9-NEXT: buffer_wbinvl1_vol 6552; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] 6553; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] 6554; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] 6555; GFX9-NEXT: s_cbranch_execnz .LBB103_1 6556; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 6557; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 6558; GFX9-NEXT: v_mov_b32_e32 v2, 0 6559; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[10:11] 6560; GFX9-NEXT: s_endpgm 6561entry: 6562 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index 6563 %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4 6564 %tmp0 = atomicrmw umax ptr addrspace(1) %gep, i64 %in seq_cst 6565 store i64 %tmp0, ptr addrspace(1) %out2 6566 ret void 6567} 6568 6569define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { 6570; SI-LABEL: atomic_umax_i64_ret_addr64: 6571; SI: ; %bb.0: ; %entry 6572; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 6573; SI-NEXT: s_waitcnt lgkmcnt(0) 6574; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 6575; SI-NEXT: s_add_u32 s8, s0, s6 6576; SI-NEXT: s_addc_u32 s9, s1, s7 6577; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 6578; SI-NEXT: s_mov_b64 s[0:1], 0 6579; SI-NEXT: s_mov_b32 s11, 0xf000 6580; SI-NEXT: v_mov_b32_e32 v8, s5 6581; SI-NEXT: v_mov_b32_e32 v9, s4 6582; SI-NEXT: s_waitcnt lgkmcnt(0) 6583; SI-NEXT: v_mov_b32_e32 v2, s6 6584; SI-NEXT: v_mov_b32_e32 v3, s7 6585; SI-NEXT: s_mov_b32 s10, -1 6586; SI-NEXT: .LBB104_1: ; %atomicrmw.start 6587; SI-NEXT: ; =>This Inner Loop Header: Depth=1 6588; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] 6589; SI-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc 6590; SI-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc 6591; SI-NEXT: s_waitcnt expcnt(0) 6592; SI-NEXT: v_mov_b32_e32 v7, v3 6593; SI-NEXT: v_mov_b32_e32 v6, v2 6594; SI-NEXT: v_mov_b32_e32 v5, v1 6595; SI-NEXT: v_mov_b32_e32 v4, v0 6596; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[8:11], 0 glc 6597; SI-NEXT: s_waitcnt vmcnt(0) 6598; SI-NEXT: buffer_wbinvl1 6599; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] 6600; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 6601; SI-NEXT: v_mov_b32_e32 v2, v4 6602; SI-NEXT: v_mov_b32_e32 v3, v5 6603; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] 6604; SI-NEXT: s_cbranch_execnz .LBB104_1 6605; SI-NEXT: ; %bb.2: ; %atomicrmw.end 6606; SI-NEXT: s_or_b64 exec, exec, s[0:1] 6607; SI-NEXT: s_mov_b32 s7, 0xf000 6608; SI-NEXT: s_mov_b32 s6, -1 6609; SI-NEXT: s_mov_b32 s4, s2 6610; SI-NEXT: s_mov_b32 s5, s3 6611; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0 6612; SI-NEXT: s_endpgm 6613; 6614; VI-LABEL: atomic_umax_i64_ret_addr64: 6615; VI: ; %bb.0: ; %entry 6616; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 6617; VI-NEXT: s_waitcnt lgkmcnt(0) 6618; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 6619; VI-NEXT: s_add_u32 s6, s0, s6 6620; VI-NEXT: s_addc_u32 s7, s1, s7 6621; VI-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 6622; VI-NEXT: v_mov_b32_e32 v0, s6 6623; VI-NEXT: s_mov_b64 s[0:1], 0 6624; VI-NEXT: v_mov_b32_e32 v4, s5 6625; VI-NEXT: v_mov_b32_e32 v5, s4 6626; VI-NEXT: s_waitcnt lgkmcnt(0) 6627; VI-NEXT: v_mov_b32_e32 v2, s8 6628; VI-NEXT: v_mov_b32_e32 v3, s9 6629; VI-NEXT: v_mov_b32_e32 v1, s7 6630; VI-NEXT: .LBB104_1: ; %atomicrmw.start 6631; VI-NEXT: ; =>This Inner Loop Header: Depth=1 6632; VI-NEXT: v_mov_b32_e32 v9, v3 6633; VI-NEXT: v_mov_b32_e32 v8, v2 6634; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9] 6635; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc 6636; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc 6637; VI-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc 6638; VI-NEXT: s_waitcnt vmcnt(0) 6639; VI-NEXT: buffer_wbinvl1_vol 6640; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] 6641; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 6642; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] 6643; VI-NEXT: s_cbranch_execnz .LBB104_1 6644; VI-NEXT: ; %bb.2: ; %atomicrmw.end 6645; VI-NEXT: s_or_b64 exec, exec, s[0:1] 6646; VI-NEXT: v_mov_b32_e32 v0, s2 6647; VI-NEXT: v_mov_b32_e32 v1, s3 6648; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 6649; VI-NEXT: s_endpgm 6650; 6651; GFX9-LABEL: atomic_umax_i64_ret_addr64: 6652; GFX9: ; %bb.0: ; %entry 6653; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 6654; GFX9-NEXT: s_mov_b64 s[2:3], 0 6655; GFX9-NEXT: v_mov_b32_e32 v4, 0 6656; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6657; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 6658; GFX9-NEXT: s_add_u32 s0, s8, s0 6659; GFX9-NEXT: s_addc_u32 s1, s9, s1 6660; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 6661; GFX9-NEXT: v_mov_b32_e32 v2, s13 6662; GFX9-NEXT: v_mov_b32_e32 v3, s12 6663; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6664; GFX9-NEXT: v_mov_b32_e32 v0, s4 6665; GFX9-NEXT: v_mov_b32_e32 v1, s5 6666; GFX9-NEXT: .LBB104_1: ; %atomicrmw.start 6667; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 6668; GFX9-NEXT: v_mov_b32_e32 v8, v1 6669; GFX9-NEXT: v_mov_b32_e32 v7, v0 6670; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[7:8] 6671; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc 6672; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc 6673; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] glc 6674; GFX9-NEXT: s_waitcnt vmcnt(0) 6675; GFX9-NEXT: buffer_wbinvl1_vol 6676; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] 6677; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] 6678; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] 6679; GFX9-NEXT: s_cbranch_execnz .LBB104_1 6680; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 6681; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 6682; GFX9-NEXT: v_mov_b32_e32 v2, 0 6683; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[10:11] 6684; GFX9-NEXT: s_endpgm 6685entry: 6686 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index 6687 %tmp0 = atomicrmw umax ptr addrspace(1) %ptr, i64 %in seq_cst 6688 store i64 %tmp0, ptr addrspace(1) %out2 6689 ret void 6690} 6691 6692define void @global_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) { 6693; SI-LABEL: global_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory: 6694; SI: ; %bb.0: 6695; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6696; SI-NEXT: s_mov_b32 s6, 0 6697; SI-NEXT: s_mov_b32 s7, 0xf000 6698; SI-NEXT: s_mov_b32 s4, s6 6699; SI-NEXT: s_mov_b32 s5, s6 6700; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 6701; SI-NEXT: s_mov_b64 s[8:9], 0 6702; SI-NEXT: .LBB105_1: ; %atomicrmw.start 6703; SI-NEXT: ; =>This Inner Loop Header: Depth=1 6704; SI-NEXT: s_waitcnt vmcnt(0) 6705; SI-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] 6706; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc 6707; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 6708; SI-NEXT: s_waitcnt expcnt(0) 6709; SI-NEXT: v_mov_b32_e32 v11, v7 6710; SI-NEXT: v_mov_b32_e32 v10, v6 6711; SI-NEXT: v_mov_b32_e32 v9, v5 6712; SI-NEXT: v_mov_b32_e32 v8, v4 6713; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc 6714; SI-NEXT: s_waitcnt vmcnt(0) 6715; SI-NEXT: buffer_wbinvl1 6716; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] 6717; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 6718; SI-NEXT: v_mov_b32_e32 v6, v8 6719; SI-NEXT: v_mov_b32_e32 v7, v9 6720; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] 6721; SI-NEXT: s_cbranch_execnz .LBB105_1 6722; SI-NEXT: ; %bb.2: ; %atomicrmw.end 6723; SI-NEXT: s_or_b64 exec, exec, s[8:9] 6724; SI-NEXT: s_waitcnt expcnt(0) 6725; SI-NEXT: s_setpc_b64 s[30:31] 6726; 6727; VI-LABEL: global_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory: 6728; VI: ; %bb.0: 6729; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6730; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 6731; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 6732; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] 6733; VI-NEXT: s_mov_b64 s[4:5], 0 6734; VI-NEXT: .LBB105_1: ; %atomicrmw.start 6735; VI-NEXT: ; =>This Inner Loop Header: Depth=1 6736; VI-NEXT: s_waitcnt vmcnt(0) 6737; VI-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] 6738; VI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc 6739; VI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 6740; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc 6741; VI-NEXT: s_waitcnt vmcnt(0) 6742; VI-NEXT: buffer_wbinvl1_vol 6743; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 6744; VI-NEXT: v_mov_b32_e32 v7, v5 6745; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6746; VI-NEXT: v_mov_b32_e32 v6, v4 6747; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] 6748; VI-NEXT: s_cbranch_execnz .LBB105_1 6749; VI-NEXT: ; %bb.2: ; %atomicrmw.end 6750; VI-NEXT: s_or_b64 exec, exec, s[4:5] 6751; VI-NEXT: s_setpc_b64 s[30:31] 6752; 6753; GFX9-LABEL: global_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory: 6754; GFX9: ; %bb.0: 6755; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6756; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 6757; GFX9-NEXT: s_mov_b64 s[4:5], 0 6758; GFX9-NEXT: .LBB105_1: ; %atomicrmw.start 6759; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 6760; GFX9-NEXT: s_waitcnt vmcnt(0) 6761; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] 6762; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc 6763; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 6764; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc 6765; GFX9-NEXT: s_waitcnt vmcnt(0) 6766; GFX9-NEXT: buffer_wbinvl1_vol 6767; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 6768; GFX9-NEXT: v_mov_b32_e32 v7, v5 6769; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6770; GFX9-NEXT: v_mov_b32_e32 v6, v4 6771; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 6772; GFX9-NEXT: s_cbranch_execnz .LBB105_1 6773; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 6774; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 6775; GFX9-NEXT: s_setpc_b64 s[30:31] 6776 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 6777 %tmp0 = atomicrmw umax ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 6778 ret void 6779} 6780 6781define i64 @global_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) { 6782; SI-LABEL: global_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory: 6783; SI: ; %bb.0: 6784; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6785; SI-NEXT: v_mov_b32_e32 v5, v3 6786; SI-NEXT: v_mov_b32_e32 v4, v2 6787; SI-NEXT: v_mov_b32_e32 v7, v1 6788; SI-NEXT: v_mov_b32_e32 v6, v0 6789; SI-NEXT: s_mov_b32 s6, 0 6790; SI-NEXT: s_mov_b32 s7, 0xf000 6791; SI-NEXT: s_mov_b32 s4, s6 6792; SI-NEXT: s_mov_b32 s5, s6 6793; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32 6794; SI-NEXT: s_mov_b64 s[8:9], 0 6795; SI-NEXT: .LBB106_1: ; %atomicrmw.start 6796; SI-NEXT: ; =>This Inner Loop Header: Depth=1 6797; SI-NEXT: s_waitcnt vmcnt(0) 6798; SI-NEXT: v_mov_b32_e32 v11, v1 6799; SI-NEXT: v_mov_b32_e32 v10, v0 6800; SI-NEXT: v_cmp_gt_u64_e32 vcc, v[10:11], v[4:5] 6801; SI-NEXT: v_cndmask_b32_e32 v9, v5, v11, vcc 6802; SI-NEXT: v_cndmask_b32_e32 v8, v4, v10, vcc 6803; SI-NEXT: s_waitcnt expcnt(0) 6804; SI-NEXT: v_mov_b32_e32 v0, v8 6805; SI-NEXT: v_mov_b32_e32 v1, v9 6806; SI-NEXT: v_mov_b32_e32 v2, v10 6807; SI-NEXT: v_mov_b32_e32 v3, v11 6808; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:32 glc 6809; SI-NEXT: s_waitcnt vmcnt(0) 6810; SI-NEXT: buffer_wbinvl1 6811; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] 6812; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 6813; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] 6814; SI-NEXT: s_cbranch_execnz .LBB106_1 6815; SI-NEXT: ; %bb.2: ; %atomicrmw.end 6816; SI-NEXT: s_or_b64 exec, exec, s[8:9] 6817; SI-NEXT: s_waitcnt expcnt(0) 6818; SI-NEXT: s_setpc_b64 s[30:31] 6819; 6820; VI-LABEL: global_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory: 6821; VI: ; %bb.0: 6822; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6823; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0 6824; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc 6825; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] 6826; VI-NEXT: s_mov_b64 s[4:5], 0 6827; VI-NEXT: .LBB106_1: ; %atomicrmw.start 6828; VI-NEXT: ; =>This Inner Loop Header: Depth=1 6829; VI-NEXT: s_waitcnt vmcnt(0) 6830; VI-NEXT: v_mov_b32_e32 v9, v1 6831; VI-NEXT: v_mov_b32_e32 v8, v0 6832; VI-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3] 6833; VI-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc 6834; VI-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc 6835; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc 6836; VI-NEXT: s_waitcnt vmcnt(0) 6837; VI-NEXT: buffer_wbinvl1_vol 6838; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] 6839; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6840; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] 6841; VI-NEXT: s_cbranch_execnz .LBB106_1 6842; VI-NEXT: ; %bb.2: ; %atomicrmw.end 6843; VI-NEXT: s_or_b64 exec, exec, s[4:5] 6844; VI-NEXT: s_setpc_b64 s[30:31] 6845; 6846; GFX9-LABEL: global_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory: 6847; GFX9: ; %bb.0: 6848; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6849; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 6850; GFX9-NEXT: s_mov_b64 s[4:5], 0 6851; GFX9-NEXT: .LBB106_1: ; %atomicrmw.start 6852; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 6853; GFX9-NEXT: s_waitcnt vmcnt(0) 6854; GFX9-NEXT: v_mov_b32_e32 v7, v5 6855; GFX9-NEXT: v_mov_b32_e32 v6, v4 6856; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] 6857; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc 6858; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 6859; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc 6860; GFX9-NEXT: s_waitcnt vmcnt(0) 6861; GFX9-NEXT: buffer_wbinvl1_vol 6862; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 6863; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6864; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 6865; GFX9-NEXT: s_cbranch_execnz .LBB106_1 6866; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 6867; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 6868; GFX9-NEXT: v_mov_b32_e32 v0, v4 6869; GFX9-NEXT: v_mov_b32_e32 v1, v5 6870; GFX9-NEXT: s_setpc_b64 s[30:31] 6871 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 6872 %result = atomicrmw umax ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 6873 ret i64 %result 6874} 6875 6876; --------------------------------------------------------------------- 6877; atomicrmw umin 6878; --------------------------------------------------------------------- 6879 6880define void @global_atomic_umin_i64_noret(ptr addrspace(1) %ptr, i64 %in) { 6881; SI-LABEL: global_atomic_umin_i64_noret: 6882; SI: ; %bb.0: 6883; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6884; SI-NEXT: s_mov_b32 s6, 0 6885; SI-NEXT: s_mov_b32 s7, 0xf000 6886; SI-NEXT: s_mov_b32 s4, s6 6887; SI-NEXT: s_mov_b32 s5, s6 6888; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 6889; SI-NEXT: s_mov_b64 s[8:9], 0 6890; SI-NEXT: .LBB107_1: ; %atomicrmw.start 6891; SI-NEXT: ; =>This Inner Loop Header: Depth=1 6892; SI-NEXT: s_waitcnt vmcnt(0) 6893; SI-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] 6894; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc 6895; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 6896; SI-NEXT: s_waitcnt expcnt(0) 6897; SI-NEXT: v_mov_b32_e32 v11, v7 6898; SI-NEXT: v_mov_b32_e32 v10, v6 6899; SI-NEXT: v_mov_b32_e32 v9, v5 6900; SI-NEXT: v_mov_b32_e32 v8, v4 6901; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc 6902; SI-NEXT: s_waitcnt vmcnt(0) 6903; SI-NEXT: buffer_wbinvl1 6904; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] 6905; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 6906; SI-NEXT: v_mov_b32_e32 v6, v8 6907; SI-NEXT: v_mov_b32_e32 v7, v9 6908; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] 6909; SI-NEXT: s_cbranch_execnz .LBB107_1 6910; SI-NEXT: ; %bb.2: ; %atomicrmw.end 6911; SI-NEXT: s_or_b64 exec, exec, s[8:9] 6912; SI-NEXT: s_waitcnt expcnt(0) 6913; SI-NEXT: s_setpc_b64 s[30:31] 6914; 6915; VI-LABEL: global_atomic_umin_i64_noret: 6916; VI: ; %bb.0: 6917; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6918; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] 6919; VI-NEXT: s_mov_b64 s[4:5], 0 6920; VI-NEXT: .LBB107_1: ; %atomicrmw.start 6921; VI-NEXT: ; =>This Inner Loop Header: Depth=1 6922; VI-NEXT: s_waitcnt vmcnt(0) 6923; VI-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] 6924; VI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc 6925; VI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 6926; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc 6927; VI-NEXT: s_waitcnt vmcnt(0) 6928; VI-NEXT: buffer_wbinvl1_vol 6929; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 6930; VI-NEXT: v_mov_b32_e32 v7, v5 6931; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6932; VI-NEXT: v_mov_b32_e32 v6, v4 6933; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] 6934; VI-NEXT: s_cbranch_execnz .LBB107_1 6935; VI-NEXT: ; %bb.2: ; %atomicrmw.end 6936; VI-NEXT: s_or_b64 exec, exec, s[4:5] 6937; VI-NEXT: s_setpc_b64 s[30:31] 6938; 6939; GFX9-LABEL: global_atomic_umin_i64_noret: 6940; GFX9: ; %bb.0: 6941; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6942; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off 6943; GFX9-NEXT: s_mov_b64 s[4:5], 0 6944; GFX9-NEXT: .LBB107_1: ; %atomicrmw.start 6945; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 6946; GFX9-NEXT: s_waitcnt vmcnt(0) 6947; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] 6948; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc 6949; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 6950; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc 6951; GFX9-NEXT: s_waitcnt vmcnt(0) 6952; GFX9-NEXT: buffer_wbinvl1_vol 6953; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 6954; GFX9-NEXT: v_mov_b32_e32 v7, v5 6955; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6956; GFX9-NEXT: v_mov_b32_e32 v6, v4 6957; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 6958; GFX9-NEXT: s_cbranch_execnz .LBB107_1 6959; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 6960; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 6961; GFX9-NEXT: s_setpc_b64 s[30:31] 6962 %tmp0 = atomicrmw umin ptr addrspace(1) %ptr, i64 %in seq_cst 6963 ret void 6964} 6965 6966define void @global_atomic_umin_i64_noret_offset(ptr addrspace(1) %out, i64 %in) { 6967; SI-LABEL: global_atomic_umin_i64_noret_offset: 6968; SI: ; %bb.0: 6969; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6970; SI-NEXT: s_mov_b32 s6, 0 6971; SI-NEXT: s_mov_b32 s7, 0xf000 6972; SI-NEXT: s_mov_b32 s4, s6 6973; SI-NEXT: s_mov_b32 s5, s6 6974; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 6975; SI-NEXT: s_mov_b64 s[8:9], 0 6976; SI-NEXT: .LBB108_1: ; %atomicrmw.start 6977; SI-NEXT: ; =>This Inner Loop Header: Depth=1 6978; SI-NEXT: s_waitcnt vmcnt(0) 6979; SI-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] 6980; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc 6981; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 6982; SI-NEXT: s_waitcnt expcnt(0) 6983; SI-NEXT: v_mov_b32_e32 v11, v7 6984; SI-NEXT: v_mov_b32_e32 v10, v6 6985; SI-NEXT: v_mov_b32_e32 v9, v5 6986; SI-NEXT: v_mov_b32_e32 v8, v4 6987; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc 6988; SI-NEXT: s_waitcnt vmcnt(0) 6989; SI-NEXT: buffer_wbinvl1 6990; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] 6991; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 6992; SI-NEXT: v_mov_b32_e32 v6, v8 6993; SI-NEXT: v_mov_b32_e32 v7, v9 6994; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] 6995; SI-NEXT: s_cbranch_execnz .LBB108_1 6996; SI-NEXT: ; %bb.2: ; %atomicrmw.end 6997; SI-NEXT: s_or_b64 exec, exec, s[8:9] 6998; SI-NEXT: s_waitcnt expcnt(0) 6999; SI-NEXT: s_setpc_b64 s[30:31] 7000; 7001; VI-LABEL: global_atomic_umin_i64_noret_offset: 7002; VI: ; %bb.0: 7003; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7004; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 7005; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 7006; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] 7007; VI-NEXT: s_mov_b64 s[4:5], 0 7008; VI-NEXT: .LBB108_1: ; %atomicrmw.start 7009; VI-NEXT: ; =>This Inner Loop Header: Depth=1 7010; VI-NEXT: s_waitcnt vmcnt(0) 7011; VI-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] 7012; VI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc 7013; VI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 7014; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc 7015; VI-NEXT: s_waitcnt vmcnt(0) 7016; VI-NEXT: buffer_wbinvl1_vol 7017; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 7018; VI-NEXT: v_mov_b32_e32 v7, v5 7019; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7020; VI-NEXT: v_mov_b32_e32 v6, v4 7021; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] 7022; VI-NEXT: s_cbranch_execnz .LBB108_1 7023; VI-NEXT: ; %bb.2: ; %atomicrmw.end 7024; VI-NEXT: s_or_b64 exec, exec, s[4:5] 7025; VI-NEXT: s_setpc_b64 s[30:31] 7026; 7027; GFX9-LABEL: global_atomic_umin_i64_noret_offset: 7028; GFX9: ; %bb.0: 7029; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7030; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 7031; GFX9-NEXT: s_mov_b64 s[4:5], 0 7032; GFX9-NEXT: .LBB108_1: ; %atomicrmw.start 7033; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 7034; GFX9-NEXT: s_waitcnt vmcnt(0) 7035; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] 7036; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc 7037; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 7038; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc 7039; GFX9-NEXT: s_waitcnt vmcnt(0) 7040; GFX9-NEXT: buffer_wbinvl1_vol 7041; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 7042; GFX9-NEXT: v_mov_b32_e32 v7, v5 7043; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7044; GFX9-NEXT: v_mov_b32_e32 v6, v4 7045; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 7046; GFX9-NEXT: s_cbranch_execnz .LBB108_1 7047; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 7048; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 7049; GFX9-NEXT: s_setpc_b64 s[30:31] 7050 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 7051 %tmp0 = atomicrmw umin ptr addrspace(1) %gep, i64 %in seq_cst 7052 ret void 7053} 7054 7055define i64 @global_atomic_umin_i64_ret(ptr addrspace(1) %ptr, i64 %in) { 7056; SI-LABEL: global_atomic_umin_i64_ret: 7057; SI: ; %bb.0: 7058; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7059; SI-NEXT: v_mov_b32_e32 v5, v3 7060; SI-NEXT: v_mov_b32_e32 v4, v2 7061; SI-NEXT: v_mov_b32_e32 v7, v1 7062; SI-NEXT: v_mov_b32_e32 v6, v0 7063; SI-NEXT: s_mov_b32 s6, 0 7064; SI-NEXT: s_mov_b32 s7, 0xf000 7065; SI-NEXT: s_mov_b32 s4, s6 7066; SI-NEXT: s_mov_b32 s5, s6 7067; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 7068; SI-NEXT: s_mov_b64 s[8:9], 0 7069; SI-NEXT: .LBB109_1: ; %atomicrmw.start 7070; SI-NEXT: ; =>This Inner Loop Header: Depth=1 7071; SI-NEXT: s_waitcnt vmcnt(0) 7072; SI-NEXT: v_mov_b32_e32 v11, v1 7073; SI-NEXT: v_mov_b32_e32 v10, v0 7074; SI-NEXT: v_cmp_le_u64_e32 vcc, v[10:11], v[4:5] 7075; SI-NEXT: v_cndmask_b32_e32 v9, v5, v11, vcc 7076; SI-NEXT: v_cndmask_b32_e32 v8, v4, v10, vcc 7077; SI-NEXT: s_waitcnt expcnt(0) 7078; SI-NEXT: v_mov_b32_e32 v0, v8 7079; SI-NEXT: v_mov_b32_e32 v1, v9 7080; SI-NEXT: v_mov_b32_e32 v2, v10 7081; SI-NEXT: v_mov_b32_e32 v3, v11 7082; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc 7083; SI-NEXT: s_waitcnt vmcnt(0) 7084; SI-NEXT: buffer_wbinvl1 7085; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] 7086; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 7087; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] 7088; SI-NEXT: s_cbranch_execnz .LBB109_1 7089; SI-NEXT: ; %bb.2: ; %atomicrmw.end 7090; SI-NEXT: s_or_b64 exec, exec, s[8:9] 7091; SI-NEXT: s_waitcnt expcnt(0) 7092; SI-NEXT: s_setpc_b64 s[30:31] 7093; 7094; VI-LABEL: global_atomic_umin_i64_ret: 7095; VI: ; %bb.0: 7096; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7097; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] 7098; VI-NEXT: s_mov_b64 s[4:5], 0 7099; VI-NEXT: .LBB109_1: ; %atomicrmw.start 7100; VI-NEXT: ; =>This Inner Loop Header: Depth=1 7101; VI-NEXT: s_waitcnt vmcnt(0) 7102; VI-NEXT: v_mov_b32_e32 v7, v5 7103; VI-NEXT: v_mov_b32_e32 v6, v4 7104; VI-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] 7105; VI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc 7106; VI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 7107; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc 7108; VI-NEXT: s_waitcnt vmcnt(0) 7109; VI-NEXT: buffer_wbinvl1_vol 7110; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 7111; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7112; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] 7113; VI-NEXT: s_cbranch_execnz .LBB109_1 7114; VI-NEXT: ; %bb.2: ; %atomicrmw.end 7115; VI-NEXT: s_or_b64 exec, exec, s[4:5] 7116; VI-NEXT: v_mov_b32_e32 v0, v4 7117; VI-NEXT: v_mov_b32_e32 v1, v5 7118; VI-NEXT: s_setpc_b64 s[30:31] 7119; 7120; GFX9-LABEL: global_atomic_umin_i64_ret: 7121; GFX9: ; %bb.0: 7122; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7123; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off 7124; GFX9-NEXT: s_mov_b64 s[4:5], 0 7125; GFX9-NEXT: .LBB109_1: ; %atomicrmw.start 7126; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 7127; GFX9-NEXT: s_waitcnt vmcnt(0) 7128; GFX9-NEXT: v_mov_b32_e32 v7, v5 7129; GFX9-NEXT: v_mov_b32_e32 v6, v4 7130; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] 7131; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc 7132; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 7133; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc 7134; GFX9-NEXT: s_waitcnt vmcnt(0) 7135; GFX9-NEXT: buffer_wbinvl1_vol 7136; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 7137; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7138; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 7139; GFX9-NEXT: s_cbranch_execnz .LBB109_1 7140; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 7141; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 7142; GFX9-NEXT: v_mov_b32_e32 v0, v4 7143; GFX9-NEXT: v_mov_b32_e32 v1, v5 7144; GFX9-NEXT: s_setpc_b64 s[30:31] 7145 %result = atomicrmw umin ptr addrspace(1) %ptr, i64 %in seq_cst 7146 ret i64 %result 7147} 7148 7149define i64 @global_atomic_umin_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { 7150; SI-LABEL: global_atomic_umin_i64_ret_offset: 7151; SI: ; %bb.0: 7152; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7153; SI-NEXT: v_mov_b32_e32 v5, v3 7154; SI-NEXT: v_mov_b32_e32 v4, v2 7155; SI-NEXT: v_mov_b32_e32 v7, v1 7156; SI-NEXT: v_mov_b32_e32 v6, v0 7157; SI-NEXT: s_mov_b32 s6, 0 7158; SI-NEXT: s_mov_b32 s7, 0xf000 7159; SI-NEXT: s_mov_b32 s4, s6 7160; SI-NEXT: s_mov_b32 s5, s6 7161; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32 7162; SI-NEXT: s_mov_b64 s[8:9], 0 7163; SI-NEXT: .LBB110_1: ; %atomicrmw.start 7164; SI-NEXT: ; =>This Inner Loop Header: Depth=1 7165; SI-NEXT: s_waitcnt vmcnt(0) 7166; SI-NEXT: v_mov_b32_e32 v11, v1 7167; SI-NEXT: v_mov_b32_e32 v10, v0 7168; SI-NEXT: v_cmp_le_u64_e32 vcc, v[10:11], v[4:5] 7169; SI-NEXT: v_cndmask_b32_e32 v9, v5, v11, vcc 7170; SI-NEXT: v_cndmask_b32_e32 v8, v4, v10, vcc 7171; SI-NEXT: s_waitcnt expcnt(0) 7172; SI-NEXT: v_mov_b32_e32 v0, v8 7173; SI-NEXT: v_mov_b32_e32 v1, v9 7174; SI-NEXT: v_mov_b32_e32 v2, v10 7175; SI-NEXT: v_mov_b32_e32 v3, v11 7176; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:32 glc 7177; SI-NEXT: s_waitcnt vmcnt(0) 7178; SI-NEXT: buffer_wbinvl1 7179; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] 7180; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 7181; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] 7182; SI-NEXT: s_cbranch_execnz .LBB110_1 7183; SI-NEXT: ; %bb.2: ; %atomicrmw.end 7184; SI-NEXT: s_or_b64 exec, exec, s[8:9] 7185; SI-NEXT: s_waitcnt expcnt(0) 7186; SI-NEXT: s_setpc_b64 s[30:31] 7187; 7188; VI-LABEL: global_atomic_umin_i64_ret_offset: 7189; VI: ; %bb.0: 7190; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7191; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0 7192; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc 7193; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] 7194; VI-NEXT: s_mov_b64 s[4:5], 0 7195; VI-NEXT: .LBB110_1: ; %atomicrmw.start 7196; VI-NEXT: ; =>This Inner Loop Header: Depth=1 7197; VI-NEXT: s_waitcnt vmcnt(0) 7198; VI-NEXT: v_mov_b32_e32 v9, v1 7199; VI-NEXT: v_mov_b32_e32 v8, v0 7200; VI-NEXT: v_cmp_le_u64_e32 vcc, v[8:9], v[2:3] 7201; VI-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc 7202; VI-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc 7203; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc 7204; VI-NEXT: s_waitcnt vmcnt(0) 7205; VI-NEXT: buffer_wbinvl1_vol 7206; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] 7207; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7208; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] 7209; VI-NEXT: s_cbranch_execnz .LBB110_1 7210; VI-NEXT: ; %bb.2: ; %atomicrmw.end 7211; VI-NEXT: s_or_b64 exec, exec, s[4:5] 7212; VI-NEXT: s_setpc_b64 s[30:31] 7213; 7214; GFX9-LABEL: global_atomic_umin_i64_ret_offset: 7215; GFX9: ; %bb.0: 7216; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7217; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 7218; GFX9-NEXT: s_mov_b64 s[4:5], 0 7219; GFX9-NEXT: .LBB110_1: ; %atomicrmw.start 7220; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 7221; GFX9-NEXT: s_waitcnt vmcnt(0) 7222; GFX9-NEXT: v_mov_b32_e32 v7, v5 7223; GFX9-NEXT: v_mov_b32_e32 v6, v4 7224; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] 7225; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc 7226; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 7227; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc 7228; GFX9-NEXT: s_waitcnt vmcnt(0) 7229; GFX9-NEXT: buffer_wbinvl1_vol 7230; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 7231; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7232; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 7233; GFX9-NEXT: s_cbranch_execnz .LBB110_1 7234; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 7235; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 7236; GFX9-NEXT: v_mov_b32_e32 v0, v4 7237; GFX9-NEXT: v_mov_b32_e32 v1, v5 7238; GFX9-NEXT: s_setpc_b64 s[30:31] 7239 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 7240 %result = atomicrmw umin ptr addrspace(1) %gep, i64 %in seq_cst 7241 ret i64 %result 7242} 7243 7244define amdgpu_gfx void @global_atomic_umin_i64_noret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) { 7245; SI-LABEL: global_atomic_umin_i64_noret_scalar: 7246; SI: ; %bb.0: 7247; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7248; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 7249; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill 7250; SI-NEXT: s_mov_b64 exec, s[34:35] 7251; SI-NEXT: s_waitcnt expcnt(0) 7252; SI-NEXT: v_writelane_b32 v10, s6, 0 7253; SI-NEXT: v_writelane_b32 v10, s7, 1 7254; SI-NEXT: s_mov_b32 s35, s7 7255; SI-NEXT: s_mov_b32 s34, s6 7256; SI-NEXT: s_mov_b32 s7, 0xf000 7257; SI-NEXT: s_mov_b32 s6, -1 7258; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 7259; SI-NEXT: s_mov_b64 s[36:37], 0 7260; SI-NEXT: v_mov_b32_e32 v4, s35 7261; SI-NEXT: v_mov_b32_e32 v5, s34 7262; SI-NEXT: .LBB111_1: ; %atomicrmw.start 7263; SI-NEXT: ; =>This Inner Loop Header: Depth=1 7264; SI-NEXT: s_waitcnt vmcnt(0) 7265; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[2:3] 7266; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc 7267; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc 7268; SI-NEXT: s_waitcnt expcnt(0) 7269; SI-NEXT: v_mov_b32_e32 v9, v3 7270; SI-NEXT: v_mov_b32_e32 v8, v2 7271; SI-NEXT: v_mov_b32_e32 v7, v1 7272; SI-NEXT: v_mov_b32_e32 v6, v0 7273; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc 7274; SI-NEXT: s_waitcnt vmcnt(0) 7275; SI-NEXT: buffer_wbinvl1 7276; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] 7277; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] 7278; SI-NEXT: v_mov_b32_e32 v2, v6 7279; SI-NEXT: v_mov_b32_e32 v3, v7 7280; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] 7281; SI-NEXT: s_cbranch_execnz .LBB111_1 7282; SI-NEXT: ; %bb.2: ; %atomicrmw.end 7283; SI-NEXT: s_or_b64 exec, exec, s[36:37] 7284; SI-NEXT: v_readlane_b32 s7, v10, 1 7285; SI-NEXT: v_readlane_b32 s6, v10, 0 7286; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 7287; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload 7288; SI-NEXT: s_mov_b64 exec, s[34:35] 7289; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 7290; SI-NEXT: s_setpc_b64 s[30:31] 7291; 7292; VI-LABEL: global_atomic_umin_i64_noret_scalar: 7293; VI: ; %bb.0: 7294; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7295; VI-NEXT: v_mov_b32_e32 v0, s4 7296; VI-NEXT: v_mov_b32_e32 v1, s5 7297; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] 7298; VI-NEXT: v_mov_b32_e32 v4, s4 7299; VI-NEXT: s_mov_b64 s[34:35], 0 7300; VI-NEXT: v_mov_b32_e32 v6, s7 7301; VI-NEXT: v_mov_b32_e32 v7, s6 7302; VI-NEXT: v_mov_b32_e32 v5, s5 7303; VI-NEXT: .LBB111_1: ; %atomicrmw.start 7304; VI-NEXT: ; =>This Inner Loop Header: Depth=1 7305; VI-NEXT: s_waitcnt vmcnt(0) 7306; VI-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] 7307; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc 7308; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc 7309; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc 7310; VI-NEXT: s_waitcnt vmcnt(0) 7311; VI-NEXT: buffer_wbinvl1_vol 7312; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] 7313; VI-NEXT: v_mov_b32_e32 v3, v1 7314; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 7315; VI-NEXT: v_mov_b32_e32 v2, v0 7316; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] 7317; VI-NEXT: s_cbranch_execnz .LBB111_1 7318; VI-NEXT: ; %bb.2: ; %atomicrmw.end 7319; VI-NEXT: s_or_b64 exec, exec, s[34:35] 7320; VI-NEXT: s_setpc_b64 s[30:31] 7321; 7322; GFX9-LABEL: global_atomic_umin_i64_noret_scalar: 7323; GFX9: ; %bb.0: 7324; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7325; GFX9-NEXT: v_mov_b32_e32 v4, 0 7326; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] 7327; GFX9-NEXT: s_mov_b64 s[34:35], 0 7328; GFX9-NEXT: v_mov_b32_e32 v5, s7 7329; GFX9-NEXT: v_mov_b32_e32 v6, s6 7330; GFX9-NEXT: .LBB111_1: ; %atomicrmw.start 7331; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 7332; GFX9-NEXT: s_waitcnt vmcnt(0) 7333; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] 7334; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc 7335; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc 7336; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc 7337; GFX9-NEXT: s_waitcnt vmcnt(0) 7338; GFX9-NEXT: buffer_wbinvl1_vol 7339; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] 7340; GFX9-NEXT: v_mov_b32_e32 v3, v1 7341; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 7342; GFX9-NEXT: v_mov_b32_e32 v2, v0 7343; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] 7344; GFX9-NEXT: s_cbranch_execnz .LBB111_1 7345; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 7346; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] 7347; GFX9-NEXT: s_setpc_b64 s[30:31] 7348 %tmp0 = atomicrmw umin ptr addrspace(1) %ptr, i64 %in seq_cst 7349 ret void 7350} 7351 7352define amdgpu_gfx void @global_atomic_umin_i64_noret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) { 7353; SI-LABEL: global_atomic_umin_i64_noret_offset_scalar: 7354; SI: ; %bb.0: 7355; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7356; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 7357; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill 7358; SI-NEXT: s_mov_b64 exec, s[34:35] 7359; SI-NEXT: s_waitcnt expcnt(0) 7360; SI-NEXT: v_writelane_b32 v10, s6, 0 7361; SI-NEXT: v_writelane_b32 v10, s7, 1 7362; SI-NEXT: s_mov_b32 s35, s7 7363; SI-NEXT: s_mov_b32 s34, s6 7364; SI-NEXT: s_mov_b32 s7, 0xf000 7365; SI-NEXT: s_mov_b32 s6, -1 7366; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32 7367; SI-NEXT: s_mov_b64 s[36:37], 0 7368; SI-NEXT: v_mov_b32_e32 v4, s35 7369; SI-NEXT: v_mov_b32_e32 v5, s34 7370; SI-NEXT: .LBB112_1: ; %atomicrmw.start 7371; SI-NEXT: ; =>This Inner Loop Header: Depth=1 7372; SI-NEXT: s_waitcnt vmcnt(0) 7373; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[2:3] 7374; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc 7375; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc 7376; SI-NEXT: s_waitcnt expcnt(0) 7377; SI-NEXT: v_mov_b32_e32 v9, v3 7378; SI-NEXT: v_mov_b32_e32 v8, v2 7379; SI-NEXT: v_mov_b32_e32 v7, v1 7380; SI-NEXT: v_mov_b32_e32 v6, v0 7381; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc 7382; SI-NEXT: s_waitcnt vmcnt(0) 7383; SI-NEXT: buffer_wbinvl1 7384; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] 7385; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] 7386; SI-NEXT: v_mov_b32_e32 v2, v6 7387; SI-NEXT: v_mov_b32_e32 v3, v7 7388; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] 7389; SI-NEXT: s_cbranch_execnz .LBB112_1 7390; SI-NEXT: ; %bb.2: ; %atomicrmw.end 7391; SI-NEXT: s_or_b64 exec, exec, s[36:37] 7392; SI-NEXT: v_readlane_b32 s7, v10, 1 7393; SI-NEXT: v_readlane_b32 s6, v10, 0 7394; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 7395; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload 7396; SI-NEXT: s_mov_b64 exec, s[34:35] 7397; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 7398; SI-NEXT: s_setpc_b64 s[30:31] 7399; 7400; VI-LABEL: global_atomic_umin_i64_noret_offset_scalar: 7401; VI: ; %bb.0: 7402; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7403; VI-NEXT: s_add_u32 s34, s4, 32 7404; VI-NEXT: s_addc_u32 s35, s5, 0 7405; VI-NEXT: v_mov_b32_e32 v4, s34 7406; VI-NEXT: v_mov_b32_e32 v5, s35 7407; VI-NEXT: flat_load_dwordx2 v[2:3], v[4:5] 7408; VI-NEXT: s_mov_b64 s[34:35], 0 7409; VI-NEXT: v_mov_b32_e32 v6, s7 7410; VI-NEXT: v_mov_b32_e32 v7, s6 7411; VI-NEXT: .LBB112_1: ; %atomicrmw.start 7412; VI-NEXT: ; =>This Inner Loop Header: Depth=1 7413; VI-NEXT: s_waitcnt vmcnt(0) 7414; VI-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] 7415; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc 7416; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc 7417; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc 7418; VI-NEXT: s_waitcnt vmcnt(0) 7419; VI-NEXT: buffer_wbinvl1_vol 7420; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] 7421; VI-NEXT: v_mov_b32_e32 v3, v1 7422; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 7423; VI-NEXT: v_mov_b32_e32 v2, v0 7424; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] 7425; VI-NEXT: s_cbranch_execnz .LBB112_1 7426; VI-NEXT: ; %bb.2: ; %atomicrmw.end 7427; VI-NEXT: s_or_b64 exec, exec, s[34:35] 7428; VI-NEXT: s_setpc_b64 s[30:31] 7429; 7430; GFX9-LABEL: global_atomic_umin_i64_noret_offset_scalar: 7431; GFX9: ; %bb.0: 7432; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7433; GFX9-NEXT: v_mov_b32_e32 v4, 0 7434; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32 7435; GFX9-NEXT: s_mov_b64 s[34:35], 0 7436; GFX9-NEXT: v_mov_b32_e32 v5, s7 7437; GFX9-NEXT: v_mov_b32_e32 v6, s6 7438; GFX9-NEXT: .LBB112_1: ; %atomicrmw.start 7439; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 7440; GFX9-NEXT: s_waitcnt vmcnt(0) 7441; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] 7442; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc 7443; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc 7444; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc 7445; GFX9-NEXT: s_waitcnt vmcnt(0) 7446; GFX9-NEXT: buffer_wbinvl1_vol 7447; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] 7448; GFX9-NEXT: v_mov_b32_e32 v3, v1 7449; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 7450; GFX9-NEXT: v_mov_b32_e32 v2, v0 7451; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] 7452; GFX9-NEXT: s_cbranch_execnz .LBB112_1 7453; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 7454; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] 7455; GFX9-NEXT: s_setpc_b64 s[30:31] 7456 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 7457 %tmp0 = atomicrmw umin ptr addrspace(1) %gep, i64 %in seq_cst 7458 ret void 7459} 7460 7461define amdgpu_gfx i64 @global_atomic_umin_i64_ret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) { 7462; SI-LABEL: global_atomic_umin_i64_ret_scalar: 7463; SI: ; %bb.0: 7464; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7465; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 7466; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill 7467; SI-NEXT: s_mov_b64 exec, s[34:35] 7468; SI-NEXT: s_waitcnt expcnt(0) 7469; SI-NEXT: v_writelane_b32 v10, s6, 0 7470; SI-NEXT: v_writelane_b32 v10, s7, 1 7471; SI-NEXT: s_mov_b32 s35, s7 7472; SI-NEXT: s_mov_b32 s34, s6 7473; SI-NEXT: s_mov_b32 s7, 0xf000 7474; SI-NEXT: s_mov_b32 s6, -1 7475; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 7476; SI-NEXT: s_mov_b64 s[36:37], 0 7477; SI-NEXT: v_mov_b32_e32 v4, s35 7478; SI-NEXT: v_mov_b32_e32 v5, s34 7479; SI-NEXT: .LBB113_1: ; %atomicrmw.start 7480; SI-NEXT: ; =>This Inner Loop Header: Depth=1 7481; SI-NEXT: s_waitcnt vmcnt(0) 7482; SI-NEXT: v_mov_b32_e32 v9, v1 7483; SI-NEXT: v_mov_b32_e32 v8, v0 7484; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[8:9] 7485; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc 7486; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc 7487; SI-NEXT: s_waitcnt expcnt(0) 7488; SI-NEXT: v_mov_b32_e32 v0, v6 7489; SI-NEXT: v_mov_b32_e32 v1, v7 7490; SI-NEXT: v_mov_b32_e32 v2, v8 7491; SI-NEXT: v_mov_b32_e32 v3, v9 7492; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc 7493; SI-NEXT: s_waitcnt vmcnt(0) 7494; SI-NEXT: buffer_wbinvl1 7495; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] 7496; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] 7497; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] 7498; SI-NEXT: s_cbranch_execnz .LBB113_1 7499; SI-NEXT: ; %bb.2: ; %atomicrmw.end 7500; SI-NEXT: s_or_b64 exec, exec, s[36:37] 7501; SI-NEXT: v_readlane_b32 s7, v10, 1 7502; SI-NEXT: v_readlane_b32 s6, v10, 0 7503; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 7504; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload 7505; SI-NEXT: s_mov_b64 exec, s[34:35] 7506; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 7507; SI-NEXT: s_setpc_b64 s[30:31] 7508; 7509; VI-LABEL: global_atomic_umin_i64_ret_scalar: 7510; VI: ; %bb.0: 7511; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7512; VI-NEXT: v_mov_b32_e32 v0, s4 7513; VI-NEXT: v_mov_b32_e32 v1, s5 7514; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 7515; VI-NEXT: v_mov_b32_e32 v2, s4 7516; VI-NEXT: s_mov_b64 s[34:35], 0 7517; VI-NEXT: v_mov_b32_e32 v4, s7 7518; VI-NEXT: v_mov_b32_e32 v5, s6 7519; VI-NEXT: v_mov_b32_e32 v3, s5 7520; VI-NEXT: .LBB113_1: ; %atomicrmw.start 7521; VI-NEXT: ; =>This Inner Loop Header: Depth=1 7522; VI-NEXT: s_waitcnt vmcnt(0) 7523; VI-NEXT: v_mov_b32_e32 v9, v1 7524; VI-NEXT: v_mov_b32_e32 v8, v0 7525; VI-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] 7526; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc 7527; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc 7528; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc 7529; VI-NEXT: s_waitcnt vmcnt(0) 7530; VI-NEXT: buffer_wbinvl1_vol 7531; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] 7532; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 7533; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] 7534; VI-NEXT: s_cbranch_execnz .LBB113_1 7535; VI-NEXT: ; %bb.2: ; %atomicrmw.end 7536; VI-NEXT: s_or_b64 exec, exec, s[34:35] 7537; VI-NEXT: s_setpc_b64 s[30:31] 7538; 7539; GFX9-LABEL: global_atomic_umin_i64_ret_scalar: 7540; GFX9: ; %bb.0: 7541; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7542; GFX9-NEXT: v_mov_b32_e32 v2, 0 7543; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] 7544; GFX9-NEXT: s_mov_b64 s[34:35], 0 7545; GFX9-NEXT: v_mov_b32_e32 v3, s7 7546; GFX9-NEXT: v_mov_b32_e32 v4, s6 7547; GFX9-NEXT: .LBB113_1: ; %atomicrmw.start 7548; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 7549; GFX9-NEXT: s_waitcnt vmcnt(0) 7550; GFX9-NEXT: v_mov_b32_e32 v8, v1 7551; GFX9-NEXT: v_mov_b32_e32 v7, v0 7552; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[7:8] 7553; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v8, vcc 7554; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v7, vcc 7555; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[5:8], s[4:5] glc 7556; GFX9-NEXT: s_waitcnt vmcnt(0) 7557; GFX9-NEXT: buffer_wbinvl1_vol 7558; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] 7559; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 7560; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] 7561; GFX9-NEXT: s_cbranch_execnz .LBB113_1 7562; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 7563; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] 7564; GFX9-NEXT: s_setpc_b64 s[30:31] 7565 %result = atomicrmw umin ptr addrspace(1) %ptr, i64 %in seq_cst 7566 ret i64 %result 7567} 7568 7569define amdgpu_gfx i64 @global_atomic_umin_i64_ret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) { 7570; SI-LABEL: global_atomic_umin_i64_ret_offset_scalar: 7571; SI: ; %bb.0: 7572; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7573; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 7574; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill 7575; SI-NEXT: s_mov_b64 exec, s[34:35] 7576; SI-NEXT: s_waitcnt expcnt(0) 7577; SI-NEXT: v_writelane_b32 v10, s6, 0 7578; SI-NEXT: v_writelane_b32 v10, s7, 1 7579; SI-NEXT: s_mov_b32 s35, s7 7580; SI-NEXT: s_mov_b32 s34, s6 7581; SI-NEXT: s_mov_b32 s7, 0xf000 7582; SI-NEXT: s_mov_b32 s6, -1 7583; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 7584; SI-NEXT: s_mov_b64 s[36:37], 0 7585; SI-NEXT: v_mov_b32_e32 v4, s35 7586; SI-NEXT: v_mov_b32_e32 v5, s34 7587; SI-NEXT: .LBB114_1: ; %atomicrmw.start 7588; SI-NEXT: ; =>This Inner Loop Header: Depth=1 7589; SI-NEXT: s_waitcnt vmcnt(0) 7590; SI-NEXT: v_mov_b32_e32 v9, v1 7591; SI-NEXT: v_mov_b32_e32 v8, v0 7592; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[8:9] 7593; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc 7594; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc 7595; SI-NEXT: s_waitcnt expcnt(0) 7596; SI-NEXT: v_mov_b32_e32 v0, v6 7597; SI-NEXT: v_mov_b32_e32 v1, v7 7598; SI-NEXT: v_mov_b32_e32 v2, v8 7599; SI-NEXT: v_mov_b32_e32 v3, v9 7600; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc 7601; SI-NEXT: s_waitcnt vmcnt(0) 7602; SI-NEXT: buffer_wbinvl1 7603; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] 7604; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] 7605; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] 7606; SI-NEXT: s_cbranch_execnz .LBB114_1 7607; SI-NEXT: ; %bb.2: ; %atomicrmw.end 7608; SI-NEXT: s_or_b64 exec, exec, s[36:37] 7609; SI-NEXT: v_readlane_b32 s7, v10, 1 7610; SI-NEXT: v_readlane_b32 s6, v10, 0 7611; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 7612; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload 7613; SI-NEXT: s_mov_b64 exec, s[34:35] 7614; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 7615; SI-NEXT: s_setpc_b64 s[30:31] 7616; 7617; VI-LABEL: global_atomic_umin_i64_ret_offset_scalar: 7618; VI: ; %bb.0: 7619; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7620; VI-NEXT: s_add_u32 s34, s4, 32 7621; VI-NEXT: s_addc_u32 s35, s5, 0 7622; VI-NEXT: v_mov_b32_e32 v2, s34 7623; VI-NEXT: v_mov_b32_e32 v3, s35 7624; VI-NEXT: flat_load_dwordx2 v[0:1], v[2:3] 7625; VI-NEXT: s_mov_b64 s[34:35], 0 7626; VI-NEXT: v_mov_b32_e32 v4, s7 7627; VI-NEXT: v_mov_b32_e32 v5, s6 7628; VI-NEXT: .LBB114_1: ; %atomicrmw.start 7629; VI-NEXT: ; =>This Inner Loop Header: Depth=1 7630; VI-NEXT: s_waitcnt vmcnt(0) 7631; VI-NEXT: v_mov_b32_e32 v9, v1 7632; VI-NEXT: v_mov_b32_e32 v8, v0 7633; VI-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] 7634; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc 7635; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc 7636; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc 7637; VI-NEXT: s_waitcnt vmcnt(0) 7638; VI-NEXT: buffer_wbinvl1_vol 7639; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] 7640; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 7641; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] 7642; VI-NEXT: s_cbranch_execnz .LBB114_1 7643; VI-NEXT: ; %bb.2: ; %atomicrmw.end 7644; VI-NEXT: s_or_b64 exec, exec, s[34:35] 7645; VI-NEXT: s_setpc_b64 s[30:31] 7646; 7647; GFX9-LABEL: global_atomic_umin_i64_ret_offset_scalar: 7648; GFX9: ; %bb.0: 7649; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7650; GFX9-NEXT: v_mov_b32_e32 v2, 0 7651; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32 7652; GFX9-NEXT: s_mov_b64 s[34:35], 0 7653; GFX9-NEXT: v_mov_b32_e32 v3, s7 7654; GFX9-NEXT: v_mov_b32_e32 v4, s6 7655; GFX9-NEXT: .LBB114_1: ; %atomicrmw.start 7656; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 7657; GFX9-NEXT: s_waitcnt vmcnt(0) 7658; GFX9-NEXT: v_mov_b32_e32 v8, v1 7659; GFX9-NEXT: v_mov_b32_e32 v7, v0 7660; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[7:8] 7661; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v8, vcc 7662; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v7, vcc 7663; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[5:8], s[4:5] offset:32 glc 7664; GFX9-NEXT: s_waitcnt vmcnt(0) 7665; GFX9-NEXT: buffer_wbinvl1_vol 7666; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] 7667; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 7668; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] 7669; GFX9-NEXT: s_cbranch_execnz .LBB114_1 7670; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 7671; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] 7672; GFX9-NEXT: s_setpc_b64 s[30:31] 7673 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 7674 %result = atomicrmw umin ptr addrspace(1) %gep, i64 %in seq_cst 7675 ret i64 %result 7676} 7677 7678define void @global_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) { 7679; SI-LABEL: global_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory: 7680; SI: ; %bb.0: 7681; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7682; SI-NEXT: s_mov_b32 s6, 0 7683; SI-NEXT: s_mov_b32 s7, 0xf000 7684; SI-NEXT: s_mov_b32 s4, s6 7685; SI-NEXT: s_mov_b32 s5, s6 7686; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 7687; SI-NEXT: s_mov_b64 s[8:9], 0 7688; SI-NEXT: .LBB115_1: ; %atomicrmw.start 7689; SI-NEXT: ; =>This Inner Loop Header: Depth=1 7690; SI-NEXT: s_waitcnt vmcnt(0) 7691; SI-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] 7692; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc 7693; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 7694; SI-NEXT: s_waitcnt expcnt(0) 7695; SI-NEXT: v_mov_b32_e32 v11, v7 7696; SI-NEXT: v_mov_b32_e32 v10, v6 7697; SI-NEXT: v_mov_b32_e32 v9, v5 7698; SI-NEXT: v_mov_b32_e32 v8, v4 7699; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc 7700; SI-NEXT: s_waitcnt vmcnt(0) 7701; SI-NEXT: buffer_wbinvl1 7702; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] 7703; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 7704; SI-NEXT: v_mov_b32_e32 v6, v8 7705; SI-NEXT: v_mov_b32_e32 v7, v9 7706; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] 7707; SI-NEXT: s_cbranch_execnz .LBB115_1 7708; SI-NEXT: ; %bb.2: ; %atomicrmw.end 7709; SI-NEXT: s_or_b64 exec, exec, s[8:9] 7710; SI-NEXT: s_waitcnt expcnt(0) 7711; SI-NEXT: s_setpc_b64 s[30:31] 7712; 7713; VI-LABEL: global_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory: 7714; VI: ; %bb.0: 7715; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7716; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 7717; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 7718; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] 7719; VI-NEXT: s_mov_b64 s[4:5], 0 7720; VI-NEXT: .LBB115_1: ; %atomicrmw.start 7721; VI-NEXT: ; =>This Inner Loop Header: Depth=1 7722; VI-NEXT: s_waitcnt vmcnt(0) 7723; VI-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] 7724; VI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc 7725; VI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 7726; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc 7727; VI-NEXT: s_waitcnt vmcnt(0) 7728; VI-NEXT: buffer_wbinvl1_vol 7729; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 7730; VI-NEXT: v_mov_b32_e32 v7, v5 7731; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7732; VI-NEXT: v_mov_b32_e32 v6, v4 7733; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] 7734; VI-NEXT: s_cbranch_execnz .LBB115_1 7735; VI-NEXT: ; %bb.2: ; %atomicrmw.end 7736; VI-NEXT: s_or_b64 exec, exec, s[4:5] 7737; VI-NEXT: s_setpc_b64 s[30:31] 7738; 7739; GFX9-LABEL: global_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory: 7740; GFX9: ; %bb.0: 7741; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7742; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 7743; GFX9-NEXT: s_mov_b64 s[4:5], 0 7744; GFX9-NEXT: .LBB115_1: ; %atomicrmw.start 7745; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 7746; GFX9-NEXT: s_waitcnt vmcnt(0) 7747; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] 7748; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc 7749; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 7750; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc 7751; GFX9-NEXT: s_waitcnt vmcnt(0) 7752; GFX9-NEXT: buffer_wbinvl1_vol 7753; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 7754; GFX9-NEXT: v_mov_b32_e32 v7, v5 7755; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7756; GFX9-NEXT: v_mov_b32_e32 v6, v4 7757; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 7758; GFX9-NEXT: s_cbranch_execnz .LBB115_1 7759; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 7760; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 7761; GFX9-NEXT: s_setpc_b64 s[30:31] 7762 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 7763 %tmp0 = atomicrmw umin ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 7764 ret void 7765} 7766 7767define i64 @global_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) { 7768; SI-LABEL: global_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory: 7769; SI: ; %bb.0: 7770; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7771; SI-NEXT: v_mov_b32_e32 v5, v3 7772; SI-NEXT: v_mov_b32_e32 v4, v2 7773; SI-NEXT: v_mov_b32_e32 v7, v1 7774; SI-NEXT: v_mov_b32_e32 v6, v0 7775; SI-NEXT: s_mov_b32 s6, 0 7776; SI-NEXT: s_mov_b32 s7, 0xf000 7777; SI-NEXT: s_mov_b32 s4, s6 7778; SI-NEXT: s_mov_b32 s5, s6 7779; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32 7780; SI-NEXT: s_mov_b64 s[8:9], 0 7781; SI-NEXT: .LBB116_1: ; %atomicrmw.start 7782; SI-NEXT: ; =>This Inner Loop Header: Depth=1 7783; SI-NEXT: s_waitcnt vmcnt(0) 7784; SI-NEXT: v_mov_b32_e32 v11, v1 7785; SI-NEXT: v_mov_b32_e32 v10, v0 7786; SI-NEXT: v_cmp_le_u64_e32 vcc, v[10:11], v[4:5] 7787; SI-NEXT: v_cndmask_b32_e32 v9, v5, v11, vcc 7788; SI-NEXT: v_cndmask_b32_e32 v8, v4, v10, vcc 7789; SI-NEXT: s_waitcnt expcnt(0) 7790; SI-NEXT: v_mov_b32_e32 v0, v8 7791; SI-NEXT: v_mov_b32_e32 v1, v9 7792; SI-NEXT: v_mov_b32_e32 v2, v10 7793; SI-NEXT: v_mov_b32_e32 v3, v11 7794; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:32 glc 7795; SI-NEXT: s_waitcnt vmcnt(0) 7796; SI-NEXT: buffer_wbinvl1 7797; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] 7798; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 7799; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] 7800; SI-NEXT: s_cbranch_execnz .LBB116_1 7801; SI-NEXT: ; %bb.2: ; %atomicrmw.end 7802; SI-NEXT: s_or_b64 exec, exec, s[8:9] 7803; SI-NEXT: s_waitcnt expcnt(0) 7804; SI-NEXT: s_setpc_b64 s[30:31] 7805; 7806; VI-LABEL: global_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory: 7807; VI: ; %bb.0: 7808; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7809; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0 7810; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc 7811; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] 7812; VI-NEXT: s_mov_b64 s[4:5], 0 7813; VI-NEXT: .LBB116_1: ; %atomicrmw.start 7814; VI-NEXT: ; =>This Inner Loop Header: Depth=1 7815; VI-NEXT: s_waitcnt vmcnt(0) 7816; VI-NEXT: v_mov_b32_e32 v9, v1 7817; VI-NEXT: v_mov_b32_e32 v8, v0 7818; VI-NEXT: v_cmp_le_u64_e32 vcc, v[8:9], v[2:3] 7819; VI-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc 7820; VI-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc 7821; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc 7822; VI-NEXT: s_waitcnt vmcnt(0) 7823; VI-NEXT: buffer_wbinvl1_vol 7824; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] 7825; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7826; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] 7827; VI-NEXT: s_cbranch_execnz .LBB116_1 7828; VI-NEXT: ; %bb.2: ; %atomicrmw.end 7829; VI-NEXT: s_or_b64 exec, exec, s[4:5] 7830; VI-NEXT: s_setpc_b64 s[30:31] 7831; 7832; GFX9-LABEL: global_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory: 7833; GFX9: ; %bb.0: 7834; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7835; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 7836; GFX9-NEXT: s_mov_b64 s[4:5], 0 7837; GFX9-NEXT: .LBB116_1: ; %atomicrmw.start 7838; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 7839; GFX9-NEXT: s_waitcnt vmcnt(0) 7840; GFX9-NEXT: v_mov_b32_e32 v7, v5 7841; GFX9-NEXT: v_mov_b32_e32 v6, v4 7842; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] 7843; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc 7844; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 7845; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc 7846; GFX9-NEXT: s_waitcnt vmcnt(0) 7847; GFX9-NEXT: buffer_wbinvl1_vol 7848; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 7849; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7850; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 7851; GFX9-NEXT: s_cbranch_execnz .LBB116_1 7852; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 7853; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 7854; GFX9-NEXT: v_mov_b32_e32 v0, v4 7855; GFX9-NEXT: v_mov_b32_e32 v1, v5 7856; GFX9-NEXT: s_setpc_b64 s[30:31] 7857 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 7858 %result = atomicrmw umin ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 7859 ret i64 %result 7860} 7861 7862; --------------------------------------------------------------------- 7863; atomicrmw min 7864; --------------------------------------------------------------------- 7865 7866define void @global_atomic_min_i64_noret(ptr addrspace(1) %ptr, i64 %in) { 7867; SI-LABEL: global_atomic_min_i64_noret: 7868; SI: ; %bb.0: 7869; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7870; SI-NEXT: s_mov_b32 s6, 0 7871; SI-NEXT: s_mov_b32 s7, 0xf000 7872; SI-NEXT: s_mov_b32 s4, s6 7873; SI-NEXT: s_mov_b32 s5, s6 7874; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 7875; SI-NEXT: s_mov_b64 s[8:9], 0 7876; SI-NEXT: .LBB117_1: ; %atomicrmw.start 7877; SI-NEXT: ; =>This Inner Loop Header: Depth=1 7878; SI-NEXT: s_waitcnt vmcnt(0) 7879; SI-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] 7880; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc 7881; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 7882; SI-NEXT: s_waitcnt expcnt(0) 7883; SI-NEXT: v_mov_b32_e32 v11, v7 7884; SI-NEXT: v_mov_b32_e32 v10, v6 7885; SI-NEXT: v_mov_b32_e32 v9, v5 7886; SI-NEXT: v_mov_b32_e32 v8, v4 7887; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc 7888; SI-NEXT: s_waitcnt vmcnt(0) 7889; SI-NEXT: buffer_wbinvl1 7890; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] 7891; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 7892; SI-NEXT: v_mov_b32_e32 v6, v8 7893; SI-NEXT: v_mov_b32_e32 v7, v9 7894; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] 7895; SI-NEXT: s_cbranch_execnz .LBB117_1 7896; SI-NEXT: ; %bb.2: ; %atomicrmw.end 7897; SI-NEXT: s_or_b64 exec, exec, s[8:9] 7898; SI-NEXT: s_waitcnt expcnt(0) 7899; SI-NEXT: s_setpc_b64 s[30:31] 7900; 7901; VI-LABEL: global_atomic_min_i64_noret: 7902; VI: ; %bb.0: 7903; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7904; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] 7905; VI-NEXT: s_mov_b64 s[4:5], 0 7906; VI-NEXT: .LBB117_1: ; %atomicrmw.start 7907; VI-NEXT: ; =>This Inner Loop Header: Depth=1 7908; VI-NEXT: s_waitcnt vmcnt(0) 7909; VI-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] 7910; VI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc 7911; VI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 7912; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc 7913; VI-NEXT: s_waitcnt vmcnt(0) 7914; VI-NEXT: buffer_wbinvl1_vol 7915; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 7916; VI-NEXT: v_mov_b32_e32 v7, v5 7917; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7918; VI-NEXT: v_mov_b32_e32 v6, v4 7919; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] 7920; VI-NEXT: s_cbranch_execnz .LBB117_1 7921; VI-NEXT: ; %bb.2: ; %atomicrmw.end 7922; VI-NEXT: s_or_b64 exec, exec, s[4:5] 7923; VI-NEXT: s_setpc_b64 s[30:31] 7924; 7925; GFX9-LABEL: global_atomic_min_i64_noret: 7926; GFX9: ; %bb.0: 7927; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7928; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off 7929; GFX9-NEXT: s_mov_b64 s[4:5], 0 7930; GFX9-NEXT: .LBB117_1: ; %atomicrmw.start 7931; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 7932; GFX9-NEXT: s_waitcnt vmcnt(0) 7933; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] 7934; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc 7935; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 7936; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc 7937; GFX9-NEXT: s_waitcnt vmcnt(0) 7938; GFX9-NEXT: buffer_wbinvl1_vol 7939; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 7940; GFX9-NEXT: v_mov_b32_e32 v7, v5 7941; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7942; GFX9-NEXT: v_mov_b32_e32 v6, v4 7943; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 7944; GFX9-NEXT: s_cbranch_execnz .LBB117_1 7945; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 7946; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 7947; GFX9-NEXT: s_setpc_b64 s[30:31] 7948 %tmp0 = atomicrmw min ptr addrspace(1) %ptr, i64 %in seq_cst 7949 ret void 7950} 7951 7952define void @global_atomic_min_i64_noret_offset(ptr addrspace(1) %out, i64 %in) { 7953; SI-LABEL: global_atomic_min_i64_noret_offset: 7954; SI: ; %bb.0: 7955; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7956; SI-NEXT: s_mov_b32 s6, 0 7957; SI-NEXT: s_mov_b32 s7, 0xf000 7958; SI-NEXT: s_mov_b32 s4, s6 7959; SI-NEXT: s_mov_b32 s5, s6 7960; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 7961; SI-NEXT: s_mov_b64 s[8:9], 0 7962; SI-NEXT: .LBB118_1: ; %atomicrmw.start 7963; SI-NEXT: ; =>This Inner Loop Header: Depth=1 7964; SI-NEXT: s_waitcnt vmcnt(0) 7965; SI-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] 7966; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc 7967; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 7968; SI-NEXT: s_waitcnt expcnt(0) 7969; SI-NEXT: v_mov_b32_e32 v11, v7 7970; SI-NEXT: v_mov_b32_e32 v10, v6 7971; SI-NEXT: v_mov_b32_e32 v9, v5 7972; SI-NEXT: v_mov_b32_e32 v8, v4 7973; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc 7974; SI-NEXT: s_waitcnt vmcnt(0) 7975; SI-NEXT: buffer_wbinvl1 7976; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] 7977; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 7978; SI-NEXT: v_mov_b32_e32 v6, v8 7979; SI-NEXT: v_mov_b32_e32 v7, v9 7980; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] 7981; SI-NEXT: s_cbranch_execnz .LBB118_1 7982; SI-NEXT: ; %bb.2: ; %atomicrmw.end 7983; SI-NEXT: s_or_b64 exec, exec, s[8:9] 7984; SI-NEXT: s_waitcnt expcnt(0) 7985; SI-NEXT: s_setpc_b64 s[30:31] 7986; 7987; VI-LABEL: global_atomic_min_i64_noret_offset: 7988; VI: ; %bb.0: 7989; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7990; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 7991; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 7992; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] 7993; VI-NEXT: s_mov_b64 s[4:5], 0 7994; VI-NEXT: .LBB118_1: ; %atomicrmw.start 7995; VI-NEXT: ; =>This Inner Loop Header: Depth=1 7996; VI-NEXT: s_waitcnt vmcnt(0) 7997; VI-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] 7998; VI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc 7999; VI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 8000; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc 8001; VI-NEXT: s_waitcnt vmcnt(0) 8002; VI-NEXT: buffer_wbinvl1_vol 8003; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 8004; VI-NEXT: v_mov_b32_e32 v7, v5 8005; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 8006; VI-NEXT: v_mov_b32_e32 v6, v4 8007; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] 8008; VI-NEXT: s_cbranch_execnz .LBB118_1 8009; VI-NEXT: ; %bb.2: ; %atomicrmw.end 8010; VI-NEXT: s_or_b64 exec, exec, s[4:5] 8011; VI-NEXT: s_setpc_b64 s[30:31] 8012; 8013; GFX9-LABEL: global_atomic_min_i64_noret_offset: 8014; GFX9: ; %bb.0: 8015; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8016; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 8017; GFX9-NEXT: s_mov_b64 s[4:5], 0 8018; GFX9-NEXT: .LBB118_1: ; %atomicrmw.start 8019; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 8020; GFX9-NEXT: s_waitcnt vmcnt(0) 8021; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] 8022; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc 8023; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 8024; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc 8025; GFX9-NEXT: s_waitcnt vmcnt(0) 8026; GFX9-NEXT: buffer_wbinvl1_vol 8027; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 8028; GFX9-NEXT: v_mov_b32_e32 v7, v5 8029; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 8030; GFX9-NEXT: v_mov_b32_e32 v6, v4 8031; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 8032; GFX9-NEXT: s_cbranch_execnz .LBB118_1 8033; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 8034; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 8035; GFX9-NEXT: s_setpc_b64 s[30:31] 8036 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 8037 %tmp0 = atomicrmw min ptr addrspace(1) %gep, i64 %in seq_cst 8038 ret void 8039} 8040 8041define i64 @global_atomic_min_i64_ret(ptr addrspace(1) %ptr, i64 %in) { 8042; SI-LABEL: global_atomic_min_i64_ret: 8043; SI: ; %bb.0: 8044; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8045; SI-NEXT: v_mov_b32_e32 v5, v3 8046; SI-NEXT: v_mov_b32_e32 v4, v2 8047; SI-NEXT: v_mov_b32_e32 v7, v1 8048; SI-NEXT: v_mov_b32_e32 v6, v0 8049; SI-NEXT: s_mov_b32 s6, 0 8050; SI-NEXT: s_mov_b32 s7, 0xf000 8051; SI-NEXT: s_mov_b32 s4, s6 8052; SI-NEXT: s_mov_b32 s5, s6 8053; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 8054; SI-NEXT: s_mov_b64 s[8:9], 0 8055; SI-NEXT: .LBB119_1: ; %atomicrmw.start 8056; SI-NEXT: ; =>This Inner Loop Header: Depth=1 8057; SI-NEXT: s_waitcnt vmcnt(0) 8058; SI-NEXT: v_mov_b32_e32 v11, v1 8059; SI-NEXT: v_mov_b32_e32 v10, v0 8060; SI-NEXT: v_cmp_le_i64_e32 vcc, v[10:11], v[4:5] 8061; SI-NEXT: v_cndmask_b32_e32 v9, v5, v11, vcc 8062; SI-NEXT: v_cndmask_b32_e32 v8, v4, v10, vcc 8063; SI-NEXT: s_waitcnt expcnt(0) 8064; SI-NEXT: v_mov_b32_e32 v0, v8 8065; SI-NEXT: v_mov_b32_e32 v1, v9 8066; SI-NEXT: v_mov_b32_e32 v2, v10 8067; SI-NEXT: v_mov_b32_e32 v3, v11 8068; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc 8069; SI-NEXT: s_waitcnt vmcnt(0) 8070; SI-NEXT: buffer_wbinvl1 8071; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] 8072; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 8073; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] 8074; SI-NEXT: s_cbranch_execnz .LBB119_1 8075; SI-NEXT: ; %bb.2: ; %atomicrmw.end 8076; SI-NEXT: s_or_b64 exec, exec, s[8:9] 8077; SI-NEXT: s_waitcnt expcnt(0) 8078; SI-NEXT: s_setpc_b64 s[30:31] 8079; 8080; VI-LABEL: global_atomic_min_i64_ret: 8081; VI: ; %bb.0: 8082; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8083; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] 8084; VI-NEXT: s_mov_b64 s[4:5], 0 8085; VI-NEXT: .LBB119_1: ; %atomicrmw.start 8086; VI-NEXT: ; =>This Inner Loop Header: Depth=1 8087; VI-NEXT: s_waitcnt vmcnt(0) 8088; VI-NEXT: v_mov_b32_e32 v7, v5 8089; VI-NEXT: v_mov_b32_e32 v6, v4 8090; VI-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] 8091; VI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc 8092; VI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 8093; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc 8094; VI-NEXT: s_waitcnt vmcnt(0) 8095; VI-NEXT: buffer_wbinvl1_vol 8096; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 8097; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 8098; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] 8099; VI-NEXT: s_cbranch_execnz .LBB119_1 8100; VI-NEXT: ; %bb.2: ; %atomicrmw.end 8101; VI-NEXT: s_or_b64 exec, exec, s[4:5] 8102; VI-NEXT: v_mov_b32_e32 v0, v4 8103; VI-NEXT: v_mov_b32_e32 v1, v5 8104; VI-NEXT: s_setpc_b64 s[30:31] 8105; 8106; GFX9-LABEL: global_atomic_min_i64_ret: 8107; GFX9: ; %bb.0: 8108; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8109; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off 8110; GFX9-NEXT: s_mov_b64 s[4:5], 0 8111; GFX9-NEXT: .LBB119_1: ; %atomicrmw.start 8112; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 8113; GFX9-NEXT: s_waitcnt vmcnt(0) 8114; GFX9-NEXT: v_mov_b32_e32 v7, v5 8115; GFX9-NEXT: v_mov_b32_e32 v6, v4 8116; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] 8117; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc 8118; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 8119; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc 8120; GFX9-NEXT: s_waitcnt vmcnt(0) 8121; GFX9-NEXT: buffer_wbinvl1_vol 8122; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 8123; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 8124; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 8125; GFX9-NEXT: s_cbranch_execnz .LBB119_1 8126; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 8127; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 8128; GFX9-NEXT: v_mov_b32_e32 v0, v4 8129; GFX9-NEXT: v_mov_b32_e32 v1, v5 8130; GFX9-NEXT: s_setpc_b64 s[30:31] 8131 %result = atomicrmw min ptr addrspace(1) %ptr, i64 %in seq_cst 8132 ret i64 %result 8133} 8134 8135define i64 @global_atomic_min_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { 8136; SI-LABEL: global_atomic_min_i64_ret_offset: 8137; SI: ; %bb.0: 8138; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8139; SI-NEXT: v_mov_b32_e32 v5, v3 8140; SI-NEXT: v_mov_b32_e32 v4, v2 8141; SI-NEXT: v_mov_b32_e32 v7, v1 8142; SI-NEXT: v_mov_b32_e32 v6, v0 8143; SI-NEXT: s_mov_b32 s6, 0 8144; SI-NEXT: s_mov_b32 s7, 0xf000 8145; SI-NEXT: s_mov_b32 s4, s6 8146; SI-NEXT: s_mov_b32 s5, s6 8147; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32 8148; SI-NEXT: s_mov_b64 s[8:9], 0 8149; SI-NEXT: .LBB120_1: ; %atomicrmw.start 8150; SI-NEXT: ; =>This Inner Loop Header: Depth=1 8151; SI-NEXT: s_waitcnt vmcnt(0) 8152; SI-NEXT: v_mov_b32_e32 v11, v1 8153; SI-NEXT: v_mov_b32_e32 v10, v0 8154; SI-NEXT: v_cmp_le_i64_e32 vcc, v[10:11], v[4:5] 8155; SI-NEXT: v_cndmask_b32_e32 v9, v5, v11, vcc 8156; SI-NEXT: v_cndmask_b32_e32 v8, v4, v10, vcc 8157; SI-NEXT: s_waitcnt expcnt(0) 8158; SI-NEXT: v_mov_b32_e32 v0, v8 8159; SI-NEXT: v_mov_b32_e32 v1, v9 8160; SI-NEXT: v_mov_b32_e32 v2, v10 8161; SI-NEXT: v_mov_b32_e32 v3, v11 8162; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:32 glc 8163; SI-NEXT: s_waitcnt vmcnt(0) 8164; SI-NEXT: buffer_wbinvl1 8165; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] 8166; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 8167; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] 8168; SI-NEXT: s_cbranch_execnz .LBB120_1 8169; SI-NEXT: ; %bb.2: ; %atomicrmw.end 8170; SI-NEXT: s_or_b64 exec, exec, s[8:9] 8171; SI-NEXT: s_waitcnt expcnt(0) 8172; SI-NEXT: s_setpc_b64 s[30:31] 8173; 8174; VI-LABEL: global_atomic_min_i64_ret_offset: 8175; VI: ; %bb.0: 8176; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8177; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0 8178; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc 8179; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] 8180; VI-NEXT: s_mov_b64 s[4:5], 0 8181; VI-NEXT: .LBB120_1: ; %atomicrmw.start 8182; VI-NEXT: ; =>This Inner Loop Header: Depth=1 8183; VI-NEXT: s_waitcnt vmcnt(0) 8184; VI-NEXT: v_mov_b32_e32 v9, v1 8185; VI-NEXT: v_mov_b32_e32 v8, v0 8186; VI-NEXT: v_cmp_le_i64_e32 vcc, v[8:9], v[2:3] 8187; VI-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc 8188; VI-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc 8189; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc 8190; VI-NEXT: s_waitcnt vmcnt(0) 8191; VI-NEXT: buffer_wbinvl1_vol 8192; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] 8193; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 8194; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] 8195; VI-NEXT: s_cbranch_execnz .LBB120_1 8196; VI-NEXT: ; %bb.2: ; %atomicrmw.end 8197; VI-NEXT: s_or_b64 exec, exec, s[4:5] 8198; VI-NEXT: s_setpc_b64 s[30:31] 8199; 8200; GFX9-LABEL: global_atomic_min_i64_ret_offset: 8201; GFX9: ; %bb.0: 8202; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8203; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 8204; GFX9-NEXT: s_mov_b64 s[4:5], 0 8205; GFX9-NEXT: .LBB120_1: ; %atomicrmw.start 8206; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 8207; GFX9-NEXT: s_waitcnt vmcnt(0) 8208; GFX9-NEXT: v_mov_b32_e32 v7, v5 8209; GFX9-NEXT: v_mov_b32_e32 v6, v4 8210; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] 8211; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc 8212; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 8213; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc 8214; GFX9-NEXT: s_waitcnt vmcnt(0) 8215; GFX9-NEXT: buffer_wbinvl1_vol 8216; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 8217; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 8218; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 8219; GFX9-NEXT: s_cbranch_execnz .LBB120_1 8220; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 8221; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 8222; GFX9-NEXT: v_mov_b32_e32 v0, v4 8223; GFX9-NEXT: v_mov_b32_e32 v1, v5 8224; GFX9-NEXT: s_setpc_b64 s[30:31] 8225 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 8226 %result = atomicrmw min ptr addrspace(1) %gep, i64 %in seq_cst 8227 ret i64 %result 8228} 8229 8230define amdgpu_gfx void @global_atomic_min_i64_noret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) { 8231; SI-LABEL: global_atomic_min_i64_noret_scalar: 8232; SI: ; %bb.0: 8233; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8234; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 8235; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill 8236; SI-NEXT: s_mov_b64 exec, s[34:35] 8237; SI-NEXT: s_waitcnt expcnt(0) 8238; SI-NEXT: v_writelane_b32 v10, s6, 0 8239; SI-NEXT: v_writelane_b32 v10, s7, 1 8240; SI-NEXT: s_mov_b32 s35, s7 8241; SI-NEXT: s_mov_b32 s34, s6 8242; SI-NEXT: s_mov_b32 s7, 0xf000 8243; SI-NEXT: s_mov_b32 s6, -1 8244; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 8245; SI-NEXT: s_mov_b64 s[36:37], 0 8246; SI-NEXT: v_mov_b32_e32 v4, s35 8247; SI-NEXT: v_mov_b32_e32 v5, s34 8248; SI-NEXT: .LBB121_1: ; %atomicrmw.start 8249; SI-NEXT: ; =>This Inner Loop Header: Depth=1 8250; SI-NEXT: s_waitcnt vmcnt(0) 8251; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[2:3] 8252; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc 8253; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc 8254; SI-NEXT: s_waitcnt expcnt(0) 8255; SI-NEXT: v_mov_b32_e32 v9, v3 8256; SI-NEXT: v_mov_b32_e32 v8, v2 8257; SI-NEXT: v_mov_b32_e32 v7, v1 8258; SI-NEXT: v_mov_b32_e32 v6, v0 8259; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc 8260; SI-NEXT: s_waitcnt vmcnt(0) 8261; SI-NEXT: buffer_wbinvl1 8262; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] 8263; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] 8264; SI-NEXT: v_mov_b32_e32 v2, v6 8265; SI-NEXT: v_mov_b32_e32 v3, v7 8266; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] 8267; SI-NEXT: s_cbranch_execnz .LBB121_1 8268; SI-NEXT: ; %bb.2: ; %atomicrmw.end 8269; SI-NEXT: s_or_b64 exec, exec, s[36:37] 8270; SI-NEXT: v_readlane_b32 s7, v10, 1 8271; SI-NEXT: v_readlane_b32 s6, v10, 0 8272; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 8273; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload 8274; SI-NEXT: s_mov_b64 exec, s[34:35] 8275; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 8276; SI-NEXT: s_setpc_b64 s[30:31] 8277; 8278; VI-LABEL: global_atomic_min_i64_noret_scalar: 8279; VI: ; %bb.0: 8280; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8281; VI-NEXT: v_mov_b32_e32 v0, s4 8282; VI-NEXT: v_mov_b32_e32 v1, s5 8283; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] 8284; VI-NEXT: v_mov_b32_e32 v4, s4 8285; VI-NEXT: s_mov_b64 s[34:35], 0 8286; VI-NEXT: v_mov_b32_e32 v6, s7 8287; VI-NEXT: v_mov_b32_e32 v7, s6 8288; VI-NEXT: v_mov_b32_e32 v5, s5 8289; VI-NEXT: .LBB121_1: ; %atomicrmw.start 8290; VI-NEXT: ; =>This Inner Loop Header: Depth=1 8291; VI-NEXT: s_waitcnt vmcnt(0) 8292; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] 8293; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc 8294; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc 8295; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc 8296; VI-NEXT: s_waitcnt vmcnt(0) 8297; VI-NEXT: buffer_wbinvl1_vol 8298; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] 8299; VI-NEXT: v_mov_b32_e32 v3, v1 8300; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 8301; VI-NEXT: v_mov_b32_e32 v2, v0 8302; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] 8303; VI-NEXT: s_cbranch_execnz .LBB121_1 8304; VI-NEXT: ; %bb.2: ; %atomicrmw.end 8305; VI-NEXT: s_or_b64 exec, exec, s[34:35] 8306; VI-NEXT: s_setpc_b64 s[30:31] 8307; 8308; GFX9-LABEL: global_atomic_min_i64_noret_scalar: 8309; GFX9: ; %bb.0: 8310; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8311; GFX9-NEXT: v_mov_b32_e32 v4, 0 8312; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] 8313; GFX9-NEXT: s_mov_b64 s[34:35], 0 8314; GFX9-NEXT: v_mov_b32_e32 v5, s7 8315; GFX9-NEXT: v_mov_b32_e32 v6, s6 8316; GFX9-NEXT: .LBB121_1: ; %atomicrmw.start 8317; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 8318; GFX9-NEXT: s_waitcnt vmcnt(0) 8319; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] 8320; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc 8321; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc 8322; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc 8323; GFX9-NEXT: s_waitcnt vmcnt(0) 8324; GFX9-NEXT: buffer_wbinvl1_vol 8325; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] 8326; GFX9-NEXT: v_mov_b32_e32 v3, v1 8327; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 8328; GFX9-NEXT: v_mov_b32_e32 v2, v0 8329; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] 8330; GFX9-NEXT: s_cbranch_execnz .LBB121_1 8331; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 8332; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] 8333; GFX9-NEXT: s_setpc_b64 s[30:31] 8334 %tmp0 = atomicrmw min ptr addrspace(1) %ptr, i64 %in seq_cst 8335 ret void 8336} 8337 8338define amdgpu_gfx void @global_atomic_min_i64_noret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) { 8339; SI-LABEL: global_atomic_min_i64_noret_offset_scalar: 8340; SI: ; %bb.0: 8341; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8342; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 8343; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill 8344; SI-NEXT: s_mov_b64 exec, s[34:35] 8345; SI-NEXT: s_waitcnt expcnt(0) 8346; SI-NEXT: v_writelane_b32 v10, s6, 0 8347; SI-NEXT: v_writelane_b32 v10, s7, 1 8348; SI-NEXT: s_mov_b32 s35, s7 8349; SI-NEXT: s_mov_b32 s34, s6 8350; SI-NEXT: s_mov_b32 s7, 0xf000 8351; SI-NEXT: s_mov_b32 s6, -1 8352; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32 8353; SI-NEXT: s_mov_b64 s[36:37], 0 8354; SI-NEXT: v_mov_b32_e32 v4, s35 8355; SI-NEXT: v_mov_b32_e32 v5, s34 8356; SI-NEXT: .LBB122_1: ; %atomicrmw.start 8357; SI-NEXT: ; =>This Inner Loop Header: Depth=1 8358; SI-NEXT: s_waitcnt vmcnt(0) 8359; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[2:3] 8360; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc 8361; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc 8362; SI-NEXT: s_waitcnt expcnt(0) 8363; SI-NEXT: v_mov_b32_e32 v9, v3 8364; SI-NEXT: v_mov_b32_e32 v8, v2 8365; SI-NEXT: v_mov_b32_e32 v7, v1 8366; SI-NEXT: v_mov_b32_e32 v6, v0 8367; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc 8368; SI-NEXT: s_waitcnt vmcnt(0) 8369; SI-NEXT: buffer_wbinvl1 8370; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] 8371; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] 8372; SI-NEXT: v_mov_b32_e32 v2, v6 8373; SI-NEXT: v_mov_b32_e32 v3, v7 8374; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] 8375; SI-NEXT: s_cbranch_execnz .LBB122_1 8376; SI-NEXT: ; %bb.2: ; %atomicrmw.end 8377; SI-NEXT: s_or_b64 exec, exec, s[36:37] 8378; SI-NEXT: v_readlane_b32 s7, v10, 1 8379; SI-NEXT: v_readlane_b32 s6, v10, 0 8380; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 8381; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload 8382; SI-NEXT: s_mov_b64 exec, s[34:35] 8383; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 8384; SI-NEXT: s_setpc_b64 s[30:31] 8385; 8386; VI-LABEL: global_atomic_min_i64_noret_offset_scalar: 8387; VI: ; %bb.0: 8388; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8389; VI-NEXT: s_add_u32 s34, s4, 32 8390; VI-NEXT: s_addc_u32 s35, s5, 0 8391; VI-NEXT: v_mov_b32_e32 v4, s34 8392; VI-NEXT: v_mov_b32_e32 v5, s35 8393; VI-NEXT: flat_load_dwordx2 v[2:3], v[4:5] 8394; VI-NEXT: s_mov_b64 s[34:35], 0 8395; VI-NEXT: v_mov_b32_e32 v6, s7 8396; VI-NEXT: v_mov_b32_e32 v7, s6 8397; VI-NEXT: .LBB122_1: ; %atomicrmw.start 8398; VI-NEXT: ; =>This Inner Loop Header: Depth=1 8399; VI-NEXT: s_waitcnt vmcnt(0) 8400; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] 8401; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc 8402; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc 8403; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc 8404; VI-NEXT: s_waitcnt vmcnt(0) 8405; VI-NEXT: buffer_wbinvl1_vol 8406; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] 8407; VI-NEXT: v_mov_b32_e32 v3, v1 8408; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 8409; VI-NEXT: v_mov_b32_e32 v2, v0 8410; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] 8411; VI-NEXT: s_cbranch_execnz .LBB122_1 8412; VI-NEXT: ; %bb.2: ; %atomicrmw.end 8413; VI-NEXT: s_or_b64 exec, exec, s[34:35] 8414; VI-NEXT: s_setpc_b64 s[30:31] 8415; 8416; GFX9-LABEL: global_atomic_min_i64_noret_offset_scalar: 8417; GFX9: ; %bb.0: 8418; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8419; GFX9-NEXT: v_mov_b32_e32 v4, 0 8420; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32 8421; GFX9-NEXT: s_mov_b64 s[34:35], 0 8422; GFX9-NEXT: v_mov_b32_e32 v5, s7 8423; GFX9-NEXT: v_mov_b32_e32 v6, s6 8424; GFX9-NEXT: .LBB122_1: ; %atomicrmw.start 8425; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 8426; GFX9-NEXT: s_waitcnt vmcnt(0) 8427; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] 8428; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc 8429; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc 8430; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc 8431; GFX9-NEXT: s_waitcnt vmcnt(0) 8432; GFX9-NEXT: buffer_wbinvl1_vol 8433; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] 8434; GFX9-NEXT: v_mov_b32_e32 v3, v1 8435; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 8436; GFX9-NEXT: v_mov_b32_e32 v2, v0 8437; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] 8438; GFX9-NEXT: s_cbranch_execnz .LBB122_1 8439; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 8440; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] 8441; GFX9-NEXT: s_setpc_b64 s[30:31] 8442 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 8443 %tmp0 = atomicrmw min ptr addrspace(1) %gep, i64 %in seq_cst 8444 ret void 8445} 8446 8447define amdgpu_gfx i64 @global_atomic_min_i64_ret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) { 8448; SI-LABEL: global_atomic_min_i64_ret_scalar: 8449; SI: ; %bb.0: 8450; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8451; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 8452; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill 8453; SI-NEXT: s_mov_b64 exec, s[34:35] 8454; SI-NEXT: s_waitcnt expcnt(0) 8455; SI-NEXT: v_writelane_b32 v10, s6, 0 8456; SI-NEXT: v_writelane_b32 v10, s7, 1 8457; SI-NEXT: s_mov_b32 s35, s7 8458; SI-NEXT: s_mov_b32 s34, s6 8459; SI-NEXT: s_mov_b32 s7, 0xf000 8460; SI-NEXT: s_mov_b32 s6, -1 8461; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 8462; SI-NEXT: s_mov_b64 s[36:37], 0 8463; SI-NEXT: v_mov_b32_e32 v4, s35 8464; SI-NEXT: v_mov_b32_e32 v5, s34 8465; SI-NEXT: .LBB123_1: ; %atomicrmw.start 8466; SI-NEXT: ; =>This Inner Loop Header: Depth=1 8467; SI-NEXT: s_waitcnt vmcnt(0) 8468; SI-NEXT: v_mov_b32_e32 v9, v1 8469; SI-NEXT: v_mov_b32_e32 v8, v0 8470; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[8:9] 8471; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc 8472; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc 8473; SI-NEXT: s_waitcnt expcnt(0) 8474; SI-NEXT: v_mov_b32_e32 v0, v6 8475; SI-NEXT: v_mov_b32_e32 v1, v7 8476; SI-NEXT: v_mov_b32_e32 v2, v8 8477; SI-NEXT: v_mov_b32_e32 v3, v9 8478; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc 8479; SI-NEXT: s_waitcnt vmcnt(0) 8480; SI-NEXT: buffer_wbinvl1 8481; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] 8482; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] 8483; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] 8484; SI-NEXT: s_cbranch_execnz .LBB123_1 8485; SI-NEXT: ; %bb.2: ; %atomicrmw.end 8486; SI-NEXT: s_or_b64 exec, exec, s[36:37] 8487; SI-NEXT: v_readlane_b32 s7, v10, 1 8488; SI-NEXT: v_readlane_b32 s6, v10, 0 8489; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 8490; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload 8491; SI-NEXT: s_mov_b64 exec, s[34:35] 8492; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 8493; SI-NEXT: s_setpc_b64 s[30:31] 8494; 8495; VI-LABEL: global_atomic_min_i64_ret_scalar: 8496; VI: ; %bb.0: 8497; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8498; VI-NEXT: v_mov_b32_e32 v0, s4 8499; VI-NEXT: v_mov_b32_e32 v1, s5 8500; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 8501; VI-NEXT: v_mov_b32_e32 v2, s4 8502; VI-NEXT: s_mov_b64 s[34:35], 0 8503; VI-NEXT: v_mov_b32_e32 v4, s7 8504; VI-NEXT: v_mov_b32_e32 v5, s6 8505; VI-NEXT: v_mov_b32_e32 v3, s5 8506; VI-NEXT: .LBB123_1: ; %atomicrmw.start 8507; VI-NEXT: ; =>This Inner Loop Header: Depth=1 8508; VI-NEXT: s_waitcnt vmcnt(0) 8509; VI-NEXT: v_mov_b32_e32 v9, v1 8510; VI-NEXT: v_mov_b32_e32 v8, v0 8511; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] 8512; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc 8513; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc 8514; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc 8515; VI-NEXT: s_waitcnt vmcnt(0) 8516; VI-NEXT: buffer_wbinvl1_vol 8517; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] 8518; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 8519; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] 8520; VI-NEXT: s_cbranch_execnz .LBB123_1 8521; VI-NEXT: ; %bb.2: ; %atomicrmw.end 8522; VI-NEXT: s_or_b64 exec, exec, s[34:35] 8523; VI-NEXT: s_setpc_b64 s[30:31] 8524; 8525; GFX9-LABEL: global_atomic_min_i64_ret_scalar: 8526; GFX9: ; %bb.0: 8527; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8528; GFX9-NEXT: v_mov_b32_e32 v2, 0 8529; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] 8530; GFX9-NEXT: s_mov_b64 s[34:35], 0 8531; GFX9-NEXT: v_mov_b32_e32 v3, s7 8532; GFX9-NEXT: v_mov_b32_e32 v4, s6 8533; GFX9-NEXT: .LBB123_1: ; %atomicrmw.start 8534; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 8535; GFX9-NEXT: s_waitcnt vmcnt(0) 8536; GFX9-NEXT: v_mov_b32_e32 v8, v1 8537; GFX9-NEXT: v_mov_b32_e32 v7, v0 8538; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[7:8] 8539; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v8, vcc 8540; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v7, vcc 8541; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[5:8], s[4:5] glc 8542; GFX9-NEXT: s_waitcnt vmcnt(0) 8543; GFX9-NEXT: buffer_wbinvl1_vol 8544; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] 8545; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 8546; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] 8547; GFX9-NEXT: s_cbranch_execnz .LBB123_1 8548; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 8549; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] 8550; GFX9-NEXT: s_setpc_b64 s[30:31] 8551 %result = atomicrmw min ptr addrspace(1) %ptr, i64 %in seq_cst 8552 ret i64 %result 8553} 8554 8555define amdgpu_gfx i64 @global_atomic_min_i64_ret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) { 8556; SI-LABEL: global_atomic_min_i64_ret_offset_scalar: 8557; SI: ; %bb.0: 8558; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8559; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 8560; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill 8561; SI-NEXT: s_mov_b64 exec, s[34:35] 8562; SI-NEXT: s_waitcnt expcnt(0) 8563; SI-NEXT: v_writelane_b32 v10, s6, 0 8564; SI-NEXT: v_writelane_b32 v10, s7, 1 8565; SI-NEXT: s_mov_b32 s35, s7 8566; SI-NEXT: s_mov_b32 s34, s6 8567; SI-NEXT: s_mov_b32 s7, 0xf000 8568; SI-NEXT: s_mov_b32 s6, -1 8569; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 8570; SI-NEXT: s_mov_b64 s[36:37], 0 8571; SI-NEXT: v_mov_b32_e32 v4, s35 8572; SI-NEXT: v_mov_b32_e32 v5, s34 8573; SI-NEXT: .LBB124_1: ; %atomicrmw.start 8574; SI-NEXT: ; =>This Inner Loop Header: Depth=1 8575; SI-NEXT: s_waitcnt vmcnt(0) 8576; SI-NEXT: v_mov_b32_e32 v9, v1 8577; SI-NEXT: v_mov_b32_e32 v8, v0 8578; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[8:9] 8579; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc 8580; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc 8581; SI-NEXT: s_waitcnt expcnt(0) 8582; SI-NEXT: v_mov_b32_e32 v0, v6 8583; SI-NEXT: v_mov_b32_e32 v1, v7 8584; SI-NEXT: v_mov_b32_e32 v2, v8 8585; SI-NEXT: v_mov_b32_e32 v3, v9 8586; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc 8587; SI-NEXT: s_waitcnt vmcnt(0) 8588; SI-NEXT: buffer_wbinvl1 8589; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] 8590; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] 8591; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] 8592; SI-NEXT: s_cbranch_execnz .LBB124_1 8593; SI-NEXT: ; %bb.2: ; %atomicrmw.end 8594; SI-NEXT: s_or_b64 exec, exec, s[36:37] 8595; SI-NEXT: v_readlane_b32 s7, v10, 1 8596; SI-NEXT: v_readlane_b32 s6, v10, 0 8597; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 8598; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload 8599; SI-NEXT: s_mov_b64 exec, s[34:35] 8600; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 8601; SI-NEXT: s_setpc_b64 s[30:31] 8602; 8603; VI-LABEL: global_atomic_min_i64_ret_offset_scalar: 8604; VI: ; %bb.0: 8605; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8606; VI-NEXT: s_add_u32 s34, s4, 32 8607; VI-NEXT: s_addc_u32 s35, s5, 0 8608; VI-NEXT: v_mov_b32_e32 v2, s34 8609; VI-NEXT: v_mov_b32_e32 v3, s35 8610; VI-NEXT: flat_load_dwordx2 v[0:1], v[2:3] 8611; VI-NEXT: s_mov_b64 s[34:35], 0 8612; VI-NEXT: v_mov_b32_e32 v4, s7 8613; VI-NEXT: v_mov_b32_e32 v5, s6 8614; VI-NEXT: .LBB124_1: ; %atomicrmw.start 8615; VI-NEXT: ; =>This Inner Loop Header: Depth=1 8616; VI-NEXT: s_waitcnt vmcnt(0) 8617; VI-NEXT: v_mov_b32_e32 v9, v1 8618; VI-NEXT: v_mov_b32_e32 v8, v0 8619; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] 8620; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc 8621; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc 8622; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc 8623; VI-NEXT: s_waitcnt vmcnt(0) 8624; VI-NEXT: buffer_wbinvl1_vol 8625; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] 8626; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 8627; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] 8628; VI-NEXT: s_cbranch_execnz .LBB124_1 8629; VI-NEXT: ; %bb.2: ; %atomicrmw.end 8630; VI-NEXT: s_or_b64 exec, exec, s[34:35] 8631; VI-NEXT: s_setpc_b64 s[30:31] 8632; 8633; GFX9-LABEL: global_atomic_min_i64_ret_offset_scalar: 8634; GFX9: ; %bb.0: 8635; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8636; GFX9-NEXT: v_mov_b32_e32 v2, 0 8637; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32 8638; GFX9-NEXT: s_mov_b64 s[34:35], 0 8639; GFX9-NEXT: v_mov_b32_e32 v3, s7 8640; GFX9-NEXT: v_mov_b32_e32 v4, s6 8641; GFX9-NEXT: .LBB124_1: ; %atomicrmw.start 8642; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 8643; GFX9-NEXT: s_waitcnt vmcnt(0) 8644; GFX9-NEXT: v_mov_b32_e32 v8, v1 8645; GFX9-NEXT: v_mov_b32_e32 v7, v0 8646; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[7:8] 8647; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v8, vcc 8648; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v7, vcc 8649; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[5:8], s[4:5] offset:32 glc 8650; GFX9-NEXT: s_waitcnt vmcnt(0) 8651; GFX9-NEXT: buffer_wbinvl1_vol 8652; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] 8653; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 8654; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] 8655; GFX9-NEXT: s_cbranch_execnz .LBB124_1 8656; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 8657; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] 8658; GFX9-NEXT: s_setpc_b64 s[30:31] 8659 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 8660 %result = atomicrmw min ptr addrspace(1) %gep, i64 %in seq_cst 8661 ret i64 %result 8662} 8663 8664define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { 8665; SI-LABEL: atomic_min_i64_addr64_offset: 8666; SI: ; %bb.0: ; %entry 8667; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd 8668; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 8669; SI-NEXT: s_waitcnt lgkmcnt(0) 8670; SI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 8671; SI-NEXT: s_add_u32 s4, s0, s4 8672; SI-NEXT: s_addc_u32 s5, s1, s5 8673; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 8674; SI-NEXT: s_mov_b64 s[0:1], 0 8675; SI-NEXT: s_mov_b32 s7, 0xf000 8676; SI-NEXT: v_mov_b32_e32 v4, s3 8677; SI-NEXT: v_mov_b32_e32 v5, s2 8678; SI-NEXT: s_waitcnt lgkmcnt(0) 8679; SI-NEXT: v_mov_b32_e32 v2, s8 8680; SI-NEXT: v_mov_b32_e32 v3, s9 8681; SI-NEXT: s_mov_b32 s6, -1 8682; SI-NEXT: .LBB125_1: ; %atomicrmw.start 8683; SI-NEXT: ; =>This Inner Loop Header: Depth=1 8684; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] 8685; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc 8686; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc 8687; SI-NEXT: s_waitcnt expcnt(0) 8688; SI-NEXT: v_mov_b32_e32 v9, v3 8689; SI-NEXT: v_mov_b32_e32 v8, v2 8690; SI-NEXT: v_mov_b32_e32 v7, v1 8691; SI-NEXT: v_mov_b32_e32 v6, v0 8692; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc 8693; SI-NEXT: s_waitcnt vmcnt(0) 8694; SI-NEXT: buffer_wbinvl1 8695; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] 8696; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 8697; SI-NEXT: v_mov_b32_e32 v2, v6 8698; SI-NEXT: v_mov_b32_e32 v3, v7 8699; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] 8700; SI-NEXT: s_cbranch_execnz .LBB125_1 8701; SI-NEXT: ; %bb.2: ; %atomicrmw.end 8702; SI-NEXT: s_endpgm 8703; 8704; VI-LABEL: atomic_min_i64_addr64_offset: 8705; VI: ; %bb.0: ; %entry 8706; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 8707; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 8708; VI-NEXT: s_mov_b64 s[4:5], 0 8709; VI-NEXT: s_waitcnt lgkmcnt(0) 8710; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 8711; VI-NEXT: s_add_u32 s0, s0, s6 8712; VI-NEXT: s_addc_u32 s1, s1, s7 8713; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 8714; VI-NEXT: s_add_u32 s0, s0, 32 8715; VI-NEXT: s_addc_u32 s1, s1, 0 8716; VI-NEXT: v_mov_b32_e32 v5, s1 8717; VI-NEXT: v_mov_b32_e32 v6, s3 8718; VI-NEXT: s_waitcnt lgkmcnt(0) 8719; VI-NEXT: v_mov_b32_e32 v2, s6 8720; VI-NEXT: v_mov_b32_e32 v7, s2 8721; VI-NEXT: v_mov_b32_e32 v3, s7 8722; VI-NEXT: v_mov_b32_e32 v4, s0 8723; VI-NEXT: .LBB125_1: ; %atomicrmw.start 8724; VI-NEXT: ; =>This Inner Loop Header: Depth=1 8725; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] 8726; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc 8727; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc 8728; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc 8729; VI-NEXT: s_waitcnt vmcnt(0) 8730; VI-NEXT: buffer_wbinvl1_vol 8731; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] 8732; VI-NEXT: v_mov_b32_e32 v3, v1 8733; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 8734; VI-NEXT: v_mov_b32_e32 v2, v0 8735; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] 8736; VI-NEXT: s_cbranch_execnz .LBB125_1 8737; VI-NEXT: ; %bb.2: ; %atomicrmw.end 8738; VI-NEXT: s_endpgm 8739; 8740; GFX9-LABEL: atomic_min_i64_addr64_offset: 8741; GFX9: ; %bb.0: ; %entry 8742; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 8743; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 8744; GFX9-NEXT: v_mov_b32_e32 v6, 0 8745; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8746; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 8747; GFX9-NEXT: s_add_u32 s0, s0, s4 8748; GFX9-NEXT: s_addc_u32 s1, s1, s5 8749; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 8750; GFX9-NEXT: s_mov_b64 s[4:5], 0 8751; GFX9-NEXT: v_mov_b32_e32 v4, s3 8752; GFX9-NEXT: v_mov_b32_e32 v5, s2 8753; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8754; GFX9-NEXT: v_mov_b32_e32 v2, s6 8755; GFX9-NEXT: v_mov_b32_e32 v3, s7 8756; GFX9-NEXT: .LBB125_1: ; %atomicrmw.start 8757; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 8758; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] 8759; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc 8760; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc 8761; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:32 glc 8762; GFX9-NEXT: s_waitcnt vmcnt(0) 8763; GFX9-NEXT: buffer_wbinvl1_vol 8764; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] 8765; GFX9-NEXT: v_mov_b32_e32 v3, v1 8766; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 8767; GFX9-NEXT: v_mov_b32_e32 v2, v0 8768; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 8769; GFX9-NEXT: s_cbranch_execnz .LBB125_1 8770; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 8771; GFX9-NEXT: s_endpgm 8772entry: 8773 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index 8774 %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4 8775 %tmp0 = atomicrmw min ptr addrspace(1) %gep, i64 %in seq_cst 8776 ret void 8777} 8778 8779define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { 8780; SI-LABEL: atomic_min_i64_ret_addr64_offset: 8781; SI: ; %bb.0: ; %entry 8782; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 8783; SI-NEXT: s_waitcnt lgkmcnt(0) 8784; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 8785; SI-NEXT: s_add_u32 s8, s0, s6 8786; SI-NEXT: s_addc_u32 s9, s1, s7 8787; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 8788; SI-NEXT: s_mov_b64 s[0:1], 0 8789; SI-NEXT: s_mov_b32 s11, 0xf000 8790; SI-NEXT: v_mov_b32_e32 v8, s5 8791; SI-NEXT: v_mov_b32_e32 v9, s4 8792; SI-NEXT: s_waitcnt lgkmcnt(0) 8793; SI-NEXT: v_mov_b32_e32 v2, s6 8794; SI-NEXT: v_mov_b32_e32 v3, s7 8795; SI-NEXT: s_mov_b32 s10, -1 8796; SI-NEXT: .LBB126_1: ; %atomicrmw.start 8797; SI-NEXT: ; =>This Inner Loop Header: Depth=1 8798; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] 8799; SI-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc 8800; SI-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc 8801; SI-NEXT: s_waitcnt expcnt(0) 8802; SI-NEXT: v_mov_b32_e32 v7, v3 8803; SI-NEXT: v_mov_b32_e32 v6, v2 8804; SI-NEXT: v_mov_b32_e32 v5, v1 8805; SI-NEXT: v_mov_b32_e32 v4, v0 8806; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[8:11], 0 offset:32 glc 8807; SI-NEXT: s_waitcnt vmcnt(0) 8808; SI-NEXT: buffer_wbinvl1 8809; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] 8810; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 8811; SI-NEXT: v_mov_b32_e32 v2, v4 8812; SI-NEXT: v_mov_b32_e32 v3, v5 8813; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] 8814; SI-NEXT: s_cbranch_execnz .LBB126_1 8815; SI-NEXT: ; %bb.2: ; %atomicrmw.end 8816; SI-NEXT: s_or_b64 exec, exec, s[0:1] 8817; SI-NEXT: s_mov_b32 s7, 0xf000 8818; SI-NEXT: s_mov_b32 s6, -1 8819; SI-NEXT: s_mov_b32 s4, s2 8820; SI-NEXT: s_mov_b32 s5, s3 8821; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0 8822; SI-NEXT: s_endpgm 8823; 8824; VI-LABEL: atomic_min_i64_ret_addr64_offset: 8825; VI: ; %bb.0: ; %entry 8826; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 8827; VI-NEXT: s_mov_b64 s[8:9], 0 8828; VI-NEXT: s_waitcnt lgkmcnt(0) 8829; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 8830; VI-NEXT: s_add_u32 s0, s0, s6 8831; VI-NEXT: s_addc_u32 s1, s1, s7 8832; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 8833; VI-NEXT: s_add_u32 s0, s0, 32 8834; VI-NEXT: s_addc_u32 s1, s1, 0 8835; VI-NEXT: v_mov_b32_e32 v0, s0 8836; VI-NEXT: v_mov_b32_e32 v4, s5 8837; VI-NEXT: s_waitcnt lgkmcnt(0) 8838; VI-NEXT: v_mov_b32_e32 v2, s6 8839; VI-NEXT: v_mov_b32_e32 v5, s4 8840; VI-NEXT: v_mov_b32_e32 v3, s7 8841; VI-NEXT: v_mov_b32_e32 v1, s1 8842; VI-NEXT: .LBB126_1: ; %atomicrmw.start 8843; VI-NEXT: ; =>This Inner Loop Header: Depth=1 8844; VI-NEXT: v_mov_b32_e32 v9, v3 8845; VI-NEXT: v_mov_b32_e32 v8, v2 8846; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] 8847; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc 8848; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc 8849; VI-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc 8850; VI-NEXT: s_waitcnt vmcnt(0) 8851; VI-NEXT: buffer_wbinvl1_vol 8852; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] 8853; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 8854; VI-NEXT: s_andn2_b64 exec, exec, s[8:9] 8855; VI-NEXT: s_cbranch_execnz .LBB126_1 8856; VI-NEXT: ; %bb.2: ; %atomicrmw.end 8857; VI-NEXT: s_or_b64 exec, exec, s[8:9] 8858; VI-NEXT: v_mov_b32_e32 v0, s2 8859; VI-NEXT: v_mov_b32_e32 v1, s3 8860; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 8861; VI-NEXT: s_endpgm 8862; 8863; GFX9-LABEL: atomic_min_i64_ret_addr64_offset: 8864; GFX9: ; %bb.0: ; %entry 8865; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 8866; GFX9-NEXT: s_mov_b64 s[2:3], 0 8867; GFX9-NEXT: v_mov_b32_e32 v4, 0 8868; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8869; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 8870; GFX9-NEXT: s_add_u32 s0, s8, s0 8871; GFX9-NEXT: s_addc_u32 s1, s9, s1 8872; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x20 8873; GFX9-NEXT: v_mov_b32_e32 v2, s13 8874; GFX9-NEXT: v_mov_b32_e32 v3, s12 8875; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8876; GFX9-NEXT: v_mov_b32_e32 v0, s4 8877; GFX9-NEXT: v_mov_b32_e32 v1, s5 8878; GFX9-NEXT: .LBB126_1: ; %atomicrmw.start 8879; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 8880; GFX9-NEXT: v_mov_b32_e32 v8, v1 8881; GFX9-NEXT: v_mov_b32_e32 v7, v0 8882; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[7:8] 8883; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc 8884; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc 8885; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] offset:32 glc 8886; GFX9-NEXT: s_waitcnt vmcnt(0) 8887; GFX9-NEXT: buffer_wbinvl1_vol 8888; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] 8889; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] 8890; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] 8891; GFX9-NEXT: s_cbranch_execnz .LBB126_1 8892; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 8893; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 8894; GFX9-NEXT: v_mov_b32_e32 v2, 0 8895; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[10:11] 8896; GFX9-NEXT: s_endpgm 8897entry: 8898 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index 8899 %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4 8900 %tmp0 = atomicrmw min ptr addrspace(1) %gep, i64 %in seq_cst 8901 store i64 %tmp0, ptr addrspace(1) %out2 8902 ret void 8903} 8904 8905define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { 8906; SI-LABEL: atomic_min_i64: 8907; SI: ; %bb.0: ; %entry 8908; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 8909; SI-NEXT: s_waitcnt lgkmcnt(0) 8910; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 8911; SI-NEXT: s_mov_b64 s[8:9], 0 8912; SI-NEXT: s_mov_b32 s7, 0xf000 8913; SI-NEXT: v_mov_b32_e32 v4, s3 8914; SI-NEXT: v_mov_b32_e32 v5, s2 8915; SI-NEXT: s_waitcnt lgkmcnt(0) 8916; SI-NEXT: v_mov_b32_e32 v2, s4 8917; SI-NEXT: v_mov_b32_e32 v3, s5 8918; SI-NEXT: s_mov_b32 s6, -1 8919; SI-NEXT: s_mov_b32 s4, s0 8920; SI-NEXT: s_mov_b32 s5, s1 8921; SI-NEXT: .LBB127_1: ; %atomicrmw.start 8922; SI-NEXT: ; =>This Inner Loop Header: Depth=1 8923; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] 8924; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc 8925; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc 8926; SI-NEXT: s_waitcnt expcnt(0) 8927; SI-NEXT: v_mov_b32_e32 v9, v3 8928; SI-NEXT: v_mov_b32_e32 v8, v2 8929; SI-NEXT: v_mov_b32_e32 v7, v1 8930; SI-NEXT: v_mov_b32_e32 v6, v0 8931; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc 8932; SI-NEXT: s_waitcnt vmcnt(0) 8933; SI-NEXT: buffer_wbinvl1 8934; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] 8935; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 8936; SI-NEXT: v_mov_b32_e32 v2, v6 8937; SI-NEXT: v_mov_b32_e32 v3, v7 8938; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] 8939; SI-NEXT: s_cbranch_execnz .LBB127_1 8940; SI-NEXT: ; %bb.2: ; %atomicrmw.end 8941; SI-NEXT: s_endpgm 8942; 8943; VI-LABEL: atomic_min_i64: 8944; VI: ; %bb.0: ; %entry 8945; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 8946; VI-NEXT: s_mov_b64 s[4:5], 0 8947; VI-NEXT: s_waitcnt lgkmcnt(0) 8948; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 8949; VI-NEXT: v_mov_b32_e32 v5, s1 8950; VI-NEXT: v_mov_b32_e32 v6, s3 8951; VI-NEXT: v_mov_b32_e32 v7, s2 8952; VI-NEXT: v_mov_b32_e32 v4, s0 8953; VI-NEXT: s_waitcnt lgkmcnt(0) 8954; VI-NEXT: v_mov_b32_e32 v2, s6 8955; VI-NEXT: v_mov_b32_e32 v3, s7 8956; VI-NEXT: .LBB127_1: ; %atomicrmw.start 8957; VI-NEXT: ; =>This Inner Loop Header: Depth=1 8958; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] 8959; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc 8960; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc 8961; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc 8962; VI-NEXT: s_waitcnt vmcnt(0) 8963; VI-NEXT: buffer_wbinvl1_vol 8964; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] 8965; VI-NEXT: v_mov_b32_e32 v3, v1 8966; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 8967; VI-NEXT: v_mov_b32_e32 v2, v0 8968; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] 8969; VI-NEXT: s_cbranch_execnz .LBB127_1 8970; VI-NEXT: ; %bb.2: ; %atomicrmw.end 8971; VI-NEXT: s_endpgm 8972; 8973; GFX9-LABEL: atomic_min_i64: 8974; GFX9: ; %bb.0: ; %entry 8975; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 8976; GFX9-NEXT: s_mov_b64 s[4:5], 0 8977; GFX9-NEXT: v_mov_b32_e32 v6, 0 8978; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8979; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 8980; GFX9-NEXT: v_mov_b32_e32 v4, s3 8981; GFX9-NEXT: v_mov_b32_e32 v5, s2 8982; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8983; GFX9-NEXT: v_mov_b32_e32 v2, s6 8984; GFX9-NEXT: v_mov_b32_e32 v3, s7 8985; GFX9-NEXT: .LBB127_1: ; %atomicrmw.start 8986; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 8987; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] 8988; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc 8989; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc 8990; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc 8991; GFX9-NEXT: s_waitcnt vmcnt(0) 8992; GFX9-NEXT: buffer_wbinvl1_vol 8993; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] 8994; GFX9-NEXT: v_mov_b32_e32 v3, v1 8995; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 8996; GFX9-NEXT: v_mov_b32_e32 v2, v0 8997; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 8998; GFX9-NEXT: s_cbranch_execnz .LBB127_1 8999; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 9000; GFX9-NEXT: s_endpgm 9001entry: 9002 %tmp0 = atomicrmw min ptr addrspace(1) %out, i64 %in seq_cst 9003 ret void 9004} 9005 9006define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { 9007; SI-LABEL: atomic_min_i64_ret_addr64: 9008; SI: ; %bb.0: ; %entry 9009; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 9010; SI-NEXT: s_waitcnt lgkmcnt(0) 9011; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 9012; SI-NEXT: s_add_u32 s8, s0, s6 9013; SI-NEXT: s_addc_u32 s9, s1, s7 9014; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 9015; SI-NEXT: s_mov_b64 s[0:1], 0 9016; SI-NEXT: s_mov_b32 s11, 0xf000 9017; SI-NEXT: v_mov_b32_e32 v8, s5 9018; SI-NEXT: v_mov_b32_e32 v9, s4 9019; SI-NEXT: s_waitcnt lgkmcnt(0) 9020; SI-NEXT: v_mov_b32_e32 v2, s6 9021; SI-NEXT: v_mov_b32_e32 v3, s7 9022; SI-NEXT: s_mov_b32 s10, -1 9023; SI-NEXT: .LBB128_1: ; %atomicrmw.start 9024; SI-NEXT: ; =>This Inner Loop Header: Depth=1 9025; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] 9026; SI-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc 9027; SI-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc 9028; SI-NEXT: s_waitcnt expcnt(0) 9029; SI-NEXT: v_mov_b32_e32 v7, v3 9030; SI-NEXT: v_mov_b32_e32 v6, v2 9031; SI-NEXT: v_mov_b32_e32 v5, v1 9032; SI-NEXT: v_mov_b32_e32 v4, v0 9033; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[8:11], 0 glc 9034; SI-NEXT: s_waitcnt vmcnt(0) 9035; SI-NEXT: buffer_wbinvl1 9036; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] 9037; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 9038; SI-NEXT: v_mov_b32_e32 v2, v4 9039; SI-NEXT: v_mov_b32_e32 v3, v5 9040; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] 9041; SI-NEXT: s_cbranch_execnz .LBB128_1 9042; SI-NEXT: ; %bb.2: ; %atomicrmw.end 9043; SI-NEXT: s_or_b64 exec, exec, s[0:1] 9044; SI-NEXT: s_mov_b32 s7, 0xf000 9045; SI-NEXT: s_mov_b32 s6, -1 9046; SI-NEXT: s_mov_b32 s4, s2 9047; SI-NEXT: s_mov_b32 s5, s3 9048; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0 9049; SI-NEXT: s_endpgm 9050; 9051; VI-LABEL: atomic_min_i64_ret_addr64: 9052; VI: ; %bb.0: ; %entry 9053; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 9054; VI-NEXT: s_waitcnt lgkmcnt(0) 9055; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 9056; VI-NEXT: s_add_u32 s6, s0, s6 9057; VI-NEXT: s_addc_u32 s7, s1, s7 9058; VI-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 9059; VI-NEXT: v_mov_b32_e32 v0, s6 9060; VI-NEXT: s_mov_b64 s[0:1], 0 9061; VI-NEXT: v_mov_b32_e32 v4, s5 9062; VI-NEXT: v_mov_b32_e32 v5, s4 9063; VI-NEXT: s_waitcnt lgkmcnt(0) 9064; VI-NEXT: v_mov_b32_e32 v2, s8 9065; VI-NEXT: v_mov_b32_e32 v3, s9 9066; VI-NEXT: v_mov_b32_e32 v1, s7 9067; VI-NEXT: .LBB128_1: ; %atomicrmw.start 9068; VI-NEXT: ; =>This Inner Loop Header: Depth=1 9069; VI-NEXT: v_mov_b32_e32 v9, v3 9070; VI-NEXT: v_mov_b32_e32 v8, v2 9071; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] 9072; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc 9073; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc 9074; VI-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc 9075; VI-NEXT: s_waitcnt vmcnt(0) 9076; VI-NEXT: buffer_wbinvl1_vol 9077; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] 9078; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 9079; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] 9080; VI-NEXT: s_cbranch_execnz .LBB128_1 9081; VI-NEXT: ; %bb.2: ; %atomicrmw.end 9082; VI-NEXT: s_or_b64 exec, exec, s[0:1] 9083; VI-NEXT: v_mov_b32_e32 v0, s2 9084; VI-NEXT: v_mov_b32_e32 v1, s3 9085; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 9086; VI-NEXT: s_endpgm 9087; 9088; GFX9-LABEL: atomic_min_i64_ret_addr64: 9089; GFX9: ; %bb.0: ; %entry 9090; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 9091; GFX9-NEXT: s_mov_b64 s[2:3], 0 9092; GFX9-NEXT: v_mov_b32_e32 v4, 0 9093; GFX9-NEXT: s_waitcnt lgkmcnt(0) 9094; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 9095; GFX9-NEXT: s_add_u32 s0, s8, s0 9096; GFX9-NEXT: s_addc_u32 s1, s9, s1 9097; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 9098; GFX9-NEXT: v_mov_b32_e32 v2, s13 9099; GFX9-NEXT: v_mov_b32_e32 v3, s12 9100; GFX9-NEXT: s_waitcnt lgkmcnt(0) 9101; GFX9-NEXT: v_mov_b32_e32 v0, s4 9102; GFX9-NEXT: v_mov_b32_e32 v1, s5 9103; GFX9-NEXT: .LBB128_1: ; %atomicrmw.start 9104; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 9105; GFX9-NEXT: v_mov_b32_e32 v8, v1 9106; GFX9-NEXT: v_mov_b32_e32 v7, v0 9107; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[7:8] 9108; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc 9109; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc 9110; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] glc 9111; GFX9-NEXT: s_waitcnt vmcnt(0) 9112; GFX9-NEXT: buffer_wbinvl1_vol 9113; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] 9114; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] 9115; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] 9116; GFX9-NEXT: s_cbranch_execnz .LBB128_1 9117; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 9118; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 9119; GFX9-NEXT: v_mov_b32_e32 v2, 0 9120; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[10:11] 9121; GFX9-NEXT: s_endpgm 9122entry: 9123 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index 9124 %tmp0 = atomicrmw min ptr addrspace(1) %ptr, i64 %in seq_cst 9125 store i64 %tmp0, ptr addrspace(1) %out2 9126 ret void 9127} 9128 9129define void @global_atomic_min_i64_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) { 9130; SI-LABEL: global_atomic_min_i64_noret_offset__amdgpu_no_remote_memory: 9131; SI: ; %bb.0: 9132; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9133; SI-NEXT: s_mov_b32 s6, 0 9134; SI-NEXT: s_mov_b32 s7, 0xf000 9135; SI-NEXT: s_mov_b32 s4, s6 9136; SI-NEXT: s_mov_b32 s5, s6 9137; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 9138; SI-NEXT: s_mov_b64 s[8:9], 0 9139; SI-NEXT: .LBB129_1: ; %atomicrmw.start 9140; SI-NEXT: ; =>This Inner Loop Header: Depth=1 9141; SI-NEXT: s_waitcnt vmcnt(0) 9142; SI-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] 9143; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc 9144; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 9145; SI-NEXT: s_waitcnt expcnt(0) 9146; SI-NEXT: v_mov_b32_e32 v11, v7 9147; SI-NEXT: v_mov_b32_e32 v10, v6 9148; SI-NEXT: v_mov_b32_e32 v9, v5 9149; SI-NEXT: v_mov_b32_e32 v8, v4 9150; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc 9151; SI-NEXT: s_waitcnt vmcnt(0) 9152; SI-NEXT: buffer_wbinvl1 9153; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] 9154; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 9155; SI-NEXT: v_mov_b32_e32 v6, v8 9156; SI-NEXT: v_mov_b32_e32 v7, v9 9157; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] 9158; SI-NEXT: s_cbranch_execnz .LBB129_1 9159; SI-NEXT: ; %bb.2: ; %atomicrmw.end 9160; SI-NEXT: s_or_b64 exec, exec, s[8:9] 9161; SI-NEXT: s_waitcnt expcnt(0) 9162; SI-NEXT: s_setpc_b64 s[30:31] 9163; 9164; VI-LABEL: global_atomic_min_i64_noret_offset__amdgpu_no_remote_memory: 9165; VI: ; %bb.0: 9166; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9167; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 9168; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 9169; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] 9170; VI-NEXT: s_mov_b64 s[4:5], 0 9171; VI-NEXT: .LBB129_1: ; %atomicrmw.start 9172; VI-NEXT: ; =>This Inner Loop Header: Depth=1 9173; VI-NEXT: s_waitcnt vmcnt(0) 9174; VI-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] 9175; VI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc 9176; VI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 9177; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc 9178; VI-NEXT: s_waitcnt vmcnt(0) 9179; VI-NEXT: buffer_wbinvl1_vol 9180; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 9181; VI-NEXT: v_mov_b32_e32 v7, v5 9182; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 9183; VI-NEXT: v_mov_b32_e32 v6, v4 9184; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] 9185; VI-NEXT: s_cbranch_execnz .LBB129_1 9186; VI-NEXT: ; %bb.2: ; %atomicrmw.end 9187; VI-NEXT: s_or_b64 exec, exec, s[4:5] 9188; VI-NEXT: s_setpc_b64 s[30:31] 9189; 9190; GFX9-LABEL: global_atomic_min_i64_noret_offset__amdgpu_no_remote_memory: 9191; GFX9: ; %bb.0: 9192; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9193; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 9194; GFX9-NEXT: s_mov_b64 s[4:5], 0 9195; GFX9-NEXT: .LBB129_1: ; %atomicrmw.start 9196; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 9197; GFX9-NEXT: s_waitcnt vmcnt(0) 9198; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] 9199; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc 9200; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 9201; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc 9202; GFX9-NEXT: s_waitcnt vmcnt(0) 9203; GFX9-NEXT: buffer_wbinvl1_vol 9204; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 9205; GFX9-NEXT: v_mov_b32_e32 v7, v5 9206; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 9207; GFX9-NEXT: v_mov_b32_e32 v6, v4 9208; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 9209; GFX9-NEXT: s_cbranch_execnz .LBB129_1 9210; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 9211; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 9212; GFX9-NEXT: s_setpc_b64 s[30:31] 9213 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 9214 %tmp0 = atomicrmw min ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 9215 ret void 9216} 9217 9218define i64 @global_atomic_min_i64_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) { 9219; SI-LABEL: global_atomic_min_i64_ret_offset__amdgpu_no_remote_memory: 9220; SI: ; %bb.0: 9221; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9222; SI-NEXT: v_mov_b32_e32 v5, v3 9223; SI-NEXT: v_mov_b32_e32 v4, v2 9224; SI-NEXT: v_mov_b32_e32 v7, v1 9225; SI-NEXT: v_mov_b32_e32 v6, v0 9226; SI-NEXT: s_mov_b32 s6, 0 9227; SI-NEXT: s_mov_b32 s7, 0xf000 9228; SI-NEXT: s_mov_b32 s4, s6 9229; SI-NEXT: s_mov_b32 s5, s6 9230; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32 9231; SI-NEXT: s_mov_b64 s[8:9], 0 9232; SI-NEXT: .LBB130_1: ; %atomicrmw.start 9233; SI-NEXT: ; =>This Inner Loop Header: Depth=1 9234; SI-NEXT: s_waitcnt vmcnt(0) 9235; SI-NEXT: v_mov_b32_e32 v11, v1 9236; SI-NEXT: v_mov_b32_e32 v10, v0 9237; SI-NEXT: v_cmp_le_i64_e32 vcc, v[10:11], v[4:5] 9238; SI-NEXT: v_cndmask_b32_e32 v9, v5, v11, vcc 9239; SI-NEXT: v_cndmask_b32_e32 v8, v4, v10, vcc 9240; SI-NEXT: s_waitcnt expcnt(0) 9241; SI-NEXT: v_mov_b32_e32 v0, v8 9242; SI-NEXT: v_mov_b32_e32 v1, v9 9243; SI-NEXT: v_mov_b32_e32 v2, v10 9244; SI-NEXT: v_mov_b32_e32 v3, v11 9245; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:32 glc 9246; SI-NEXT: s_waitcnt vmcnt(0) 9247; SI-NEXT: buffer_wbinvl1 9248; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] 9249; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 9250; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] 9251; SI-NEXT: s_cbranch_execnz .LBB130_1 9252; SI-NEXT: ; %bb.2: ; %atomicrmw.end 9253; SI-NEXT: s_or_b64 exec, exec, s[8:9] 9254; SI-NEXT: s_waitcnt expcnt(0) 9255; SI-NEXT: s_setpc_b64 s[30:31] 9256; 9257; VI-LABEL: global_atomic_min_i64_ret_offset__amdgpu_no_remote_memory: 9258; VI: ; %bb.0: 9259; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9260; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0 9261; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc 9262; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] 9263; VI-NEXT: s_mov_b64 s[4:5], 0 9264; VI-NEXT: .LBB130_1: ; %atomicrmw.start 9265; VI-NEXT: ; =>This Inner Loop Header: Depth=1 9266; VI-NEXT: s_waitcnt vmcnt(0) 9267; VI-NEXT: v_mov_b32_e32 v9, v1 9268; VI-NEXT: v_mov_b32_e32 v8, v0 9269; VI-NEXT: v_cmp_le_i64_e32 vcc, v[8:9], v[2:3] 9270; VI-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc 9271; VI-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc 9272; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc 9273; VI-NEXT: s_waitcnt vmcnt(0) 9274; VI-NEXT: buffer_wbinvl1_vol 9275; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] 9276; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 9277; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] 9278; VI-NEXT: s_cbranch_execnz .LBB130_1 9279; VI-NEXT: ; %bb.2: ; %atomicrmw.end 9280; VI-NEXT: s_or_b64 exec, exec, s[4:5] 9281; VI-NEXT: s_setpc_b64 s[30:31] 9282; 9283; GFX9-LABEL: global_atomic_min_i64_ret_offset__amdgpu_no_remote_memory: 9284; GFX9: ; %bb.0: 9285; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9286; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 9287; GFX9-NEXT: s_mov_b64 s[4:5], 0 9288; GFX9-NEXT: .LBB130_1: ; %atomicrmw.start 9289; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 9290; GFX9-NEXT: s_waitcnt vmcnt(0) 9291; GFX9-NEXT: v_mov_b32_e32 v7, v5 9292; GFX9-NEXT: v_mov_b32_e32 v6, v4 9293; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] 9294; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc 9295; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 9296; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc 9297; GFX9-NEXT: s_waitcnt vmcnt(0) 9298; GFX9-NEXT: buffer_wbinvl1_vol 9299; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 9300; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 9301; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] 9302; GFX9-NEXT: s_cbranch_execnz .LBB130_1 9303; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end 9304; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 9305; GFX9-NEXT: v_mov_b32_e32 v0, v4 9306; GFX9-NEXT: v_mov_b32_e32 v1, v5 9307; GFX9-NEXT: s_setpc_b64 s[30:31] 9308 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 9309 %result = atomicrmw min ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 9310 ret i64 %result 9311} 9312 9313; --------------------------------------------------------------------- 9314; atomicrmw uinc_wrap 9315; --------------------------------------------------------------------- 9316 9317define void @global_atomic_uinc_wrap_i64_noret(ptr addrspace(1) %ptr, i64 %in) { 9318; SI-LABEL: global_atomic_uinc_wrap_i64_noret: 9319; SI: ; %bb.0: 9320; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9321; SI-NEXT: s_mov_b32 s6, 0 9322; SI-NEXT: s_mov_b32 s7, 0xf000 9323; SI-NEXT: s_mov_b32 s4, s6 9324; SI-NEXT: s_mov_b32 s5, s6 9325; SI-NEXT: buffer_atomic_inc_x2 v[2:3], v[0:1], s[4:7], 0 addr64 9326; SI-NEXT: s_waitcnt vmcnt(0) 9327; SI-NEXT: buffer_wbinvl1 9328; SI-NEXT: s_waitcnt expcnt(0) 9329; SI-NEXT: s_setpc_b64 s[30:31] 9330; 9331; VI-LABEL: global_atomic_uinc_wrap_i64_noret: 9332; VI: ; %bb.0: 9333; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9334; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] 9335; VI-NEXT: s_waitcnt vmcnt(0) 9336; VI-NEXT: buffer_wbinvl1_vol 9337; VI-NEXT: s_setpc_b64 s[30:31] 9338; 9339; GFX9-LABEL: global_atomic_uinc_wrap_i64_noret: 9340; GFX9: ; %bb.0: 9341; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9342; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v[2:3], off 9343; GFX9-NEXT: s_waitcnt vmcnt(0) 9344; GFX9-NEXT: buffer_wbinvl1_vol 9345; GFX9-NEXT: s_setpc_b64 s[30:31] 9346 %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 %in seq_cst 9347 ret void 9348} 9349 9350define void @global_atomic_uinc_wrap_i64_noret_offset(ptr addrspace(1) %out, i64 %in) { 9351; SI-LABEL: global_atomic_uinc_wrap_i64_noret_offset: 9352; SI: ; %bb.0: 9353; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9354; SI-NEXT: s_mov_b32 s6, 0 9355; SI-NEXT: s_mov_b32 s7, 0xf000 9356; SI-NEXT: s_mov_b32 s4, s6 9357; SI-NEXT: s_mov_b32 s5, s6 9358; SI-NEXT: buffer_atomic_inc_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 9359; SI-NEXT: s_waitcnt vmcnt(0) 9360; SI-NEXT: buffer_wbinvl1 9361; SI-NEXT: s_waitcnt expcnt(0) 9362; SI-NEXT: s_setpc_b64 s[30:31] 9363; 9364; VI-LABEL: global_atomic_uinc_wrap_i64_noret_offset: 9365; VI: ; %bb.0: 9366; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9367; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 9368; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 9369; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] 9370; VI-NEXT: s_waitcnt vmcnt(0) 9371; VI-NEXT: buffer_wbinvl1_vol 9372; VI-NEXT: s_setpc_b64 s[30:31] 9373; 9374; GFX9-LABEL: global_atomic_uinc_wrap_i64_noret_offset: 9375; GFX9: ; %bb.0: 9376; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9377; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v[2:3], off offset:32 9378; GFX9-NEXT: s_waitcnt vmcnt(0) 9379; GFX9-NEXT: buffer_wbinvl1_vol 9380; GFX9-NEXT: s_setpc_b64 s[30:31] 9381 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 9382 %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 %in seq_cst 9383 ret void 9384} 9385 9386define i64 @global_atomic_uinc_wrap_i64_ret(ptr addrspace(1) %ptr, i64 %in) { 9387; SI-LABEL: global_atomic_uinc_wrap_i64_ret: 9388; SI: ; %bb.0: 9389; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9390; SI-NEXT: s_mov_b32 s6, 0 9391; SI-NEXT: s_mov_b32 s7, 0xf000 9392; SI-NEXT: s_mov_b32 s4, s6 9393; SI-NEXT: s_mov_b32 s5, s6 9394; SI-NEXT: buffer_atomic_inc_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc 9395; SI-NEXT: s_waitcnt vmcnt(0) 9396; SI-NEXT: buffer_wbinvl1 9397; SI-NEXT: v_mov_b32_e32 v0, v2 9398; SI-NEXT: v_mov_b32_e32 v1, v3 9399; SI-NEXT: s_waitcnt expcnt(0) 9400; SI-NEXT: s_setpc_b64 s[30:31] 9401; 9402; VI-LABEL: global_atomic_uinc_wrap_i64_ret: 9403; VI: ; %bb.0: 9404; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9405; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc 9406; VI-NEXT: s_waitcnt vmcnt(0) 9407; VI-NEXT: buffer_wbinvl1_vol 9408; VI-NEXT: s_setpc_b64 s[30:31] 9409; 9410; GFX9-LABEL: global_atomic_uinc_wrap_i64_ret: 9411; GFX9: ; %bb.0: 9412; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9413; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v[0:1], v[2:3], off glc 9414; GFX9-NEXT: s_waitcnt vmcnt(0) 9415; GFX9-NEXT: buffer_wbinvl1_vol 9416; GFX9-NEXT: s_setpc_b64 s[30:31] 9417 %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 %in seq_cst 9418 ret i64 %result 9419} 9420 9421define i64 @global_atomic_uinc_wrap_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { 9422; SI-LABEL: global_atomic_uinc_wrap_i64_ret_offset: 9423; SI: ; %bb.0: 9424; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9425; SI-NEXT: s_mov_b32 s6, 0 9426; SI-NEXT: s_mov_b32 s7, 0xf000 9427; SI-NEXT: s_mov_b32 s4, s6 9428; SI-NEXT: s_mov_b32 s5, s6 9429; SI-NEXT: buffer_atomic_inc_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc 9430; SI-NEXT: s_waitcnt vmcnt(0) 9431; SI-NEXT: buffer_wbinvl1 9432; SI-NEXT: v_mov_b32_e32 v0, v2 9433; SI-NEXT: v_mov_b32_e32 v1, v3 9434; SI-NEXT: s_waitcnt expcnt(0) 9435; SI-NEXT: s_setpc_b64 s[30:31] 9436; 9437; VI-LABEL: global_atomic_uinc_wrap_i64_ret_offset: 9438; VI: ; %bb.0: 9439; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9440; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 9441; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 9442; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc 9443; VI-NEXT: s_waitcnt vmcnt(0) 9444; VI-NEXT: buffer_wbinvl1_vol 9445; VI-NEXT: s_setpc_b64 s[30:31] 9446; 9447; GFX9-LABEL: global_atomic_uinc_wrap_i64_ret_offset: 9448; GFX9: ; %bb.0: 9449; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9450; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc 9451; GFX9-NEXT: s_waitcnt vmcnt(0) 9452; GFX9-NEXT: buffer_wbinvl1_vol 9453; GFX9-NEXT: s_setpc_b64 s[30:31] 9454 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 9455 %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 %in seq_cst 9456 ret i64 %result 9457} 9458 9459define amdgpu_gfx void @global_atomic_uinc_wrap_i64_noret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) { 9460; SI-LABEL: global_atomic_uinc_wrap_i64_noret_scalar: 9461; SI: ; %bb.0: 9462; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9463; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 9464; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill 9465; SI-NEXT: s_mov_b64 exec, s[34:35] 9466; SI-NEXT: s_waitcnt expcnt(0) 9467; SI-NEXT: v_writelane_b32 v2, s6, 0 9468; SI-NEXT: v_writelane_b32 v2, s7, 1 9469; SI-NEXT: s_mov_b32 s34, s7 9470; SI-NEXT: s_mov_b32 s35, s6 9471; SI-NEXT: s_mov_b32 s7, 0xf000 9472; SI-NEXT: s_mov_b32 s6, -1 9473; SI-NEXT: v_mov_b32_e32 v0, s35 9474; SI-NEXT: v_mov_b32_e32 v1, s34 9475; SI-NEXT: s_waitcnt vmcnt(0) 9476; SI-NEXT: buffer_atomic_inc_x2 v[0:1], off, s[4:7], 0 9477; SI-NEXT: s_waitcnt vmcnt(0) 9478; SI-NEXT: buffer_wbinvl1 9479; SI-NEXT: v_readlane_b32 s7, v2, 1 9480; SI-NEXT: v_readlane_b32 s6, v2, 0 9481; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 9482; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload 9483; SI-NEXT: s_mov_b64 exec, s[34:35] 9484; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 9485; SI-NEXT: s_setpc_b64 s[30:31] 9486; 9487; VI-LABEL: global_atomic_uinc_wrap_i64_noret_scalar: 9488; VI: ; %bb.0: 9489; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9490; VI-NEXT: v_mov_b32_e32 v0, s6 9491; VI-NEXT: v_mov_b32_e32 v1, s7 9492; VI-NEXT: v_mov_b32_e32 v2, s4 9493; VI-NEXT: v_mov_b32_e32 v3, s5 9494; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] 9495; VI-NEXT: s_waitcnt vmcnt(0) 9496; VI-NEXT: buffer_wbinvl1_vol 9497; VI-NEXT: s_setpc_b64 s[30:31] 9498; 9499; GFX9-LABEL: global_atomic_uinc_wrap_i64_noret_scalar: 9500; GFX9: ; %bb.0: 9501; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9502; GFX9-NEXT: v_mov_b32_e32 v0, s6 9503; GFX9-NEXT: v_mov_b32_e32 v1, s7 9504; GFX9-NEXT: v_mov_b32_e32 v2, 0 9505; GFX9-NEXT: global_atomic_inc_x2 v2, v[0:1], s[4:5] 9506; GFX9-NEXT: s_waitcnt vmcnt(0) 9507; GFX9-NEXT: buffer_wbinvl1_vol 9508; GFX9-NEXT: s_setpc_b64 s[30:31] 9509 %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 %in seq_cst 9510 ret void 9511} 9512 9513define amdgpu_gfx void @global_atomic_uinc_wrap_i64_noret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) { 9514; SI-LABEL: global_atomic_uinc_wrap_i64_noret_offset_scalar: 9515; SI: ; %bb.0: 9516; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9517; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 9518; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill 9519; SI-NEXT: s_mov_b64 exec, s[34:35] 9520; SI-NEXT: s_waitcnt expcnt(0) 9521; SI-NEXT: v_writelane_b32 v2, s6, 0 9522; SI-NEXT: v_writelane_b32 v2, s7, 1 9523; SI-NEXT: v_mov_b32_e32 v0, s6 9524; SI-NEXT: v_mov_b32_e32 v1, s7 9525; SI-NEXT: s_mov_b32 s7, 0xf000 9526; SI-NEXT: s_mov_b32 s6, -1 9527; SI-NEXT: s_waitcnt vmcnt(0) 9528; SI-NEXT: buffer_atomic_inc_x2 v[0:1], off, s[4:7], 0 offset:32 9529; SI-NEXT: s_waitcnt vmcnt(0) 9530; SI-NEXT: buffer_wbinvl1 9531; SI-NEXT: v_readlane_b32 s7, v2, 1 9532; SI-NEXT: v_readlane_b32 s6, v2, 0 9533; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 9534; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload 9535; SI-NEXT: s_mov_b64 exec, s[34:35] 9536; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 9537; SI-NEXT: s_setpc_b64 s[30:31] 9538; 9539; VI-LABEL: global_atomic_uinc_wrap_i64_noret_offset_scalar: 9540; VI: ; %bb.0: 9541; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9542; VI-NEXT: s_add_u32 s34, s4, 32 9543; VI-NEXT: s_addc_u32 s35, s5, 0 9544; VI-NEXT: v_mov_b32_e32 v2, s34 9545; VI-NEXT: v_mov_b32_e32 v0, s6 9546; VI-NEXT: v_mov_b32_e32 v1, s7 9547; VI-NEXT: v_mov_b32_e32 v3, s35 9548; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] 9549; VI-NEXT: s_waitcnt vmcnt(0) 9550; VI-NEXT: buffer_wbinvl1_vol 9551; VI-NEXT: s_setpc_b64 s[30:31] 9552; 9553; GFX9-LABEL: global_atomic_uinc_wrap_i64_noret_offset_scalar: 9554; GFX9: ; %bb.0: 9555; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9556; GFX9-NEXT: v_mov_b32_e32 v0, s6 9557; GFX9-NEXT: v_mov_b32_e32 v1, s7 9558; GFX9-NEXT: v_mov_b32_e32 v2, 0 9559; GFX9-NEXT: global_atomic_inc_x2 v2, v[0:1], s[4:5] offset:32 9560; GFX9-NEXT: s_waitcnt vmcnt(0) 9561; GFX9-NEXT: buffer_wbinvl1_vol 9562; GFX9-NEXT: s_setpc_b64 s[30:31] 9563 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 9564 %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 %in seq_cst 9565 ret void 9566} 9567 9568define amdgpu_gfx i64 @global_atomic_uinc_wrap_i64_ret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) { 9569; SI-LABEL: global_atomic_uinc_wrap_i64_ret_scalar: 9570; SI: ; %bb.0: 9571; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9572; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 9573; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill 9574; SI-NEXT: s_mov_b64 exec, s[34:35] 9575; SI-NEXT: s_waitcnt expcnt(0) 9576; SI-NEXT: v_writelane_b32 v2, s6, 0 9577; SI-NEXT: v_writelane_b32 v2, s7, 1 9578; SI-NEXT: s_mov_b32 s34, s7 9579; SI-NEXT: s_mov_b32 s35, s6 9580; SI-NEXT: s_mov_b32 s7, 0xf000 9581; SI-NEXT: s_mov_b32 s6, -1 9582; SI-NEXT: v_mov_b32_e32 v0, s35 9583; SI-NEXT: v_mov_b32_e32 v1, s34 9584; SI-NEXT: s_waitcnt vmcnt(0) 9585; SI-NEXT: buffer_atomic_inc_x2 v[0:1], off, s[4:7], 0 glc 9586; SI-NEXT: s_waitcnt vmcnt(0) 9587; SI-NEXT: buffer_wbinvl1 9588; SI-NEXT: v_readlane_b32 s7, v2, 1 9589; SI-NEXT: v_readlane_b32 s6, v2, 0 9590; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 9591; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload 9592; SI-NEXT: s_mov_b64 exec, s[34:35] 9593; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 9594; SI-NEXT: s_setpc_b64 s[30:31] 9595; 9596; VI-LABEL: global_atomic_uinc_wrap_i64_ret_scalar: 9597; VI: ; %bb.0: 9598; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9599; VI-NEXT: v_mov_b32_e32 v0, s6 9600; VI-NEXT: v_mov_b32_e32 v1, s7 9601; VI-NEXT: v_mov_b32_e32 v2, s4 9602; VI-NEXT: v_mov_b32_e32 v3, s5 9603; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc 9604; VI-NEXT: s_waitcnt vmcnt(0) 9605; VI-NEXT: buffer_wbinvl1_vol 9606; VI-NEXT: s_setpc_b64 s[30:31] 9607; 9608; GFX9-LABEL: global_atomic_uinc_wrap_i64_ret_scalar: 9609; GFX9: ; %bb.0: 9610; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9611; GFX9-NEXT: v_mov_b32_e32 v0, s6 9612; GFX9-NEXT: v_mov_b32_e32 v1, s7 9613; GFX9-NEXT: v_mov_b32_e32 v2, 0 9614; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[4:5] glc 9615; GFX9-NEXT: s_waitcnt vmcnt(0) 9616; GFX9-NEXT: buffer_wbinvl1_vol 9617; GFX9-NEXT: s_setpc_b64 s[30:31] 9618 %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 %in seq_cst 9619 ret i64 %result 9620} 9621 9622define amdgpu_gfx i64 @global_atomic_uinc_wrap_i64_ret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) { 9623; SI-LABEL: global_atomic_uinc_wrap_i64_ret_offset_scalar: 9624; SI: ; %bb.0: 9625; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9626; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 9627; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill 9628; SI-NEXT: s_mov_b64 exec, s[34:35] 9629; SI-NEXT: s_waitcnt expcnt(0) 9630; SI-NEXT: v_writelane_b32 v2, s6, 0 9631; SI-NEXT: v_writelane_b32 v2, s7, 1 9632; SI-NEXT: v_mov_b32_e32 v0, s6 9633; SI-NEXT: v_mov_b32_e32 v1, s7 9634; SI-NEXT: s_mov_b32 s7, 0xf000 9635; SI-NEXT: s_mov_b32 s6, -1 9636; SI-NEXT: s_waitcnt vmcnt(0) 9637; SI-NEXT: buffer_atomic_inc_x2 v[0:1], off, s[4:7], 0 offset:32 glc 9638; SI-NEXT: s_waitcnt vmcnt(0) 9639; SI-NEXT: buffer_wbinvl1 9640; SI-NEXT: v_readlane_b32 s7, v2, 1 9641; SI-NEXT: v_readlane_b32 s6, v2, 0 9642; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 9643; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload 9644; SI-NEXT: s_mov_b64 exec, s[34:35] 9645; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 9646; SI-NEXT: s_setpc_b64 s[30:31] 9647; 9648; VI-LABEL: global_atomic_uinc_wrap_i64_ret_offset_scalar: 9649; VI: ; %bb.0: 9650; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9651; VI-NEXT: s_add_u32 s34, s4, 32 9652; VI-NEXT: s_addc_u32 s35, s5, 0 9653; VI-NEXT: v_mov_b32_e32 v2, s34 9654; VI-NEXT: v_mov_b32_e32 v0, s6 9655; VI-NEXT: v_mov_b32_e32 v1, s7 9656; VI-NEXT: v_mov_b32_e32 v3, s35 9657; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc 9658; VI-NEXT: s_waitcnt vmcnt(0) 9659; VI-NEXT: buffer_wbinvl1_vol 9660; VI-NEXT: s_setpc_b64 s[30:31] 9661; 9662; GFX9-LABEL: global_atomic_uinc_wrap_i64_ret_offset_scalar: 9663; GFX9: ; %bb.0: 9664; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9665; GFX9-NEXT: v_mov_b32_e32 v0, s6 9666; GFX9-NEXT: v_mov_b32_e32 v1, s7 9667; GFX9-NEXT: v_mov_b32_e32 v2, 0 9668; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc 9669; GFX9-NEXT: s_waitcnt vmcnt(0) 9670; GFX9-NEXT: buffer_wbinvl1_vol 9671; GFX9-NEXT: s_setpc_b64 s[30:31] 9672 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 9673 %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 %in seq_cst 9674 ret i64 %result 9675} 9676 9677define void @global_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) { 9678; SI-LABEL: global_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory: 9679; SI: ; %bb.0: 9680; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9681; SI-NEXT: s_mov_b32 s6, 0 9682; SI-NEXT: s_mov_b32 s7, 0xf000 9683; SI-NEXT: s_mov_b32 s4, s6 9684; SI-NEXT: s_mov_b32 s5, s6 9685; SI-NEXT: buffer_atomic_inc_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 9686; SI-NEXT: s_waitcnt vmcnt(0) 9687; SI-NEXT: buffer_wbinvl1 9688; SI-NEXT: s_waitcnt expcnt(0) 9689; SI-NEXT: s_setpc_b64 s[30:31] 9690; 9691; VI-LABEL: global_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory: 9692; VI: ; %bb.0: 9693; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9694; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 9695; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 9696; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] 9697; VI-NEXT: s_waitcnt vmcnt(0) 9698; VI-NEXT: buffer_wbinvl1_vol 9699; VI-NEXT: s_setpc_b64 s[30:31] 9700; 9701; GFX9-LABEL: global_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory: 9702; GFX9: ; %bb.0: 9703; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9704; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v[2:3], off offset:32 9705; GFX9-NEXT: s_waitcnt vmcnt(0) 9706; GFX9-NEXT: buffer_wbinvl1_vol 9707; GFX9-NEXT: s_setpc_b64 s[30:31] 9708 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 9709 %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 9710 ret void 9711} 9712 9713define i64 @global_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) { 9714; SI-LABEL: global_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory: 9715; SI: ; %bb.0: 9716; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9717; SI-NEXT: s_mov_b32 s6, 0 9718; SI-NEXT: s_mov_b32 s7, 0xf000 9719; SI-NEXT: s_mov_b32 s4, s6 9720; SI-NEXT: s_mov_b32 s5, s6 9721; SI-NEXT: buffer_atomic_inc_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc 9722; SI-NEXT: s_waitcnt vmcnt(0) 9723; SI-NEXT: buffer_wbinvl1 9724; SI-NEXT: v_mov_b32_e32 v0, v2 9725; SI-NEXT: v_mov_b32_e32 v1, v3 9726; SI-NEXT: s_waitcnt expcnt(0) 9727; SI-NEXT: s_setpc_b64 s[30:31] 9728; 9729; VI-LABEL: global_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory: 9730; VI: ; %bb.0: 9731; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9732; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 9733; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 9734; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc 9735; VI-NEXT: s_waitcnt vmcnt(0) 9736; VI-NEXT: buffer_wbinvl1_vol 9737; VI-NEXT: s_setpc_b64 s[30:31] 9738; 9739; GFX9-LABEL: global_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory: 9740; GFX9: ; %bb.0: 9741; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9742; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc 9743; GFX9-NEXT: s_waitcnt vmcnt(0) 9744; GFX9-NEXT: buffer_wbinvl1_vol 9745; GFX9-NEXT: s_setpc_b64 s[30:31] 9746 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 9747 %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 9748 ret i64 %result 9749} 9750 9751; --------------------------------------------------------------------- 9752; atomicrmw udec_wrap 9753; --------------------------------------------------------------------- 9754 9755define void @global_atomic_udec_wrap_i64_noret(ptr addrspace(1) %ptr, i64 %in) { 9756; SI-LABEL: global_atomic_udec_wrap_i64_noret: 9757; SI: ; %bb.0: 9758; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9759; SI-NEXT: s_mov_b32 s6, 0 9760; SI-NEXT: s_mov_b32 s7, 0xf000 9761; SI-NEXT: s_mov_b32 s4, s6 9762; SI-NEXT: s_mov_b32 s5, s6 9763; SI-NEXT: buffer_atomic_dec_x2 v[2:3], v[0:1], s[4:7], 0 addr64 9764; SI-NEXT: s_waitcnt vmcnt(0) 9765; SI-NEXT: buffer_wbinvl1 9766; SI-NEXT: s_waitcnt expcnt(0) 9767; SI-NEXT: s_setpc_b64 s[30:31] 9768; 9769; VI-LABEL: global_atomic_udec_wrap_i64_noret: 9770; VI: ; %bb.0: 9771; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9772; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] 9773; VI-NEXT: s_waitcnt vmcnt(0) 9774; VI-NEXT: buffer_wbinvl1_vol 9775; VI-NEXT: s_setpc_b64 s[30:31] 9776; 9777; GFX9-LABEL: global_atomic_udec_wrap_i64_noret: 9778; GFX9: ; %bb.0: 9779; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9780; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v[2:3], off 9781; GFX9-NEXT: s_waitcnt vmcnt(0) 9782; GFX9-NEXT: buffer_wbinvl1_vol 9783; GFX9-NEXT: s_setpc_b64 s[30:31] 9784 %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 %in seq_cst 9785 ret void 9786} 9787 9788define void @global_atomic_udec_wrap_i64_noret_offset(ptr addrspace(1) %out, i64 %in) { 9789; SI-LABEL: global_atomic_udec_wrap_i64_noret_offset: 9790; SI: ; %bb.0: 9791; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9792; SI-NEXT: s_mov_b32 s6, 0 9793; SI-NEXT: s_mov_b32 s7, 0xf000 9794; SI-NEXT: s_mov_b32 s4, s6 9795; SI-NEXT: s_mov_b32 s5, s6 9796; SI-NEXT: buffer_atomic_dec_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 9797; SI-NEXT: s_waitcnt vmcnt(0) 9798; SI-NEXT: buffer_wbinvl1 9799; SI-NEXT: s_waitcnt expcnt(0) 9800; SI-NEXT: s_setpc_b64 s[30:31] 9801; 9802; VI-LABEL: global_atomic_udec_wrap_i64_noret_offset: 9803; VI: ; %bb.0: 9804; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9805; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 9806; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 9807; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] 9808; VI-NEXT: s_waitcnt vmcnt(0) 9809; VI-NEXT: buffer_wbinvl1_vol 9810; VI-NEXT: s_setpc_b64 s[30:31] 9811; 9812; GFX9-LABEL: global_atomic_udec_wrap_i64_noret_offset: 9813; GFX9: ; %bb.0: 9814; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9815; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v[2:3], off offset:32 9816; GFX9-NEXT: s_waitcnt vmcnt(0) 9817; GFX9-NEXT: buffer_wbinvl1_vol 9818; GFX9-NEXT: s_setpc_b64 s[30:31] 9819 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 9820 %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 %in seq_cst 9821 ret void 9822} 9823 9824define i64 @global_atomic_udec_wrap_i64_ret(ptr addrspace(1) %ptr, i64 %in) { 9825; SI-LABEL: global_atomic_udec_wrap_i64_ret: 9826; SI: ; %bb.0: 9827; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9828; SI-NEXT: s_mov_b32 s6, 0 9829; SI-NEXT: s_mov_b32 s7, 0xf000 9830; SI-NEXT: s_mov_b32 s4, s6 9831; SI-NEXT: s_mov_b32 s5, s6 9832; SI-NEXT: buffer_atomic_dec_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc 9833; SI-NEXT: s_waitcnt vmcnt(0) 9834; SI-NEXT: buffer_wbinvl1 9835; SI-NEXT: v_mov_b32_e32 v0, v2 9836; SI-NEXT: v_mov_b32_e32 v1, v3 9837; SI-NEXT: s_waitcnt expcnt(0) 9838; SI-NEXT: s_setpc_b64 s[30:31] 9839; 9840; VI-LABEL: global_atomic_udec_wrap_i64_ret: 9841; VI: ; %bb.0: 9842; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9843; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc 9844; VI-NEXT: s_waitcnt vmcnt(0) 9845; VI-NEXT: buffer_wbinvl1_vol 9846; VI-NEXT: s_setpc_b64 s[30:31] 9847; 9848; GFX9-LABEL: global_atomic_udec_wrap_i64_ret: 9849; GFX9: ; %bb.0: 9850; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9851; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v[0:1], v[2:3], off glc 9852; GFX9-NEXT: s_waitcnt vmcnt(0) 9853; GFX9-NEXT: buffer_wbinvl1_vol 9854; GFX9-NEXT: s_setpc_b64 s[30:31] 9855 %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 %in seq_cst 9856 ret i64 %result 9857} 9858 9859define i64 @global_atomic_udec_wrap_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { 9860; SI-LABEL: global_atomic_udec_wrap_i64_ret_offset: 9861; SI: ; %bb.0: 9862; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9863; SI-NEXT: s_mov_b32 s6, 0 9864; SI-NEXT: s_mov_b32 s7, 0xf000 9865; SI-NEXT: s_mov_b32 s4, s6 9866; SI-NEXT: s_mov_b32 s5, s6 9867; SI-NEXT: buffer_atomic_dec_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc 9868; SI-NEXT: s_waitcnt vmcnt(0) 9869; SI-NEXT: buffer_wbinvl1 9870; SI-NEXT: v_mov_b32_e32 v0, v2 9871; SI-NEXT: v_mov_b32_e32 v1, v3 9872; SI-NEXT: s_waitcnt expcnt(0) 9873; SI-NEXT: s_setpc_b64 s[30:31] 9874; 9875; VI-LABEL: global_atomic_udec_wrap_i64_ret_offset: 9876; VI: ; %bb.0: 9877; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9878; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 9879; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 9880; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc 9881; VI-NEXT: s_waitcnt vmcnt(0) 9882; VI-NEXT: buffer_wbinvl1_vol 9883; VI-NEXT: s_setpc_b64 s[30:31] 9884; 9885; GFX9-LABEL: global_atomic_udec_wrap_i64_ret_offset: 9886; GFX9: ; %bb.0: 9887; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9888; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc 9889; GFX9-NEXT: s_waitcnt vmcnt(0) 9890; GFX9-NEXT: buffer_wbinvl1_vol 9891; GFX9-NEXT: s_setpc_b64 s[30:31] 9892 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 9893 %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 %in seq_cst 9894 ret i64 %result 9895} 9896 9897define amdgpu_gfx void @global_atomic_udec_wrap_i64_noret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) { 9898; SI-LABEL: global_atomic_udec_wrap_i64_noret_scalar: 9899; SI: ; %bb.0: 9900; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9901; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 9902; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill 9903; SI-NEXT: s_mov_b64 exec, s[34:35] 9904; SI-NEXT: s_waitcnt expcnt(0) 9905; SI-NEXT: v_writelane_b32 v2, s6, 0 9906; SI-NEXT: v_writelane_b32 v2, s7, 1 9907; SI-NEXT: s_mov_b32 s34, s7 9908; SI-NEXT: s_mov_b32 s35, s6 9909; SI-NEXT: s_mov_b32 s7, 0xf000 9910; SI-NEXT: s_mov_b32 s6, -1 9911; SI-NEXT: v_mov_b32_e32 v0, s35 9912; SI-NEXT: v_mov_b32_e32 v1, s34 9913; SI-NEXT: s_waitcnt vmcnt(0) 9914; SI-NEXT: buffer_atomic_dec_x2 v[0:1], off, s[4:7], 0 9915; SI-NEXT: s_waitcnt vmcnt(0) 9916; SI-NEXT: buffer_wbinvl1 9917; SI-NEXT: v_readlane_b32 s7, v2, 1 9918; SI-NEXT: v_readlane_b32 s6, v2, 0 9919; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 9920; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload 9921; SI-NEXT: s_mov_b64 exec, s[34:35] 9922; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 9923; SI-NEXT: s_setpc_b64 s[30:31] 9924; 9925; VI-LABEL: global_atomic_udec_wrap_i64_noret_scalar: 9926; VI: ; %bb.0: 9927; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9928; VI-NEXT: v_mov_b32_e32 v0, s6 9929; VI-NEXT: v_mov_b32_e32 v1, s7 9930; VI-NEXT: v_mov_b32_e32 v2, s4 9931; VI-NEXT: v_mov_b32_e32 v3, s5 9932; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] 9933; VI-NEXT: s_waitcnt vmcnt(0) 9934; VI-NEXT: buffer_wbinvl1_vol 9935; VI-NEXT: s_setpc_b64 s[30:31] 9936; 9937; GFX9-LABEL: global_atomic_udec_wrap_i64_noret_scalar: 9938; GFX9: ; %bb.0: 9939; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9940; GFX9-NEXT: v_mov_b32_e32 v0, s6 9941; GFX9-NEXT: v_mov_b32_e32 v1, s7 9942; GFX9-NEXT: v_mov_b32_e32 v2, 0 9943; GFX9-NEXT: global_atomic_dec_x2 v2, v[0:1], s[4:5] 9944; GFX9-NEXT: s_waitcnt vmcnt(0) 9945; GFX9-NEXT: buffer_wbinvl1_vol 9946; GFX9-NEXT: s_setpc_b64 s[30:31] 9947 %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 %in seq_cst 9948 ret void 9949} 9950 9951define amdgpu_gfx void @global_atomic_udec_wrap_i64_noret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) { 9952; SI-LABEL: global_atomic_udec_wrap_i64_noret_offset_scalar: 9953; SI: ; %bb.0: 9954; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9955; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 9956; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill 9957; SI-NEXT: s_mov_b64 exec, s[34:35] 9958; SI-NEXT: s_waitcnt expcnt(0) 9959; SI-NEXT: v_writelane_b32 v2, s6, 0 9960; SI-NEXT: v_writelane_b32 v2, s7, 1 9961; SI-NEXT: v_mov_b32_e32 v0, s6 9962; SI-NEXT: v_mov_b32_e32 v1, s7 9963; SI-NEXT: s_mov_b32 s7, 0xf000 9964; SI-NEXT: s_mov_b32 s6, -1 9965; SI-NEXT: s_waitcnt vmcnt(0) 9966; SI-NEXT: buffer_atomic_dec_x2 v[0:1], off, s[4:7], 0 offset:32 9967; SI-NEXT: s_waitcnt vmcnt(0) 9968; SI-NEXT: buffer_wbinvl1 9969; SI-NEXT: v_readlane_b32 s7, v2, 1 9970; SI-NEXT: v_readlane_b32 s6, v2, 0 9971; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 9972; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload 9973; SI-NEXT: s_mov_b64 exec, s[34:35] 9974; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 9975; SI-NEXT: s_setpc_b64 s[30:31] 9976; 9977; VI-LABEL: global_atomic_udec_wrap_i64_noret_offset_scalar: 9978; VI: ; %bb.0: 9979; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9980; VI-NEXT: s_add_u32 s34, s4, 32 9981; VI-NEXT: s_addc_u32 s35, s5, 0 9982; VI-NEXT: v_mov_b32_e32 v2, s34 9983; VI-NEXT: v_mov_b32_e32 v0, s6 9984; VI-NEXT: v_mov_b32_e32 v1, s7 9985; VI-NEXT: v_mov_b32_e32 v3, s35 9986; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] 9987; VI-NEXT: s_waitcnt vmcnt(0) 9988; VI-NEXT: buffer_wbinvl1_vol 9989; VI-NEXT: s_setpc_b64 s[30:31] 9990; 9991; GFX9-LABEL: global_atomic_udec_wrap_i64_noret_offset_scalar: 9992; GFX9: ; %bb.0: 9993; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9994; GFX9-NEXT: v_mov_b32_e32 v0, s6 9995; GFX9-NEXT: v_mov_b32_e32 v1, s7 9996; GFX9-NEXT: v_mov_b32_e32 v2, 0 9997; GFX9-NEXT: global_atomic_dec_x2 v2, v[0:1], s[4:5] offset:32 9998; GFX9-NEXT: s_waitcnt vmcnt(0) 9999; GFX9-NEXT: buffer_wbinvl1_vol 10000; GFX9-NEXT: s_setpc_b64 s[30:31] 10001 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 10002 %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 %in seq_cst 10003 ret void 10004} 10005 10006define amdgpu_gfx i64 @global_atomic_udec_wrap_i64_ret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) { 10007; SI-LABEL: global_atomic_udec_wrap_i64_ret_scalar: 10008; SI: ; %bb.0: 10009; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10010; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 10011; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill 10012; SI-NEXT: s_mov_b64 exec, s[34:35] 10013; SI-NEXT: s_waitcnt expcnt(0) 10014; SI-NEXT: v_writelane_b32 v2, s6, 0 10015; SI-NEXT: v_writelane_b32 v2, s7, 1 10016; SI-NEXT: s_mov_b32 s34, s7 10017; SI-NEXT: s_mov_b32 s35, s6 10018; SI-NEXT: s_mov_b32 s7, 0xf000 10019; SI-NEXT: s_mov_b32 s6, -1 10020; SI-NEXT: v_mov_b32_e32 v0, s35 10021; SI-NEXT: v_mov_b32_e32 v1, s34 10022; SI-NEXT: s_waitcnt vmcnt(0) 10023; SI-NEXT: buffer_atomic_dec_x2 v[0:1], off, s[4:7], 0 glc 10024; SI-NEXT: s_waitcnt vmcnt(0) 10025; SI-NEXT: buffer_wbinvl1 10026; SI-NEXT: v_readlane_b32 s7, v2, 1 10027; SI-NEXT: v_readlane_b32 s6, v2, 0 10028; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 10029; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload 10030; SI-NEXT: s_mov_b64 exec, s[34:35] 10031; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 10032; SI-NEXT: s_setpc_b64 s[30:31] 10033; 10034; VI-LABEL: global_atomic_udec_wrap_i64_ret_scalar: 10035; VI: ; %bb.0: 10036; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10037; VI-NEXT: v_mov_b32_e32 v0, s6 10038; VI-NEXT: v_mov_b32_e32 v1, s7 10039; VI-NEXT: v_mov_b32_e32 v2, s4 10040; VI-NEXT: v_mov_b32_e32 v3, s5 10041; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc 10042; VI-NEXT: s_waitcnt vmcnt(0) 10043; VI-NEXT: buffer_wbinvl1_vol 10044; VI-NEXT: s_setpc_b64 s[30:31] 10045; 10046; GFX9-LABEL: global_atomic_udec_wrap_i64_ret_scalar: 10047; GFX9: ; %bb.0: 10048; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10049; GFX9-NEXT: v_mov_b32_e32 v0, s6 10050; GFX9-NEXT: v_mov_b32_e32 v1, s7 10051; GFX9-NEXT: v_mov_b32_e32 v2, 0 10052; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v2, v[0:1], s[4:5] glc 10053; GFX9-NEXT: s_waitcnt vmcnt(0) 10054; GFX9-NEXT: buffer_wbinvl1_vol 10055; GFX9-NEXT: s_setpc_b64 s[30:31] 10056 %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 %in seq_cst 10057 ret i64 %result 10058} 10059 10060define amdgpu_gfx i64 @global_atomic_udec_wrap_i64_ret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) { 10061; SI-LABEL: global_atomic_udec_wrap_i64_ret_offset_scalar: 10062; SI: ; %bb.0: 10063; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10064; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 10065; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill 10066; SI-NEXT: s_mov_b64 exec, s[34:35] 10067; SI-NEXT: s_waitcnt expcnt(0) 10068; SI-NEXT: v_writelane_b32 v2, s6, 0 10069; SI-NEXT: v_writelane_b32 v2, s7, 1 10070; SI-NEXT: v_mov_b32_e32 v0, s6 10071; SI-NEXT: v_mov_b32_e32 v1, s7 10072; SI-NEXT: s_mov_b32 s7, 0xf000 10073; SI-NEXT: s_mov_b32 s6, -1 10074; SI-NEXT: s_waitcnt vmcnt(0) 10075; SI-NEXT: buffer_atomic_dec_x2 v[0:1], off, s[4:7], 0 offset:32 glc 10076; SI-NEXT: s_waitcnt vmcnt(0) 10077; SI-NEXT: buffer_wbinvl1 10078; SI-NEXT: v_readlane_b32 s7, v2, 1 10079; SI-NEXT: v_readlane_b32 s6, v2, 0 10080; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 10081; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload 10082; SI-NEXT: s_mov_b64 exec, s[34:35] 10083; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 10084; SI-NEXT: s_setpc_b64 s[30:31] 10085; 10086; VI-LABEL: global_atomic_udec_wrap_i64_ret_offset_scalar: 10087; VI: ; %bb.0: 10088; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10089; VI-NEXT: s_add_u32 s34, s4, 32 10090; VI-NEXT: s_addc_u32 s35, s5, 0 10091; VI-NEXT: v_mov_b32_e32 v2, s34 10092; VI-NEXT: v_mov_b32_e32 v0, s6 10093; VI-NEXT: v_mov_b32_e32 v1, s7 10094; VI-NEXT: v_mov_b32_e32 v3, s35 10095; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc 10096; VI-NEXT: s_waitcnt vmcnt(0) 10097; VI-NEXT: buffer_wbinvl1_vol 10098; VI-NEXT: s_setpc_b64 s[30:31] 10099; 10100; GFX9-LABEL: global_atomic_udec_wrap_i64_ret_offset_scalar: 10101; GFX9: ; %bb.0: 10102; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10103; GFX9-NEXT: v_mov_b32_e32 v0, s6 10104; GFX9-NEXT: v_mov_b32_e32 v1, s7 10105; GFX9-NEXT: v_mov_b32_e32 v2, 0 10106; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc 10107; GFX9-NEXT: s_waitcnt vmcnt(0) 10108; GFX9-NEXT: buffer_wbinvl1_vol 10109; GFX9-NEXT: s_setpc_b64 s[30:31] 10110 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 10111 %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 %in seq_cst 10112 ret i64 %result 10113} 10114 10115define void @global_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) { 10116; SI-LABEL: global_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory: 10117; SI: ; %bb.0: 10118; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10119; SI-NEXT: s_mov_b32 s6, 0 10120; SI-NEXT: s_mov_b32 s7, 0xf000 10121; SI-NEXT: s_mov_b32 s4, s6 10122; SI-NEXT: s_mov_b32 s5, s6 10123; SI-NEXT: buffer_atomic_dec_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 10124; SI-NEXT: s_waitcnt vmcnt(0) 10125; SI-NEXT: buffer_wbinvl1 10126; SI-NEXT: s_waitcnt expcnt(0) 10127; SI-NEXT: s_setpc_b64 s[30:31] 10128; 10129; VI-LABEL: global_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory: 10130; VI: ; %bb.0: 10131; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10132; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 10133; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 10134; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] 10135; VI-NEXT: s_waitcnt vmcnt(0) 10136; VI-NEXT: buffer_wbinvl1_vol 10137; VI-NEXT: s_setpc_b64 s[30:31] 10138; 10139; GFX9-LABEL: global_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory: 10140; GFX9: ; %bb.0: 10141; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10142; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v[2:3], off offset:32 10143; GFX9-NEXT: s_waitcnt vmcnt(0) 10144; GFX9-NEXT: buffer_wbinvl1_vol 10145; GFX9-NEXT: s_setpc_b64 s[30:31] 10146 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 10147 %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 10148 ret void 10149} 10150 10151define i64 @global_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) { 10152; SI-LABEL: global_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory: 10153; SI: ; %bb.0: 10154; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10155; SI-NEXT: s_mov_b32 s6, 0 10156; SI-NEXT: s_mov_b32 s7, 0xf000 10157; SI-NEXT: s_mov_b32 s4, s6 10158; SI-NEXT: s_mov_b32 s5, s6 10159; SI-NEXT: buffer_atomic_dec_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc 10160; SI-NEXT: s_waitcnt vmcnt(0) 10161; SI-NEXT: buffer_wbinvl1 10162; SI-NEXT: v_mov_b32_e32 v0, v2 10163; SI-NEXT: v_mov_b32_e32 v1, v3 10164; SI-NEXT: s_waitcnt expcnt(0) 10165; SI-NEXT: s_setpc_b64 s[30:31] 10166; 10167; VI-LABEL: global_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory: 10168; VI: ; %bb.0: 10169; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10170; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 10171; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 10172; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc 10173; VI-NEXT: s_waitcnt vmcnt(0) 10174; VI-NEXT: buffer_wbinvl1_vol 10175; VI-NEXT: s_setpc_b64 s[30:31] 10176; 10177; GFX9-LABEL: global_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory: 10178; GFX9: ; %bb.0: 10179; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10180; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc 10181; GFX9-NEXT: s_waitcnt vmcnt(0) 10182; GFX9-NEXT: buffer_wbinvl1_vol 10183; GFX9-NEXT: s_setpc_b64 s[30:31] 10184 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 10185 %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 10186 ret i64 %result 10187} 10188 10189!0 = !{} 10190