1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefixes=GCN1 %s 3; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN2 %s 4; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN3 %s 5 6; --------------------------------------------------------------------- 7; atomicrmw xchg 8; --------------------------------------------------------------------- 9 10define void @flat_atomic_xchg_i32_noret(ptr %ptr, i32 %in) { 11; GCN1-LABEL: flat_atomic_xchg_i32_noret: 12; GCN1: ; %bb.0: 13; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14; GCN1-NEXT: flat_atomic_swap v[0:1], v2 15; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16; GCN1-NEXT: buffer_wbinvl1_vol 17; GCN1-NEXT: s_setpc_b64 s[30:31] 18; 19; GCN2-LABEL: flat_atomic_xchg_i32_noret: 20; GCN2: ; %bb.0: 21; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 22; GCN2-NEXT: flat_atomic_swap v[0:1], v2 23; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 24; GCN2-NEXT: buffer_wbinvl1_vol 25; GCN2-NEXT: s_setpc_b64 s[30:31] 26; 27; GCN3-LABEL: flat_atomic_xchg_i32_noret: 28; GCN3: ; %bb.0: 29; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 30; GCN3-NEXT: flat_atomic_swap v[0:1], v2 31; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 32; GCN3-NEXT: buffer_wbinvl1_vol 33; GCN3-NEXT: s_setpc_b64 s[30:31] 34 %tmp0 = atomicrmw xchg ptr %ptr, i32 %in seq_cst 35 ret void 36} 37 38define void @flat_atomic_xchg_i32_noret_offset(ptr %out, i32 %in) { 39; GCN1-LABEL: flat_atomic_xchg_i32_noret_offset: 40; GCN1: ; %bb.0: 41; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 42; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 43; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 44; GCN1-NEXT: flat_atomic_swap v[0:1], v2 45; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 46; GCN1-NEXT: buffer_wbinvl1_vol 47; GCN1-NEXT: s_setpc_b64 s[30:31] 48; 49; GCN2-LABEL: flat_atomic_xchg_i32_noret_offset: 50; GCN2: ; %bb.0: 51; GCN2-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 52; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 53; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 54; GCN2-NEXT: flat_atomic_swap v[0:1], v2 55; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 56; GCN2-NEXT: buffer_wbinvl1_vol 57; GCN2-NEXT: s_setpc_b64 s[30:31] 58; 59; GCN3-LABEL: flat_atomic_xchg_i32_noret_offset: 60; GCN3: ; %bb.0: 61; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 62; GCN3-NEXT: flat_atomic_swap v[0:1], v2 offset:16 63; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 64; GCN3-NEXT: buffer_wbinvl1_vol 65; GCN3-NEXT: s_setpc_b64 s[30:31] 66 %gep = getelementptr i32, ptr %out, i32 4 67 %tmp0 = atomicrmw xchg ptr %gep, i32 %in seq_cst 68 ret void 69} 70 71define i32 @flat_atomic_xchg_i32_ret(ptr %ptr, i32 %in) { 72; GCN1-LABEL: flat_atomic_xchg_i32_ret: 73; GCN1: ; %bb.0: 74; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 75; GCN1-NEXT: flat_atomic_swap v0, v[0:1], v2 glc 76; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 77; GCN1-NEXT: buffer_wbinvl1_vol 78; GCN1-NEXT: s_setpc_b64 s[30:31] 79; 80; GCN2-LABEL: flat_atomic_xchg_i32_ret: 81; GCN2: ; %bb.0: 82; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 83; GCN2-NEXT: flat_atomic_swap v0, v[0:1], v2 glc 84; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 85; GCN2-NEXT: buffer_wbinvl1_vol 86; GCN2-NEXT: s_setpc_b64 s[30:31] 87; 88; GCN3-LABEL: flat_atomic_xchg_i32_ret: 89; GCN3: ; %bb.0: 90; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 91; GCN3-NEXT: flat_atomic_swap v0, v[0:1], v2 glc 92; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 93; GCN3-NEXT: buffer_wbinvl1_vol 94; GCN3-NEXT: s_setpc_b64 s[30:31] 95 %result = atomicrmw xchg ptr %ptr, i32 %in seq_cst 96 ret i32 %result 97} 98 99define i32 @flat_atomic_xchg_i32_ret_offset(ptr %out, i32 %in) { 100; GCN1-LABEL: flat_atomic_xchg_i32_ret_offset: 101; GCN1: ; %bb.0: 102; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 103; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 104; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 105; 
GCN1-NEXT: flat_atomic_swap v0, v[0:1], v2 glc 106; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 107; GCN1-NEXT: buffer_wbinvl1_vol 108; GCN1-NEXT: s_setpc_b64 s[30:31] 109; 110; GCN2-LABEL: flat_atomic_xchg_i32_ret_offset: 111; GCN2: ; %bb.0: 112; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 113; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 114; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 115; GCN2-NEXT: flat_atomic_swap v0, v[0:1], v2 glc 116; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 117; GCN2-NEXT: buffer_wbinvl1_vol 118; GCN2-NEXT: s_setpc_b64 s[30:31] 119; 120; GCN3-LABEL: flat_atomic_xchg_i32_ret_offset: 121; GCN3: ; %bb.0: 122; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 123; GCN3-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:16 glc 124; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 125; GCN3-NEXT: buffer_wbinvl1_vol 126; GCN3-NEXT: s_setpc_b64 s[30:31] 127 %gep = getelementptr i32, ptr %out, i32 4 128 %result = atomicrmw xchg ptr %gep, i32 %in seq_cst 129 ret i32 %result 130} 131 132define amdgpu_gfx void @flat_atomic_xchg_i32_noret_scalar(ptr inreg %ptr, i32 inreg %in) { 133; GCN1-LABEL: flat_atomic_xchg_i32_noret_scalar: 134; GCN1: ; %bb.0: 135; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 136; GCN1-NEXT: v_mov_b32_e32 v0, s4 137; GCN1-NEXT: v_mov_b32_e32 v1, s5 138; GCN1-NEXT: v_mov_b32_e32 v2, s6 139; GCN1-NEXT: flat_atomic_swap v[0:1], v2 140; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 141; GCN1-NEXT: buffer_wbinvl1_vol 142; GCN1-NEXT: s_setpc_b64 s[30:31] 143; 144; GCN2-LABEL: flat_atomic_xchg_i32_noret_scalar: 145; GCN2: ; %bb.0: 146; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 147; GCN2-NEXT: v_mov_b32_e32 v0, s4 148; GCN2-NEXT: v_mov_b32_e32 v1, s5 149; GCN2-NEXT: v_mov_b32_e32 v2, s6 150; GCN2-NEXT: flat_atomic_swap v[0:1], v2 151; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 152; GCN2-NEXT: buffer_wbinvl1_vol 153; GCN2-NEXT: s_setpc_b64 s[30:31] 154; 155; GCN3-LABEL: flat_atomic_xchg_i32_noret_scalar: 156; GCN3: ; %bb.0: 157; GCN3-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 158; GCN3-NEXT: v_mov_b32_e32 v0, s4 159; GCN3-NEXT: v_mov_b32_e32 v1, s5 160; GCN3-NEXT: v_mov_b32_e32 v2, s6 161; GCN3-NEXT: flat_atomic_swap v[0:1], v2 162; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 163; GCN3-NEXT: buffer_wbinvl1_vol 164; GCN3-NEXT: s_setpc_b64 s[30:31] 165 %tmp0 = atomicrmw xchg ptr %ptr, i32 %in seq_cst 166 ret void 167} 168 169define amdgpu_gfx void @flat_atomic_xchg_i32_noret_offset_scalar(ptr inreg %out, i32 inreg %in) { 170; GCN1-LABEL: flat_atomic_xchg_i32_noret_offset_scalar: 171; GCN1: ; %bb.0: 172; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 173; GCN1-NEXT: s_add_u32 s34, s4, 16 174; GCN1-NEXT: s_addc_u32 s35, s5, 0 175; GCN1-NEXT: v_mov_b32_e32 v0, s34 176; GCN1-NEXT: v_mov_b32_e32 v1, s35 177; GCN1-NEXT: v_mov_b32_e32 v2, s6 178; GCN1-NEXT: flat_atomic_swap v[0:1], v2 179; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 180; GCN1-NEXT: buffer_wbinvl1_vol 181; GCN1-NEXT: s_setpc_b64 s[30:31] 182; 183; GCN2-LABEL: flat_atomic_xchg_i32_noret_offset_scalar: 184; GCN2: ; %bb.0: 185; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 186; GCN2-NEXT: s_add_u32 s34, s4, 16 187; GCN2-NEXT: s_addc_u32 s35, s5, 0 188; GCN2-NEXT: v_mov_b32_e32 v0, s34 189; GCN2-NEXT: v_mov_b32_e32 v1, s35 190; GCN2-NEXT: v_mov_b32_e32 v2, s6 191; GCN2-NEXT: flat_atomic_swap v[0:1], v2 192; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 193; GCN2-NEXT: buffer_wbinvl1_vol 194; GCN2-NEXT: s_setpc_b64 s[30:31] 195; 196; GCN3-LABEL: flat_atomic_xchg_i32_noret_offset_scalar: 197; GCN3: ; %bb.0: 198; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 199; GCN3-NEXT: v_mov_b32_e32 v0, s4 200; GCN3-NEXT: v_mov_b32_e32 v1, s5 201; GCN3-NEXT: v_mov_b32_e32 v2, s6 202; GCN3-NEXT: flat_atomic_swap v[0:1], v2 offset:16 203; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 204; GCN3-NEXT: buffer_wbinvl1_vol 205; GCN3-NEXT: s_setpc_b64 s[30:31] 206 %gep = getelementptr i32, ptr %out, i32 4 207 %tmp0 = atomicrmw xchg ptr %gep, i32 %in seq_cst 208 ret 
void 209} 210 211define amdgpu_gfx i32 @flat_atomic_xchg_i32_ret_scalar(ptr inreg %ptr, i32 inreg %in) { 212; GCN1-LABEL: flat_atomic_xchg_i32_ret_scalar: 213; GCN1: ; %bb.0: 214; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 215; GCN1-NEXT: v_mov_b32_e32 v0, s4 216; GCN1-NEXT: v_mov_b32_e32 v1, s5 217; GCN1-NEXT: v_mov_b32_e32 v2, s6 218; GCN1-NEXT: flat_atomic_swap v0, v[0:1], v2 glc 219; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 220; GCN1-NEXT: buffer_wbinvl1_vol 221; GCN1-NEXT: s_setpc_b64 s[30:31] 222; 223; GCN2-LABEL: flat_atomic_xchg_i32_ret_scalar: 224; GCN2: ; %bb.0: 225; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 226; GCN2-NEXT: v_mov_b32_e32 v0, s4 227; GCN2-NEXT: v_mov_b32_e32 v1, s5 228; GCN2-NEXT: v_mov_b32_e32 v2, s6 229; GCN2-NEXT: flat_atomic_swap v0, v[0:1], v2 glc 230; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 231; GCN2-NEXT: buffer_wbinvl1_vol 232; GCN2-NEXT: s_setpc_b64 s[30:31] 233; 234; GCN3-LABEL: flat_atomic_xchg_i32_ret_scalar: 235; GCN3: ; %bb.0: 236; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 237; GCN3-NEXT: v_mov_b32_e32 v0, s4 238; GCN3-NEXT: v_mov_b32_e32 v1, s5 239; GCN3-NEXT: v_mov_b32_e32 v2, s6 240; GCN3-NEXT: flat_atomic_swap v0, v[0:1], v2 glc 241; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 242; GCN3-NEXT: buffer_wbinvl1_vol 243; GCN3-NEXT: s_setpc_b64 s[30:31] 244 %result = atomicrmw xchg ptr %ptr, i32 %in seq_cst 245 ret i32 %result 246} 247 248define amdgpu_gfx i32 @flat_atomic_xchg_i32_ret_offset_scalar(ptr inreg %out, i32 inreg %in) { 249; GCN1-LABEL: flat_atomic_xchg_i32_ret_offset_scalar: 250; GCN1: ; %bb.0: 251; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 252; GCN1-NEXT: s_add_u32 s34, s4, 16 253; GCN1-NEXT: s_addc_u32 s35, s5, 0 254; GCN1-NEXT: v_mov_b32_e32 v0, s34 255; GCN1-NEXT: v_mov_b32_e32 v1, s35 256; GCN1-NEXT: v_mov_b32_e32 v2, s6 257; GCN1-NEXT: flat_atomic_swap v0, v[0:1], v2 glc 258; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 259; GCN1-NEXT: buffer_wbinvl1_vol 260; GCN1-NEXT: 
s_setpc_b64 s[30:31] 261; 262; GCN2-LABEL: flat_atomic_xchg_i32_ret_offset_scalar: 263; GCN2: ; %bb.0: 264; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 265; GCN2-NEXT: s_add_u32 s34, s4, 16 266; GCN2-NEXT: s_addc_u32 s35, s5, 0 267; GCN2-NEXT: v_mov_b32_e32 v0, s34 268; GCN2-NEXT: v_mov_b32_e32 v1, s35 269; GCN2-NEXT: v_mov_b32_e32 v2, s6 270; GCN2-NEXT: flat_atomic_swap v0, v[0:1], v2 glc 271; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 272; GCN2-NEXT: buffer_wbinvl1_vol 273; GCN2-NEXT: s_setpc_b64 s[30:31] 274; 275; GCN3-LABEL: flat_atomic_xchg_i32_ret_offset_scalar: 276; GCN3: ; %bb.0: 277; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 278; GCN3-NEXT: v_mov_b32_e32 v0, s4 279; GCN3-NEXT: v_mov_b32_e32 v1, s5 280; GCN3-NEXT: v_mov_b32_e32 v2, s6 281; GCN3-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:16 glc 282; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 283; GCN3-NEXT: buffer_wbinvl1_vol 284; GCN3-NEXT: s_setpc_b64 s[30:31] 285 %gep = getelementptr i32, ptr %out, i32 4 286 %result = atomicrmw xchg ptr %gep, i32 %in seq_cst 287 ret i32 %result 288} 289 290define void @flat_atomic_xchg_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) { 291; GCN1-LABEL: flat_atomic_xchg_i32_noret_offset__amdgpu_no_remote_memory: 292; GCN1: ; %bb.0: 293; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 294; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 295; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 296; GCN1-NEXT: flat_atomic_swap v[0:1], v2 297; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 298; GCN1-NEXT: buffer_wbinvl1_vol 299; GCN1-NEXT: s_setpc_b64 s[30:31] 300; 301; GCN2-LABEL: flat_atomic_xchg_i32_noret_offset__amdgpu_no_remote_memory: 302; GCN2: ; %bb.0: 303; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 304; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 305; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 306; GCN2-NEXT: flat_atomic_swap v[0:1], v2 307; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 308; GCN2-NEXT: buffer_wbinvl1_vol 309; GCN2-NEXT: s_setpc_b64 
s[30:31] 310; 311; GCN3-LABEL: flat_atomic_xchg_i32_noret_offset__amdgpu_no_remote_memory: 312; GCN3: ; %bb.0: 313; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 314; GCN3-NEXT: flat_atomic_swap v[0:1], v2 offset:16 315; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 316; GCN3-NEXT: buffer_wbinvl1_vol 317; GCN3-NEXT: s_setpc_b64 s[30:31] 318 %gep = getelementptr i32, ptr %out, i64 4 319 %tmp0 = atomicrmw xchg ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 320 ret void 321} 322 323define i32 @flat_atomic_xchg_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) { 324; GCN1-LABEL: flat_atomic_xchg_i32_ret_offset__amdgpu_no_remote_memory: 325; GCN1: ; %bb.0: 326; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 327; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 328; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 329; GCN1-NEXT: flat_atomic_swap v0, v[0:1], v2 glc 330; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 331; GCN1-NEXT: buffer_wbinvl1_vol 332; GCN1-NEXT: s_setpc_b64 s[30:31] 333; 334; GCN2-LABEL: flat_atomic_xchg_i32_ret_offset__amdgpu_no_remote_memory: 335; GCN2: ; %bb.0: 336; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 337; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 338; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 339; GCN2-NEXT: flat_atomic_swap v0, v[0:1], v2 glc 340; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 341; GCN2-NEXT: buffer_wbinvl1_vol 342; GCN2-NEXT: s_setpc_b64 s[30:31] 343; 344; GCN3-LABEL: flat_atomic_xchg_i32_ret_offset__amdgpu_no_remote_memory: 345; GCN3: ; %bb.0: 346; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 347; GCN3-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:16 glc 348; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 349; GCN3-NEXT: buffer_wbinvl1_vol 350; GCN3-NEXT: s_setpc_b64 s[30:31] 351 %gep = getelementptr i32, ptr %out, i64 4 352 %result = atomicrmw xchg ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 353 ret i32 %result 354} 355 356; --------------------------------------------------------------------- 357; 
atomicrmw xchg f32 358; --------------------------------------------------------------------- 359 360define void @flat_atomic_xchg_f32_noret(ptr %ptr, float %in) { 361; GCN1-LABEL: flat_atomic_xchg_f32_noret: 362; GCN1: ; %bb.0: 363; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 364; GCN1-NEXT: flat_atomic_swap v[0:1], v2 365; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 366; GCN1-NEXT: buffer_wbinvl1_vol 367; GCN1-NEXT: s_setpc_b64 s[30:31] 368; 369; GCN2-LABEL: flat_atomic_xchg_f32_noret: 370; GCN2: ; %bb.0: 371; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 372; GCN2-NEXT: flat_atomic_swap v[0:1], v2 373; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 374; GCN2-NEXT: buffer_wbinvl1_vol 375; GCN2-NEXT: s_setpc_b64 s[30:31] 376; 377; GCN3-LABEL: flat_atomic_xchg_f32_noret: 378; GCN3: ; %bb.0: 379; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 380; GCN3-NEXT: flat_atomic_swap v[0:1], v2 381; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 382; GCN3-NEXT: buffer_wbinvl1_vol 383; GCN3-NEXT: s_setpc_b64 s[30:31] 384 %tmp0 = atomicrmw xchg ptr %ptr, float %in seq_cst 385 ret void 386} 387 388define void @flat_atomic_xchg_f32_noret_offset(ptr %out, float %in) { 389; GCN1-LABEL: flat_atomic_xchg_f32_noret_offset: 390; GCN1: ; %bb.0: 391; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 392; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 393; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 394; GCN1-NEXT: flat_atomic_swap v[0:1], v2 395; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 396; GCN1-NEXT: buffer_wbinvl1_vol 397; GCN1-NEXT: s_setpc_b64 s[30:31] 398; 399; GCN2-LABEL: flat_atomic_xchg_f32_noret_offset: 400; GCN2: ; %bb.0: 401; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 402; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 403; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 404; GCN2-NEXT: flat_atomic_swap v[0:1], v2 405; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 406; GCN2-NEXT: buffer_wbinvl1_vol 407; GCN2-NEXT: s_setpc_b64 s[30:31] 408; 409; GCN3-LABEL: 
flat_atomic_xchg_f32_noret_offset: 410; GCN3: ; %bb.0: 411; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 412; GCN3-NEXT: flat_atomic_swap v[0:1], v2 offset:16 413; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 414; GCN3-NEXT: buffer_wbinvl1_vol 415; GCN3-NEXT: s_setpc_b64 s[30:31] 416 %gep = getelementptr float, ptr %out, i32 4 417 %tmp0 = atomicrmw xchg ptr %gep, float %in seq_cst 418 ret void 419} 420 421define float @flat_atomic_xchg_f32_ret(ptr %ptr, float %in) { 422; GCN1-LABEL: flat_atomic_xchg_f32_ret: 423; GCN1: ; %bb.0: 424; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 425; GCN1-NEXT: flat_atomic_swap v0, v[0:1], v2 glc 426; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 427; GCN1-NEXT: buffer_wbinvl1_vol 428; GCN1-NEXT: s_setpc_b64 s[30:31] 429; 430; GCN2-LABEL: flat_atomic_xchg_f32_ret: 431; GCN2: ; %bb.0: 432; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 433; GCN2-NEXT: flat_atomic_swap v0, v[0:1], v2 glc 434; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 435; GCN2-NEXT: buffer_wbinvl1_vol 436; GCN2-NEXT: s_setpc_b64 s[30:31] 437; 438; GCN3-LABEL: flat_atomic_xchg_f32_ret: 439; GCN3: ; %bb.0: 440; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 441; GCN3-NEXT: flat_atomic_swap v0, v[0:1], v2 glc 442; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 443; GCN3-NEXT: buffer_wbinvl1_vol 444; GCN3-NEXT: s_setpc_b64 s[30:31] 445 %result = atomicrmw xchg ptr %ptr, float %in seq_cst 446 ret float %result 447} 448 449define float @flat_atomic_xchg_f32_ret_offset(ptr %out, float %in) { 450; GCN1-LABEL: flat_atomic_xchg_f32_ret_offset: 451; GCN1: ; %bb.0: 452; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 453; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 454; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 455; GCN1-NEXT: flat_atomic_swap v0, v[0:1], v2 glc 456; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 457; GCN1-NEXT: buffer_wbinvl1_vol 458; GCN1-NEXT: s_setpc_b64 s[30:31] 459; 460; GCN2-LABEL: flat_atomic_xchg_f32_ret_offset: 461; GCN2: ; %bb.0: 462; GCN2-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 463; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 464; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 465; GCN2-NEXT: flat_atomic_swap v0, v[0:1], v2 glc 466; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 467; GCN2-NEXT: buffer_wbinvl1_vol 468; GCN2-NEXT: s_setpc_b64 s[30:31] 469; 470; GCN3-LABEL: flat_atomic_xchg_f32_ret_offset: 471; GCN3: ; %bb.0: 472; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 473; GCN3-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:16 glc 474; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 475; GCN3-NEXT: buffer_wbinvl1_vol 476; GCN3-NEXT: s_setpc_b64 s[30:31] 477 %gep = getelementptr float, ptr %out, i32 4 478 %result = atomicrmw xchg ptr %gep, float %in seq_cst 479 ret float %result 480} 481 482define amdgpu_gfx void @flat_atomic_xchg_f32_noret_scalar(ptr inreg %ptr, float inreg %in) { 483; GCN1-LABEL: flat_atomic_xchg_f32_noret_scalar: 484; GCN1: ; %bb.0: 485; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 486; GCN1-NEXT: v_mov_b32_e32 v0, s4 487; GCN1-NEXT: v_mov_b32_e32 v1, s5 488; GCN1-NEXT: v_mov_b32_e32 v2, s6 489; GCN1-NEXT: flat_atomic_swap v[0:1], v2 490; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 491; GCN1-NEXT: buffer_wbinvl1_vol 492; GCN1-NEXT: s_setpc_b64 s[30:31] 493; 494; GCN2-LABEL: flat_atomic_xchg_f32_noret_scalar: 495; GCN2: ; %bb.0: 496; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 497; GCN2-NEXT: v_mov_b32_e32 v0, s4 498; GCN2-NEXT: v_mov_b32_e32 v1, s5 499; GCN2-NEXT: v_mov_b32_e32 v2, s6 500; GCN2-NEXT: flat_atomic_swap v[0:1], v2 501; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 502; GCN2-NEXT: buffer_wbinvl1_vol 503; GCN2-NEXT: s_setpc_b64 s[30:31] 504; 505; GCN3-LABEL: flat_atomic_xchg_f32_noret_scalar: 506; GCN3: ; %bb.0: 507; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 508; GCN3-NEXT: v_mov_b32_e32 v0, s4 509; GCN3-NEXT: v_mov_b32_e32 v1, s5 510; GCN3-NEXT: v_mov_b32_e32 v2, s6 511; GCN3-NEXT: flat_atomic_swap v[0:1], v2 512; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 513; 
GCN3-NEXT: buffer_wbinvl1_vol 514; GCN3-NEXT: s_setpc_b64 s[30:31] 515 %tmp0 = atomicrmw xchg ptr %ptr, float %in seq_cst 516 ret void 517} 518 519define amdgpu_gfx void @flat_atomic_xchg_f32_noret_offset_scalar(ptr inreg %out, float inreg %in) { 520; GCN1-LABEL: flat_atomic_xchg_f32_noret_offset_scalar: 521; GCN1: ; %bb.0: 522; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 523; GCN1-NEXT: s_add_u32 s34, s4, 16 524; GCN1-NEXT: s_addc_u32 s35, s5, 0 525; GCN1-NEXT: v_mov_b32_e32 v0, s34 526; GCN1-NEXT: v_mov_b32_e32 v1, s35 527; GCN1-NEXT: v_mov_b32_e32 v2, s6 528; GCN1-NEXT: flat_atomic_swap v[0:1], v2 529; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 530; GCN1-NEXT: buffer_wbinvl1_vol 531; GCN1-NEXT: s_setpc_b64 s[30:31] 532; 533; GCN2-LABEL: flat_atomic_xchg_f32_noret_offset_scalar: 534; GCN2: ; %bb.0: 535; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 536; GCN2-NEXT: s_add_u32 s34, s4, 16 537; GCN2-NEXT: s_addc_u32 s35, s5, 0 538; GCN2-NEXT: v_mov_b32_e32 v0, s34 539; GCN2-NEXT: v_mov_b32_e32 v1, s35 540; GCN2-NEXT: v_mov_b32_e32 v2, s6 541; GCN2-NEXT: flat_atomic_swap v[0:1], v2 542; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 543; GCN2-NEXT: buffer_wbinvl1_vol 544; GCN2-NEXT: s_setpc_b64 s[30:31] 545; 546; GCN3-LABEL: flat_atomic_xchg_f32_noret_offset_scalar: 547; GCN3: ; %bb.0: 548; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 549; GCN3-NEXT: v_mov_b32_e32 v0, s4 550; GCN3-NEXT: v_mov_b32_e32 v1, s5 551; GCN3-NEXT: v_mov_b32_e32 v2, s6 552; GCN3-NEXT: flat_atomic_swap v[0:1], v2 offset:16 553; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 554; GCN3-NEXT: buffer_wbinvl1_vol 555; GCN3-NEXT: s_setpc_b64 s[30:31] 556 %gep = getelementptr float, ptr %out, i32 4 557 %tmp0 = atomicrmw xchg ptr %gep, float %in seq_cst 558 ret void 559} 560 561define amdgpu_gfx float @flat_atomic_xchg_f32_ret_scalar(ptr inreg %ptr, float inreg %in) { 562; GCN1-LABEL: flat_atomic_xchg_f32_ret_scalar: 563; GCN1: ; %bb.0: 564; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 565; 
GCN1-NEXT: v_mov_b32_e32 v0, s4 566; GCN1-NEXT: v_mov_b32_e32 v1, s5 567; GCN1-NEXT: v_mov_b32_e32 v2, s6 568; GCN1-NEXT: flat_atomic_swap v0, v[0:1], v2 glc 569; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 570; GCN1-NEXT: buffer_wbinvl1_vol 571; GCN1-NEXT: s_setpc_b64 s[30:31] 572; 573; GCN2-LABEL: flat_atomic_xchg_f32_ret_scalar: 574; GCN2: ; %bb.0: 575; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 576; GCN2-NEXT: v_mov_b32_e32 v0, s4 577; GCN2-NEXT: v_mov_b32_e32 v1, s5 578; GCN2-NEXT: v_mov_b32_e32 v2, s6 579; GCN2-NEXT: flat_atomic_swap v0, v[0:1], v2 glc 580; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 581; GCN2-NEXT: buffer_wbinvl1_vol 582; GCN2-NEXT: s_setpc_b64 s[30:31] 583; 584; GCN3-LABEL: flat_atomic_xchg_f32_ret_scalar: 585; GCN3: ; %bb.0: 586; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 587; GCN3-NEXT: v_mov_b32_e32 v0, s4 588; GCN3-NEXT: v_mov_b32_e32 v1, s5 589; GCN3-NEXT: v_mov_b32_e32 v2, s6 590; GCN3-NEXT: flat_atomic_swap v0, v[0:1], v2 glc 591; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 592; GCN3-NEXT: buffer_wbinvl1_vol 593; GCN3-NEXT: s_setpc_b64 s[30:31] 594 %result = atomicrmw xchg ptr %ptr, float %in seq_cst 595 ret float %result 596} 597 598define amdgpu_gfx float @flat_atomic_xchg_f32_ret_offset_scalar(ptr inreg %out, float inreg %in) { 599; GCN1-LABEL: flat_atomic_xchg_f32_ret_offset_scalar: 600; GCN1: ; %bb.0: 601; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 602; GCN1-NEXT: s_add_u32 s34, s4, 16 603; GCN1-NEXT: s_addc_u32 s35, s5, 0 604; GCN1-NEXT: v_mov_b32_e32 v0, s34 605; GCN1-NEXT: v_mov_b32_e32 v1, s35 606; GCN1-NEXT: v_mov_b32_e32 v2, s6 607; GCN1-NEXT: flat_atomic_swap v0, v[0:1], v2 glc 608; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 609; GCN1-NEXT: buffer_wbinvl1_vol 610; GCN1-NEXT: s_setpc_b64 s[30:31] 611; 612; GCN2-LABEL: flat_atomic_xchg_f32_ret_offset_scalar: 613; GCN2: ; %bb.0: 614; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 615; GCN2-NEXT: s_add_u32 s34, s4, 16 616; GCN2-NEXT: s_addc_u32 s35, s5, 0 
617; GCN2-NEXT: v_mov_b32_e32 v0, s34 618; GCN2-NEXT: v_mov_b32_e32 v1, s35 619; GCN2-NEXT: v_mov_b32_e32 v2, s6 620; GCN2-NEXT: flat_atomic_swap v0, v[0:1], v2 glc 621; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 622; GCN2-NEXT: buffer_wbinvl1_vol 623; GCN2-NEXT: s_setpc_b64 s[30:31] 624; 625; GCN3-LABEL: flat_atomic_xchg_f32_ret_offset_scalar: 626; GCN3: ; %bb.0: 627; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 628; GCN3-NEXT: v_mov_b32_e32 v0, s4 629; GCN3-NEXT: v_mov_b32_e32 v1, s5 630; GCN3-NEXT: v_mov_b32_e32 v2, s6 631; GCN3-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:16 glc 632; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 633; GCN3-NEXT: buffer_wbinvl1_vol 634; GCN3-NEXT: s_setpc_b64 s[30:31] 635 %gep = getelementptr float, ptr %out, i32 4 636 %result = atomicrmw xchg ptr %gep, float %in seq_cst 637 ret float %result 638} 639 640define void @flat_atomic_xchg_f32_noret_offset__amdgpu_no_remote_memory(ptr %out, float %in) { 641; GCN1-LABEL: flat_atomic_xchg_f32_noret_offset__amdgpu_no_remote_memory: 642; GCN1: ; %bb.0: 643; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 644; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 645; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 646; GCN1-NEXT: flat_atomic_swap v[0:1], v2 647; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 648; GCN1-NEXT: buffer_wbinvl1_vol 649; GCN1-NEXT: s_setpc_b64 s[30:31] 650; 651; GCN2-LABEL: flat_atomic_xchg_f32_noret_offset__amdgpu_no_remote_memory: 652; GCN2: ; %bb.0: 653; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 654; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 655; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 656; GCN2-NEXT: flat_atomic_swap v[0:1], v2 657; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 658; GCN2-NEXT: buffer_wbinvl1_vol 659; GCN2-NEXT: s_setpc_b64 s[30:31] 660; 661; GCN3-LABEL: flat_atomic_xchg_f32_noret_offset__amdgpu_no_remote_memory: 662; GCN3: ; %bb.0: 663; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 664; GCN3-NEXT: flat_atomic_swap v[0:1], v2 offset:16 665; 
GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 666; GCN3-NEXT: buffer_wbinvl1_vol 667; GCN3-NEXT: s_setpc_b64 s[30:31] 668 %gep = getelementptr float, ptr %out, i64 4 669 %tmp0 = atomicrmw xchg ptr %gep, float %in seq_cst, !amdgpu.no.remote.memory !0 670 ret void 671} 672 673define float @flat_atomic_xchg_f32_ret_offset__amdgpu_no_remote_memory(ptr %out, float %in) { 674; GCN1-LABEL: flat_atomic_xchg_f32_ret_offset__amdgpu_no_remote_memory: 675; GCN1: ; %bb.0: 676; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 677; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 678; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 679; GCN1-NEXT: flat_atomic_swap v0, v[0:1], v2 glc 680; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 681; GCN1-NEXT: buffer_wbinvl1_vol 682; GCN1-NEXT: s_setpc_b64 s[30:31] 683; 684; GCN2-LABEL: flat_atomic_xchg_f32_ret_offset__amdgpu_no_remote_memory: 685; GCN2: ; %bb.0: 686; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 687; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 688; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 689; GCN2-NEXT: flat_atomic_swap v0, v[0:1], v2 glc 690; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 691; GCN2-NEXT: buffer_wbinvl1_vol 692; GCN2-NEXT: s_setpc_b64 s[30:31] 693; 694; GCN3-LABEL: flat_atomic_xchg_f32_ret_offset__amdgpu_no_remote_memory: 695; GCN3: ; %bb.0: 696; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 697; GCN3-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:16 glc 698; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 699; GCN3-NEXT: buffer_wbinvl1_vol 700; GCN3-NEXT: s_setpc_b64 s[30:31] 701 %gep = getelementptr float, ptr %out, i64 4 702 %result = atomicrmw xchg ptr %gep, float %in seq_cst, !amdgpu.no.remote.memory !0 703 ret float %result 704} 705 706; --------------------------------------------------------------------- 707; atomicrmw add 708; --------------------------------------------------------------------- 709 710define void @flat_atomic_add_i32_noret(ptr %ptr, i32 %in) { 711; GCN1-LABEL: flat_atomic_add_i32_noret: 712; GCN1: ; 
%bb.0: 713; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 714; GCN1-NEXT: flat_atomic_add v[0:1], v2 715; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 716; GCN1-NEXT: buffer_wbinvl1_vol 717; GCN1-NEXT: s_setpc_b64 s[30:31] 718; 719; GCN2-LABEL: flat_atomic_add_i32_noret: 720; GCN2: ; %bb.0: 721; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 722; GCN2-NEXT: flat_atomic_add v[0:1], v2 723; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 724; GCN2-NEXT: buffer_wbinvl1_vol 725; GCN2-NEXT: s_setpc_b64 s[30:31] 726; 727; GCN3-LABEL: flat_atomic_add_i32_noret: 728; GCN3: ; %bb.0: 729; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 730; GCN3-NEXT: flat_atomic_add v[0:1], v2 731; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 732; GCN3-NEXT: buffer_wbinvl1_vol 733; GCN3-NEXT: s_setpc_b64 s[30:31] 734 %tmp0 = atomicrmw add ptr %ptr, i32 %in seq_cst 735 ret void 736} 737 738define void @flat_atomic_add_i32_noret_offset(ptr %out, i32 %in) { 739; GCN1-LABEL: flat_atomic_add_i32_noret_offset: 740; GCN1: ; %bb.0: 741; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 742; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 743; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 744; GCN1-NEXT: flat_atomic_add v[0:1], v2 745; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 746; GCN1-NEXT: buffer_wbinvl1_vol 747; GCN1-NEXT: s_setpc_b64 s[30:31] 748; 749; GCN2-LABEL: flat_atomic_add_i32_noret_offset: 750; GCN2: ; %bb.0: 751; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 752; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 753; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 754; GCN2-NEXT: flat_atomic_add v[0:1], v2 755; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 756; GCN2-NEXT: buffer_wbinvl1_vol 757; GCN2-NEXT: s_setpc_b64 s[30:31] 758; 759; GCN3-LABEL: flat_atomic_add_i32_noret_offset: 760; GCN3: ; %bb.0: 761; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 762; GCN3-NEXT: flat_atomic_add v[0:1], v2 offset:16 763; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 764; GCN3-NEXT: buffer_wbinvl1_vol 765; GCN3-NEXT: 
s_setpc_b64 s[30:31] 766 %gep = getelementptr i32, ptr %out, i32 4 767 %tmp0 = atomicrmw add ptr %gep, i32 %in seq_cst 768 ret void 769} 770 771define i32 @flat_atomic_add_i32_ret(ptr %ptr, i32 %in) { 772; GCN1-LABEL: flat_atomic_add_i32_ret: 773; GCN1: ; %bb.0: 774; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 775; GCN1-NEXT: flat_atomic_add v0, v[0:1], v2 glc 776; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 777; GCN1-NEXT: buffer_wbinvl1_vol 778; GCN1-NEXT: s_setpc_b64 s[30:31] 779; 780; GCN2-LABEL: flat_atomic_add_i32_ret: 781; GCN2: ; %bb.0: 782; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 783; GCN2-NEXT: flat_atomic_add v0, v[0:1], v2 glc 784; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 785; GCN2-NEXT: buffer_wbinvl1_vol 786; GCN2-NEXT: s_setpc_b64 s[30:31] 787; 788; GCN3-LABEL: flat_atomic_add_i32_ret: 789; GCN3: ; %bb.0: 790; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 791; GCN3-NEXT: flat_atomic_add v0, v[0:1], v2 glc 792; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 793; GCN3-NEXT: buffer_wbinvl1_vol 794; GCN3-NEXT: s_setpc_b64 s[30:31] 795 %result = atomicrmw add ptr %ptr, i32 %in seq_cst 796 ret i32 %result 797} 798 799define i32 @flat_atomic_add_i32_ret_offset(ptr %out, i32 %in) { 800; GCN1-LABEL: flat_atomic_add_i32_ret_offset: 801; GCN1: ; %bb.0: 802; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 803; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 804; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 805; GCN1-NEXT: flat_atomic_add v0, v[0:1], v2 glc 806; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 807; GCN1-NEXT: buffer_wbinvl1_vol 808; GCN1-NEXT: s_setpc_b64 s[30:31] 809; 810; GCN2-LABEL: flat_atomic_add_i32_ret_offset: 811; GCN2: ; %bb.0: 812; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 813; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 814; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 815; GCN2-NEXT: flat_atomic_add v0, v[0:1], v2 glc 816; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 817; GCN2-NEXT: buffer_wbinvl1_vol 818; GCN2-NEXT: s_setpc_b64 
s[30:31] 819; 820; GCN3-LABEL: flat_atomic_add_i32_ret_offset: 821; GCN3: ; %bb.0: 822; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 823; GCN3-NEXT: flat_atomic_add v0, v[0:1], v2 offset:16 glc 824; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 825; GCN3-NEXT: buffer_wbinvl1_vol 826; GCN3-NEXT: s_setpc_b64 s[30:31] 827 %gep = getelementptr i32, ptr %out, i32 4 828 %result = atomicrmw add ptr %gep, i32 %in seq_cst 829 ret i32 %result 830} 831 832define amdgpu_gfx void @flat_atomic_add_i32_noret_scalar(ptr inreg %ptr, i32 inreg %in) { 833; GCN1-LABEL: flat_atomic_add_i32_noret_scalar: 834; GCN1: ; %bb.0: 835; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 836; GCN1-NEXT: v_mov_b32_e32 v0, s4 837; GCN1-NEXT: v_mov_b32_e32 v1, s5 838; GCN1-NEXT: v_mov_b32_e32 v2, s6 839; GCN1-NEXT: flat_atomic_add v[0:1], v2 840; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 841; GCN1-NEXT: buffer_wbinvl1_vol 842; GCN1-NEXT: s_setpc_b64 s[30:31] 843; 844; GCN2-LABEL: flat_atomic_add_i32_noret_scalar: 845; GCN2: ; %bb.0: 846; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 847; GCN2-NEXT: v_mov_b32_e32 v0, s4 848; GCN2-NEXT: v_mov_b32_e32 v1, s5 849; GCN2-NEXT: v_mov_b32_e32 v2, s6 850; GCN2-NEXT: flat_atomic_add v[0:1], v2 851; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 852; GCN2-NEXT: buffer_wbinvl1_vol 853; GCN2-NEXT: s_setpc_b64 s[30:31] 854; 855; GCN3-LABEL: flat_atomic_add_i32_noret_scalar: 856; GCN3: ; %bb.0: 857; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 858; GCN3-NEXT: v_mov_b32_e32 v0, s4 859; GCN3-NEXT: v_mov_b32_e32 v1, s5 860; GCN3-NEXT: v_mov_b32_e32 v2, s6 861; GCN3-NEXT: flat_atomic_add v[0:1], v2 862; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 863; GCN3-NEXT: buffer_wbinvl1_vol 864; GCN3-NEXT: s_setpc_b64 s[30:31] 865 %tmp0 = atomicrmw add ptr %ptr, i32 %in seq_cst 866 ret void 867} 868 869define amdgpu_gfx void @flat_atomic_add_i32_noret_offset_scalar(ptr inreg %out, i32 inreg %in) { 870; GCN1-LABEL: flat_atomic_add_i32_noret_offset_scalar: 871; GCN1: ; %bb.0: 
872; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 873; GCN1-NEXT: s_add_u32 s34, s4, 16 874; GCN1-NEXT: s_addc_u32 s35, s5, 0 875; GCN1-NEXT: v_mov_b32_e32 v0, s34 876; GCN1-NEXT: v_mov_b32_e32 v1, s35 877; GCN1-NEXT: v_mov_b32_e32 v2, s6 878; GCN1-NEXT: flat_atomic_add v[0:1], v2 879; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 880; GCN1-NEXT: buffer_wbinvl1_vol 881; GCN1-NEXT: s_setpc_b64 s[30:31] 882; 883; GCN2-LABEL: flat_atomic_add_i32_noret_offset_scalar: 884; GCN2: ; %bb.0: 885; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 886; GCN2-NEXT: s_add_u32 s34, s4, 16 887; GCN2-NEXT: s_addc_u32 s35, s5, 0 888; GCN2-NEXT: v_mov_b32_e32 v0, s34 889; GCN2-NEXT: v_mov_b32_e32 v1, s35 890; GCN2-NEXT: v_mov_b32_e32 v2, s6 891; GCN2-NEXT: flat_atomic_add v[0:1], v2 892; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 893; GCN2-NEXT: buffer_wbinvl1_vol 894; GCN2-NEXT: s_setpc_b64 s[30:31] 895; 896; GCN3-LABEL: flat_atomic_add_i32_noret_offset_scalar: 897; GCN3: ; %bb.0: 898; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 899; GCN3-NEXT: v_mov_b32_e32 v0, s4 900; GCN3-NEXT: v_mov_b32_e32 v1, s5 901; GCN3-NEXT: v_mov_b32_e32 v2, s6 902; GCN3-NEXT: flat_atomic_add v[0:1], v2 offset:16 903; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 904; GCN3-NEXT: buffer_wbinvl1_vol 905; GCN3-NEXT: s_setpc_b64 s[30:31] 906 %gep = getelementptr i32, ptr %out, i32 4 907 %tmp0 = atomicrmw add ptr %gep, i32 %in seq_cst 908 ret void 909} 910 911define amdgpu_gfx i32 @flat_atomic_add_i32_ret_scalar(ptr inreg %ptr, i32 inreg %in) { 912; GCN1-LABEL: flat_atomic_add_i32_ret_scalar: 913; GCN1: ; %bb.0: 914; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 915; GCN1-NEXT: v_mov_b32_e32 v0, s4 916; GCN1-NEXT: v_mov_b32_e32 v1, s5 917; GCN1-NEXT: v_mov_b32_e32 v2, s6 918; GCN1-NEXT: flat_atomic_add v0, v[0:1], v2 glc 919; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 920; GCN1-NEXT: buffer_wbinvl1_vol 921; GCN1-NEXT: s_setpc_b64 s[30:31] 922; 923; GCN2-LABEL: flat_atomic_add_i32_ret_scalar: 924; GCN2: ; 
%bb.0: 925; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 926; GCN2-NEXT: v_mov_b32_e32 v0, s4 927; GCN2-NEXT: v_mov_b32_e32 v1, s5 928; GCN2-NEXT: v_mov_b32_e32 v2, s6 929; GCN2-NEXT: flat_atomic_add v0, v[0:1], v2 glc 930; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 931; GCN2-NEXT: buffer_wbinvl1_vol 932; GCN2-NEXT: s_setpc_b64 s[30:31] 933; 934; GCN3-LABEL: flat_atomic_add_i32_ret_scalar: 935; GCN3: ; %bb.0: 936; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 937; GCN3-NEXT: v_mov_b32_e32 v0, s4 938; GCN3-NEXT: v_mov_b32_e32 v1, s5 939; GCN3-NEXT: v_mov_b32_e32 v2, s6 940; GCN3-NEXT: flat_atomic_add v0, v[0:1], v2 glc 941; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 942; GCN3-NEXT: buffer_wbinvl1_vol 943; GCN3-NEXT: s_setpc_b64 s[30:31] 944 %result = atomicrmw add ptr %ptr, i32 %in seq_cst 945 ret i32 %result 946} 947 948define amdgpu_gfx i32 @flat_atomic_add_i32_ret_offset_scalar(ptr inreg %out, i32 inreg %in) { 949; GCN1-LABEL: flat_atomic_add_i32_ret_offset_scalar: 950; GCN1: ; %bb.0: 951; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 952; GCN1-NEXT: s_add_u32 s34, s4, 16 953; GCN1-NEXT: s_addc_u32 s35, s5, 0 954; GCN1-NEXT: v_mov_b32_e32 v0, s34 955; GCN1-NEXT: v_mov_b32_e32 v1, s35 956; GCN1-NEXT: v_mov_b32_e32 v2, s6 957; GCN1-NEXT: flat_atomic_add v0, v[0:1], v2 glc 958; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 959; GCN1-NEXT: buffer_wbinvl1_vol 960; GCN1-NEXT: s_setpc_b64 s[30:31] 961; 962; GCN2-LABEL: flat_atomic_add_i32_ret_offset_scalar: 963; GCN2: ; %bb.0: 964; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 965; GCN2-NEXT: s_add_u32 s34, s4, 16 966; GCN2-NEXT: s_addc_u32 s35, s5, 0 967; GCN2-NEXT: v_mov_b32_e32 v0, s34 968; GCN2-NEXT: v_mov_b32_e32 v1, s35 969; GCN2-NEXT: v_mov_b32_e32 v2, s6 970; GCN2-NEXT: flat_atomic_add v0, v[0:1], v2 glc 971; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 972; GCN2-NEXT: buffer_wbinvl1_vol 973; GCN2-NEXT: s_setpc_b64 s[30:31] 974; 975; GCN3-LABEL: flat_atomic_add_i32_ret_offset_scalar: 976; GCN3: ; %bb.0: 
977; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 978; GCN3-NEXT: v_mov_b32_e32 v0, s4 979; GCN3-NEXT: v_mov_b32_e32 v1, s5 980; GCN3-NEXT: v_mov_b32_e32 v2, s6 981; GCN3-NEXT: flat_atomic_add v0, v[0:1], v2 offset:16 glc 982; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 983; GCN3-NEXT: buffer_wbinvl1_vol 984; GCN3-NEXT: s_setpc_b64 s[30:31] 985 %gep = getelementptr i32, ptr %out, i32 4 986 %result = atomicrmw add ptr %gep, i32 %in seq_cst 987 ret i32 %result 988} 989 990define void @flat_atomic_add_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) { 991; GCN1-LABEL: flat_atomic_add_i32_noret_offset__amdgpu_no_remote_memory: 992; GCN1: ; %bb.0: 993; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 994; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 995; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 996; GCN1-NEXT: flat_atomic_add v[0:1], v2 997; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 998; GCN1-NEXT: buffer_wbinvl1_vol 999; GCN1-NEXT: s_setpc_b64 s[30:31] 1000; 1001; GCN2-LABEL: flat_atomic_add_i32_noret_offset__amdgpu_no_remote_memory: 1002; GCN2: ; %bb.0: 1003; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1004; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 1005; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1006; GCN2-NEXT: flat_atomic_add v[0:1], v2 1007; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1008; GCN2-NEXT: buffer_wbinvl1_vol 1009; GCN2-NEXT: s_setpc_b64 s[30:31] 1010; 1011; GCN3-LABEL: flat_atomic_add_i32_noret_offset__amdgpu_no_remote_memory: 1012; GCN3: ; %bb.0: 1013; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1014; GCN3-NEXT: flat_atomic_add v[0:1], v2 offset:16 1015; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1016; GCN3-NEXT: buffer_wbinvl1_vol 1017; GCN3-NEXT: s_setpc_b64 s[30:31] 1018 %gep = getelementptr i32, ptr %out, i64 4 1019 %tmp0 = atomicrmw add ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 1020 ret void 1021} 1022 1023define i32 @flat_atomic_add_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) { 1024; 
GCN1-LABEL: flat_atomic_add_i32_ret_offset__amdgpu_no_remote_memory: 1025; GCN1: ; %bb.0: 1026; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1027; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 1028; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1029; GCN1-NEXT: flat_atomic_add v0, v[0:1], v2 glc 1030; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1031; GCN1-NEXT: buffer_wbinvl1_vol 1032; GCN1-NEXT: s_setpc_b64 s[30:31] 1033; 1034; GCN2-LABEL: flat_atomic_add_i32_ret_offset__amdgpu_no_remote_memory: 1035; GCN2: ; %bb.0: 1036; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1037; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 1038; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1039; GCN2-NEXT: flat_atomic_add v0, v[0:1], v2 glc 1040; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1041; GCN2-NEXT: buffer_wbinvl1_vol 1042; GCN2-NEXT: s_setpc_b64 s[30:31] 1043; 1044; GCN3-LABEL: flat_atomic_add_i32_ret_offset__amdgpu_no_remote_memory: 1045; GCN3: ; %bb.0: 1046; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1047; GCN3-NEXT: flat_atomic_add v0, v[0:1], v2 offset:16 glc 1048; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1049; GCN3-NEXT: buffer_wbinvl1_vol 1050; GCN3-NEXT: s_setpc_b64 s[30:31] 1051 %gep = getelementptr i32, ptr %out, i64 4 1052 %result = atomicrmw add ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 1053 ret i32 %result 1054} 1055 1056; --------------------------------------------------------------------- 1057; atomicrmw sub 1058; --------------------------------------------------------------------- 1059 1060define void @flat_atomic_sub_i32_noret(ptr %ptr, i32 %in) { 1061; GCN1-LABEL: flat_atomic_sub_i32_noret: 1062; GCN1: ; %bb.0: 1063; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1064; GCN1-NEXT: flat_atomic_sub v[0:1], v2 1065; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1066; GCN1-NEXT: buffer_wbinvl1_vol 1067; GCN1-NEXT: s_setpc_b64 s[30:31] 1068; 1069; GCN2-LABEL: flat_atomic_sub_i32_noret: 1070; GCN2: ; %bb.0: 1071; GCN2-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) 1072; GCN2-NEXT: flat_atomic_sub v[0:1], v2 1073; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1074; GCN2-NEXT: buffer_wbinvl1_vol 1075; GCN2-NEXT: s_setpc_b64 s[30:31] 1076; 1077; GCN3-LABEL: flat_atomic_sub_i32_noret: 1078; GCN3: ; %bb.0: 1079; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1080; GCN3-NEXT: flat_atomic_sub v[0:1], v2 1081; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1082; GCN3-NEXT: buffer_wbinvl1_vol 1083; GCN3-NEXT: s_setpc_b64 s[30:31] 1084 %tmp0 = atomicrmw sub ptr %ptr, i32 %in seq_cst 1085 ret void 1086} 1087 1088define void @flat_atomic_sub_i32_noret_offset(ptr %out, i32 %in) { 1089; GCN1-LABEL: flat_atomic_sub_i32_noret_offset: 1090; GCN1: ; %bb.0: 1091; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1092; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 1093; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1094; GCN1-NEXT: flat_atomic_sub v[0:1], v2 1095; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1096; GCN1-NEXT: buffer_wbinvl1_vol 1097; GCN1-NEXT: s_setpc_b64 s[30:31] 1098; 1099; GCN2-LABEL: flat_atomic_sub_i32_noret_offset: 1100; GCN2: ; %bb.0: 1101; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1102; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 1103; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1104; GCN2-NEXT: flat_atomic_sub v[0:1], v2 1105; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1106; GCN2-NEXT: buffer_wbinvl1_vol 1107; GCN2-NEXT: s_setpc_b64 s[30:31] 1108; 1109; GCN3-LABEL: flat_atomic_sub_i32_noret_offset: 1110; GCN3: ; %bb.0: 1111; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1112; GCN3-NEXT: flat_atomic_sub v[0:1], v2 offset:16 1113; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1114; GCN3-NEXT: buffer_wbinvl1_vol 1115; GCN3-NEXT: s_setpc_b64 s[30:31] 1116 %gep = getelementptr i32, ptr %out, i32 4 1117 %tmp0 = atomicrmw sub ptr %gep, i32 %in seq_cst 1118 ret void 1119} 1120 1121define i32 @flat_atomic_sub_i32_ret(ptr %ptr, i32 %in) { 1122; GCN1-LABEL: flat_atomic_sub_i32_ret: 1123; GCN1: ; %bb.0: 1124; 
GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1125; GCN1-NEXT: flat_atomic_sub v0, v[0:1], v2 glc 1126; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1127; GCN1-NEXT: buffer_wbinvl1_vol 1128; GCN1-NEXT: s_setpc_b64 s[30:31] 1129; 1130; GCN2-LABEL: flat_atomic_sub_i32_ret: 1131; GCN2: ; %bb.0: 1132; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1133; GCN2-NEXT: flat_atomic_sub v0, v[0:1], v2 glc 1134; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1135; GCN2-NEXT: buffer_wbinvl1_vol 1136; GCN2-NEXT: s_setpc_b64 s[30:31] 1137; 1138; GCN3-LABEL: flat_atomic_sub_i32_ret: 1139; GCN3: ; %bb.0: 1140; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1141; GCN3-NEXT: flat_atomic_sub v0, v[0:1], v2 glc 1142; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1143; GCN3-NEXT: buffer_wbinvl1_vol 1144; GCN3-NEXT: s_setpc_b64 s[30:31] 1145 %result = atomicrmw sub ptr %ptr, i32 %in seq_cst 1146 ret i32 %result 1147} 1148 1149define i32 @flat_atomic_sub_i32_ret_offset(ptr %out, i32 %in) { 1150; GCN1-LABEL: flat_atomic_sub_i32_ret_offset: 1151; GCN1: ; %bb.0: 1152; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1153; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 1154; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1155; GCN1-NEXT: flat_atomic_sub v0, v[0:1], v2 glc 1156; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1157; GCN1-NEXT: buffer_wbinvl1_vol 1158; GCN1-NEXT: s_setpc_b64 s[30:31] 1159; 1160; GCN2-LABEL: flat_atomic_sub_i32_ret_offset: 1161; GCN2: ; %bb.0: 1162; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1163; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 1164; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1165; GCN2-NEXT: flat_atomic_sub v0, v[0:1], v2 glc 1166; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1167; GCN2-NEXT: buffer_wbinvl1_vol 1168; GCN2-NEXT: s_setpc_b64 s[30:31] 1169; 1170; GCN3-LABEL: flat_atomic_sub_i32_ret_offset: 1171; GCN3: ; %bb.0: 1172; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1173; GCN3-NEXT: flat_atomic_sub v0, v[0:1], v2 offset:16 glc 1174; GCN3-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) 1175; GCN3-NEXT: buffer_wbinvl1_vol 1176; GCN3-NEXT: s_setpc_b64 s[30:31] 1177 %gep = getelementptr i32, ptr %out, i32 4 1178 %result = atomicrmw sub ptr %gep, i32 %in seq_cst 1179 ret i32 %result 1180} 1181 1182define amdgpu_gfx void @flat_atomic_sub_i32_noret_scalar(ptr inreg %ptr, i32 inreg %in) { 1183; GCN1-LABEL: flat_atomic_sub_i32_noret_scalar: 1184; GCN1: ; %bb.0: 1185; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1186; GCN1-NEXT: v_mov_b32_e32 v0, s4 1187; GCN1-NEXT: v_mov_b32_e32 v1, s5 1188; GCN1-NEXT: v_mov_b32_e32 v2, s6 1189; GCN1-NEXT: flat_atomic_sub v[0:1], v2 1190; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1191; GCN1-NEXT: buffer_wbinvl1_vol 1192; GCN1-NEXT: s_setpc_b64 s[30:31] 1193; 1194; GCN2-LABEL: flat_atomic_sub_i32_noret_scalar: 1195; GCN2: ; %bb.0: 1196; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1197; GCN2-NEXT: v_mov_b32_e32 v0, s4 1198; GCN2-NEXT: v_mov_b32_e32 v1, s5 1199; GCN2-NEXT: v_mov_b32_e32 v2, s6 1200; GCN2-NEXT: flat_atomic_sub v[0:1], v2 1201; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1202; GCN2-NEXT: buffer_wbinvl1_vol 1203; GCN2-NEXT: s_setpc_b64 s[30:31] 1204; 1205; GCN3-LABEL: flat_atomic_sub_i32_noret_scalar: 1206; GCN3: ; %bb.0: 1207; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1208; GCN3-NEXT: v_mov_b32_e32 v0, s4 1209; GCN3-NEXT: v_mov_b32_e32 v1, s5 1210; GCN3-NEXT: v_mov_b32_e32 v2, s6 1211; GCN3-NEXT: flat_atomic_sub v[0:1], v2 1212; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1213; GCN3-NEXT: buffer_wbinvl1_vol 1214; GCN3-NEXT: s_setpc_b64 s[30:31] 1215 %tmp0 = atomicrmw sub ptr %ptr, i32 %in seq_cst 1216 ret void 1217} 1218 1219define amdgpu_gfx void @flat_atomic_sub_i32_noret_offset_scalar(ptr inreg %out, i32 inreg %in) { 1220; GCN1-LABEL: flat_atomic_sub_i32_noret_offset_scalar: 1221; GCN1: ; %bb.0: 1222; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1223; GCN1-NEXT: s_add_u32 s34, s4, 16 1224; GCN1-NEXT: s_addc_u32 s35, s5, 0 1225; GCN1-NEXT: v_mov_b32_e32 v0, 
s34 1226; GCN1-NEXT: v_mov_b32_e32 v1, s35 1227; GCN1-NEXT: v_mov_b32_e32 v2, s6 1228; GCN1-NEXT: flat_atomic_sub v[0:1], v2 1229; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1230; GCN1-NEXT: buffer_wbinvl1_vol 1231; GCN1-NEXT: s_setpc_b64 s[30:31] 1232; 1233; GCN2-LABEL: flat_atomic_sub_i32_noret_offset_scalar: 1234; GCN2: ; %bb.0: 1235; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1236; GCN2-NEXT: s_add_u32 s34, s4, 16 1237; GCN2-NEXT: s_addc_u32 s35, s5, 0 1238; GCN2-NEXT: v_mov_b32_e32 v0, s34 1239; GCN2-NEXT: v_mov_b32_e32 v1, s35 1240; GCN2-NEXT: v_mov_b32_e32 v2, s6 1241; GCN2-NEXT: flat_atomic_sub v[0:1], v2 1242; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1243; GCN2-NEXT: buffer_wbinvl1_vol 1244; GCN2-NEXT: s_setpc_b64 s[30:31] 1245; 1246; GCN3-LABEL: flat_atomic_sub_i32_noret_offset_scalar: 1247; GCN3: ; %bb.0: 1248; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1249; GCN3-NEXT: v_mov_b32_e32 v0, s4 1250; GCN3-NEXT: v_mov_b32_e32 v1, s5 1251; GCN3-NEXT: v_mov_b32_e32 v2, s6 1252; GCN3-NEXT: flat_atomic_sub v[0:1], v2 offset:16 1253; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1254; GCN3-NEXT: buffer_wbinvl1_vol 1255; GCN3-NEXT: s_setpc_b64 s[30:31] 1256 %gep = getelementptr i32, ptr %out, i32 4 1257 %tmp0 = atomicrmw sub ptr %gep, i32 %in seq_cst 1258 ret void 1259} 1260 1261define amdgpu_gfx i32 @flat_atomic_sub_i32_ret_scalar(ptr inreg %ptr, i32 inreg %in) { 1262; GCN1-LABEL: flat_atomic_sub_i32_ret_scalar: 1263; GCN1: ; %bb.0: 1264; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1265; GCN1-NEXT: v_mov_b32_e32 v0, s4 1266; GCN1-NEXT: v_mov_b32_e32 v1, s5 1267; GCN1-NEXT: v_mov_b32_e32 v2, s6 1268; GCN1-NEXT: flat_atomic_sub v0, v[0:1], v2 glc 1269; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1270; GCN1-NEXT: buffer_wbinvl1_vol 1271; GCN1-NEXT: s_setpc_b64 s[30:31] 1272; 1273; GCN2-LABEL: flat_atomic_sub_i32_ret_scalar: 1274; GCN2: ; %bb.0: 1275; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1276; GCN2-NEXT: v_mov_b32_e32 v0, s4 1277; GCN2-NEXT: 
v_mov_b32_e32 v1, s5 1278; GCN2-NEXT: v_mov_b32_e32 v2, s6 1279; GCN2-NEXT: flat_atomic_sub v0, v[0:1], v2 glc 1280; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1281; GCN2-NEXT: buffer_wbinvl1_vol 1282; GCN2-NEXT: s_setpc_b64 s[30:31] 1283; 1284; GCN3-LABEL: flat_atomic_sub_i32_ret_scalar: 1285; GCN3: ; %bb.0: 1286; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1287; GCN3-NEXT: v_mov_b32_e32 v0, s4 1288; GCN3-NEXT: v_mov_b32_e32 v1, s5 1289; GCN3-NEXT: v_mov_b32_e32 v2, s6 1290; GCN3-NEXT: flat_atomic_sub v0, v[0:1], v2 glc 1291; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1292; GCN3-NEXT: buffer_wbinvl1_vol 1293; GCN3-NEXT: s_setpc_b64 s[30:31] 1294 %result = atomicrmw sub ptr %ptr, i32 %in seq_cst 1295 ret i32 %result 1296} 1297 1298define amdgpu_gfx i32 @flat_atomic_sub_i32_ret_offset_scalar(ptr inreg %out, i32 inreg %in) { 1299; GCN1-LABEL: flat_atomic_sub_i32_ret_offset_scalar: 1300; GCN1: ; %bb.0: 1301; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1302; GCN1-NEXT: s_add_u32 s34, s4, 16 1303; GCN1-NEXT: s_addc_u32 s35, s5, 0 1304; GCN1-NEXT: v_mov_b32_e32 v0, s34 1305; GCN1-NEXT: v_mov_b32_e32 v1, s35 1306; GCN1-NEXT: v_mov_b32_e32 v2, s6 1307; GCN1-NEXT: flat_atomic_sub v0, v[0:1], v2 glc 1308; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1309; GCN1-NEXT: buffer_wbinvl1_vol 1310; GCN1-NEXT: s_setpc_b64 s[30:31] 1311; 1312; GCN2-LABEL: flat_atomic_sub_i32_ret_offset_scalar: 1313; GCN2: ; %bb.0: 1314; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1315; GCN2-NEXT: s_add_u32 s34, s4, 16 1316; GCN2-NEXT: s_addc_u32 s35, s5, 0 1317; GCN2-NEXT: v_mov_b32_e32 v0, s34 1318; GCN2-NEXT: v_mov_b32_e32 v1, s35 1319; GCN2-NEXT: v_mov_b32_e32 v2, s6 1320; GCN2-NEXT: flat_atomic_sub v0, v[0:1], v2 glc 1321; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1322; GCN2-NEXT: buffer_wbinvl1_vol 1323; GCN2-NEXT: s_setpc_b64 s[30:31] 1324; 1325; GCN3-LABEL: flat_atomic_sub_i32_ret_offset_scalar: 1326; GCN3: ; %bb.0: 1327; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1328; 
GCN3-NEXT: v_mov_b32_e32 v0, s4 1329; GCN3-NEXT: v_mov_b32_e32 v1, s5 1330; GCN3-NEXT: v_mov_b32_e32 v2, s6 1331; GCN3-NEXT: flat_atomic_sub v0, v[0:1], v2 offset:16 glc 1332; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1333; GCN3-NEXT: buffer_wbinvl1_vol 1334; GCN3-NEXT: s_setpc_b64 s[30:31] 1335 %gep = getelementptr i32, ptr %out, i32 4 1336 %result = atomicrmw sub ptr %gep, i32 %in seq_cst 1337 ret i32 %result 1338} 1339 1340define void @flat_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) { 1341; GCN1-LABEL: flat_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory: 1342; GCN1: ; %bb.0: 1343; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1344; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 1345; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1346; GCN1-NEXT: flat_atomic_sub v[0:1], v2 1347; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1348; GCN1-NEXT: buffer_wbinvl1_vol 1349; GCN1-NEXT: s_setpc_b64 s[30:31] 1350; 1351; GCN2-LABEL: flat_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory: 1352; GCN2: ; %bb.0: 1353; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1354; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 1355; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1356; GCN2-NEXT: flat_atomic_sub v[0:1], v2 1357; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1358; GCN2-NEXT: buffer_wbinvl1_vol 1359; GCN2-NEXT: s_setpc_b64 s[30:31] 1360; 1361; GCN3-LABEL: flat_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory: 1362; GCN3: ; %bb.0: 1363; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1364; GCN3-NEXT: flat_atomic_sub v[0:1], v2 offset:16 1365; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1366; GCN3-NEXT: buffer_wbinvl1_vol 1367; GCN3-NEXT: s_setpc_b64 s[30:31] 1368 %gep = getelementptr i32, ptr %out, i64 4 1369 %tmp0 = atomicrmw sub ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 1370 ret void 1371} 1372 1373define i32 @flat_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) { 1374; GCN1-LABEL: 
flat_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory: 1375; GCN1: ; %bb.0: 1376; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1377; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 1378; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1379; GCN1-NEXT: flat_atomic_sub v0, v[0:1], v2 glc 1380; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1381; GCN1-NEXT: buffer_wbinvl1_vol 1382; GCN1-NEXT: s_setpc_b64 s[30:31] 1383; 1384; GCN2-LABEL: flat_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory: 1385; GCN2: ; %bb.0: 1386; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1387; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 1388; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1389; GCN2-NEXT: flat_atomic_sub v0, v[0:1], v2 glc 1390; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1391; GCN2-NEXT: buffer_wbinvl1_vol 1392; GCN2-NEXT: s_setpc_b64 s[30:31] 1393; 1394; GCN3-LABEL: flat_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory: 1395; GCN3: ; %bb.0: 1396; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1397; GCN3-NEXT: flat_atomic_sub v0, v[0:1], v2 offset:16 glc 1398; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1399; GCN3-NEXT: buffer_wbinvl1_vol 1400; GCN3-NEXT: s_setpc_b64 s[30:31] 1401 %gep = getelementptr i32, ptr %out, i64 4 1402 %result = atomicrmw sub ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 1403 ret i32 %result 1404} 1405 1406; --------------------------------------------------------------------- 1407; atomicrmw and 1408; --------------------------------------------------------------------- 1409 1410define void @flat_atomic_and_i32_noret(ptr %ptr, i32 %in) { 1411; GCN1-LABEL: flat_atomic_and_i32_noret: 1412; GCN1: ; %bb.0: 1413; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1414; GCN1-NEXT: flat_atomic_and v[0:1], v2 1415; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1416; GCN1-NEXT: buffer_wbinvl1_vol 1417; GCN1-NEXT: s_setpc_b64 s[30:31] 1418; 1419; GCN2-LABEL: flat_atomic_and_i32_noret: 1420; GCN2: ; %bb.0: 1421; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) 1422; GCN2-NEXT: flat_atomic_and v[0:1], v2 1423; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1424; GCN2-NEXT: buffer_wbinvl1_vol 1425; GCN2-NEXT: s_setpc_b64 s[30:31] 1426; 1427; GCN3-LABEL: flat_atomic_and_i32_noret: 1428; GCN3: ; %bb.0: 1429; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1430; GCN3-NEXT: flat_atomic_and v[0:1], v2 1431; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1432; GCN3-NEXT: buffer_wbinvl1_vol 1433; GCN3-NEXT: s_setpc_b64 s[30:31] 1434 %tmp0 = atomicrmw and ptr %ptr, i32 %in seq_cst 1435 ret void 1436} 1437 1438define void @flat_atomic_and_i32_noret_offset(ptr %out, i32 %in) { 1439; GCN1-LABEL: flat_atomic_and_i32_noret_offset: 1440; GCN1: ; %bb.0: 1441; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1442; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 1443; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1444; GCN1-NEXT: flat_atomic_and v[0:1], v2 1445; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1446; GCN1-NEXT: buffer_wbinvl1_vol 1447; GCN1-NEXT: s_setpc_b64 s[30:31] 1448; 1449; GCN2-LABEL: flat_atomic_and_i32_noret_offset: 1450; GCN2: ; %bb.0: 1451; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1452; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 1453; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1454; GCN2-NEXT: flat_atomic_and v[0:1], v2 1455; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1456; GCN2-NEXT: buffer_wbinvl1_vol 1457; GCN2-NEXT: s_setpc_b64 s[30:31] 1458; 1459; GCN3-LABEL: flat_atomic_and_i32_noret_offset: 1460; GCN3: ; %bb.0: 1461; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1462; GCN3-NEXT: flat_atomic_and v[0:1], v2 offset:16 1463; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1464; GCN3-NEXT: buffer_wbinvl1_vol 1465; GCN3-NEXT: s_setpc_b64 s[30:31] 1466 %gep = getelementptr i32, ptr %out, i32 4 1467 %tmp0 = atomicrmw and ptr %gep, i32 %in seq_cst 1468 ret void 1469} 1470 1471define i32 @flat_atomic_and_i32_ret(ptr %ptr, i32 %in) { 1472; GCN1-LABEL: flat_atomic_and_i32_ret: 1473; GCN1: ; %bb.0: 1474; GCN1-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1475; GCN1-NEXT: flat_atomic_and v0, v[0:1], v2 glc 1476; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1477; GCN1-NEXT: buffer_wbinvl1_vol 1478; GCN1-NEXT: s_setpc_b64 s[30:31] 1479; 1480; GCN2-LABEL: flat_atomic_and_i32_ret: 1481; GCN2: ; %bb.0: 1482; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1483; GCN2-NEXT: flat_atomic_and v0, v[0:1], v2 glc 1484; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1485; GCN2-NEXT: buffer_wbinvl1_vol 1486; GCN2-NEXT: s_setpc_b64 s[30:31] 1487; 1488; GCN3-LABEL: flat_atomic_and_i32_ret: 1489; GCN3: ; %bb.0: 1490; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1491; GCN3-NEXT: flat_atomic_and v0, v[0:1], v2 glc 1492; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1493; GCN3-NEXT: buffer_wbinvl1_vol 1494; GCN3-NEXT: s_setpc_b64 s[30:31] 1495 %result = atomicrmw and ptr %ptr, i32 %in seq_cst 1496 ret i32 %result 1497} 1498 1499define i32 @flat_atomic_and_i32_ret_offset(ptr %out, i32 %in) { 1500; GCN1-LABEL: flat_atomic_and_i32_ret_offset: 1501; GCN1: ; %bb.0: 1502; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1503; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 1504; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1505; GCN1-NEXT: flat_atomic_and v0, v[0:1], v2 glc 1506; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1507; GCN1-NEXT: buffer_wbinvl1_vol 1508; GCN1-NEXT: s_setpc_b64 s[30:31] 1509; 1510; GCN2-LABEL: flat_atomic_and_i32_ret_offset: 1511; GCN2: ; %bb.0: 1512; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1513; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 1514; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1515; GCN2-NEXT: flat_atomic_and v0, v[0:1], v2 glc 1516; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1517; GCN2-NEXT: buffer_wbinvl1_vol 1518; GCN2-NEXT: s_setpc_b64 s[30:31] 1519; 1520; GCN3-LABEL: flat_atomic_and_i32_ret_offset: 1521; GCN3: ; %bb.0: 1522; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1523; GCN3-NEXT: flat_atomic_and v0, v[0:1], v2 offset:16 glc 1524; GCN3-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) 1525; GCN3-NEXT: buffer_wbinvl1_vol 1526; GCN3-NEXT: s_setpc_b64 s[30:31] 1527 %gep = getelementptr i32, ptr %out, i32 4 1528 %result = atomicrmw and ptr %gep, i32 %in seq_cst 1529 ret i32 %result 1530} 1531 1532define amdgpu_gfx void @flat_atomic_and_i32_noret_scalar(ptr inreg %ptr, i32 inreg %in) { 1533; GCN1-LABEL: flat_atomic_and_i32_noret_scalar: 1534; GCN1: ; %bb.0: 1535; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1536; GCN1-NEXT: v_mov_b32_e32 v0, s4 1537; GCN1-NEXT: v_mov_b32_e32 v1, s5 1538; GCN1-NEXT: v_mov_b32_e32 v2, s6 1539; GCN1-NEXT: flat_atomic_and v[0:1], v2 1540; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1541; GCN1-NEXT: buffer_wbinvl1_vol 1542; GCN1-NEXT: s_setpc_b64 s[30:31] 1543; 1544; GCN2-LABEL: flat_atomic_and_i32_noret_scalar: 1545; GCN2: ; %bb.0: 1546; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1547; GCN2-NEXT: v_mov_b32_e32 v0, s4 1548; GCN2-NEXT: v_mov_b32_e32 v1, s5 1549; GCN2-NEXT: v_mov_b32_e32 v2, s6 1550; GCN2-NEXT: flat_atomic_and v[0:1], v2 1551; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1552; GCN2-NEXT: buffer_wbinvl1_vol 1553; GCN2-NEXT: s_setpc_b64 s[30:31] 1554; 1555; GCN3-LABEL: flat_atomic_and_i32_noret_scalar: 1556; GCN3: ; %bb.0: 1557; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1558; GCN3-NEXT: v_mov_b32_e32 v0, s4 1559; GCN3-NEXT: v_mov_b32_e32 v1, s5 1560; GCN3-NEXT: v_mov_b32_e32 v2, s6 1561; GCN3-NEXT: flat_atomic_and v[0:1], v2 1562; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1563; GCN3-NEXT: buffer_wbinvl1_vol 1564; GCN3-NEXT: s_setpc_b64 s[30:31] 1565 %tmp0 = atomicrmw and ptr %ptr, i32 %in seq_cst 1566 ret void 1567} 1568 1569define amdgpu_gfx void @flat_atomic_and_i32_noret_offset_scalar(ptr inreg %out, i32 inreg %in) { 1570; GCN1-LABEL: flat_atomic_and_i32_noret_offset_scalar: 1571; GCN1: ; %bb.0: 1572; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1573; GCN1-NEXT: s_add_u32 s34, s4, 16 1574; GCN1-NEXT: s_addc_u32 s35, s5, 0 1575; GCN1-NEXT: v_mov_b32_e32 v0, s34 1576; 
GCN1-NEXT: v_mov_b32_e32 v1, s35 1577; GCN1-NEXT: v_mov_b32_e32 v2, s6 1578; GCN1-NEXT: flat_atomic_and v[0:1], v2 1579; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1580; GCN1-NEXT: buffer_wbinvl1_vol 1581; GCN1-NEXT: s_setpc_b64 s[30:31] 1582; 1583; GCN2-LABEL: flat_atomic_and_i32_noret_offset_scalar: 1584; GCN2: ; %bb.0: 1585; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1586; GCN2-NEXT: s_add_u32 s34, s4, 16 1587; GCN2-NEXT: s_addc_u32 s35, s5, 0 1588; GCN2-NEXT: v_mov_b32_e32 v0, s34 1589; GCN2-NEXT: v_mov_b32_e32 v1, s35 1590; GCN2-NEXT: v_mov_b32_e32 v2, s6 1591; GCN2-NEXT: flat_atomic_and v[0:1], v2 1592; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1593; GCN2-NEXT: buffer_wbinvl1_vol 1594; GCN2-NEXT: s_setpc_b64 s[30:31] 1595; 1596; GCN3-LABEL: flat_atomic_and_i32_noret_offset_scalar: 1597; GCN3: ; %bb.0: 1598; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1599; GCN3-NEXT: v_mov_b32_e32 v0, s4 1600; GCN3-NEXT: v_mov_b32_e32 v1, s5 1601; GCN3-NEXT: v_mov_b32_e32 v2, s6 1602; GCN3-NEXT: flat_atomic_and v[0:1], v2 offset:16 1603; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1604; GCN3-NEXT: buffer_wbinvl1_vol 1605; GCN3-NEXT: s_setpc_b64 s[30:31] 1606 %gep = getelementptr i32, ptr %out, i32 4 1607 %tmp0 = atomicrmw and ptr %gep, i32 %in seq_cst 1608 ret void 1609} 1610 1611define amdgpu_gfx i32 @flat_atomic_and_i32_ret_scalar(ptr inreg %ptr, i32 inreg %in) { 1612; GCN1-LABEL: flat_atomic_and_i32_ret_scalar: 1613; GCN1: ; %bb.0: 1614; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1615; GCN1-NEXT: v_mov_b32_e32 v0, s4 1616; GCN1-NEXT: v_mov_b32_e32 v1, s5 1617; GCN1-NEXT: v_mov_b32_e32 v2, s6 1618; GCN1-NEXT: flat_atomic_and v0, v[0:1], v2 glc 1619; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1620; GCN1-NEXT: buffer_wbinvl1_vol 1621; GCN1-NEXT: s_setpc_b64 s[30:31] 1622; 1623; GCN2-LABEL: flat_atomic_and_i32_ret_scalar: 1624; GCN2: ; %bb.0: 1625; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1626; GCN2-NEXT: v_mov_b32_e32 v0, s4 1627; GCN2-NEXT: 
v_mov_b32_e32 v1, s5 1628; GCN2-NEXT: v_mov_b32_e32 v2, s6 1629; GCN2-NEXT: flat_atomic_and v0, v[0:1], v2 glc 1630; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1631; GCN2-NEXT: buffer_wbinvl1_vol 1632; GCN2-NEXT: s_setpc_b64 s[30:31] 1633; 1634; GCN3-LABEL: flat_atomic_and_i32_ret_scalar: 1635; GCN3: ; %bb.0: 1636; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1637; GCN3-NEXT: v_mov_b32_e32 v0, s4 1638; GCN3-NEXT: v_mov_b32_e32 v1, s5 1639; GCN3-NEXT: v_mov_b32_e32 v2, s6 1640; GCN3-NEXT: flat_atomic_and v0, v[0:1], v2 glc 1641; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1642; GCN3-NEXT: buffer_wbinvl1_vol 1643; GCN3-NEXT: s_setpc_b64 s[30:31] 1644 %result = atomicrmw and ptr %ptr, i32 %in seq_cst 1645 ret i32 %result 1646} 1647 1648define amdgpu_gfx i32 @flat_atomic_and_i32_ret_offset_scalar(ptr inreg %out, i32 inreg %in) { 1649; GCN1-LABEL: flat_atomic_and_i32_ret_offset_scalar: 1650; GCN1: ; %bb.0: 1651; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1652; GCN1-NEXT: s_add_u32 s34, s4, 16 1653; GCN1-NEXT: s_addc_u32 s35, s5, 0 1654; GCN1-NEXT: v_mov_b32_e32 v0, s34 1655; GCN1-NEXT: v_mov_b32_e32 v1, s35 1656; GCN1-NEXT: v_mov_b32_e32 v2, s6 1657; GCN1-NEXT: flat_atomic_and v0, v[0:1], v2 glc 1658; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1659; GCN1-NEXT: buffer_wbinvl1_vol 1660; GCN1-NEXT: s_setpc_b64 s[30:31] 1661; 1662; GCN2-LABEL: flat_atomic_and_i32_ret_offset_scalar: 1663; GCN2: ; %bb.0: 1664; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1665; GCN2-NEXT: s_add_u32 s34, s4, 16 1666; GCN2-NEXT: s_addc_u32 s35, s5, 0 1667; GCN2-NEXT: v_mov_b32_e32 v0, s34 1668; GCN2-NEXT: v_mov_b32_e32 v1, s35 1669; GCN2-NEXT: v_mov_b32_e32 v2, s6 1670; GCN2-NEXT: flat_atomic_and v0, v[0:1], v2 glc 1671; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1672; GCN2-NEXT: buffer_wbinvl1_vol 1673; GCN2-NEXT: s_setpc_b64 s[30:31] 1674; 1675; GCN3-LABEL: flat_atomic_and_i32_ret_offset_scalar: 1676; GCN3: ; %bb.0: 1677; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1678; 
GCN3-NEXT: v_mov_b32_e32 v0, s4 1679; GCN3-NEXT: v_mov_b32_e32 v1, s5 1680; GCN3-NEXT: v_mov_b32_e32 v2, s6 1681; GCN3-NEXT: flat_atomic_and v0, v[0:1], v2 offset:16 glc 1682; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1683; GCN3-NEXT: buffer_wbinvl1_vol 1684; GCN3-NEXT: s_setpc_b64 s[30:31] 1685 %gep = getelementptr i32, ptr %out, i32 4 1686 %result = atomicrmw and ptr %gep, i32 %in seq_cst 1687 ret i32 %result 1688} 1689 1690define void @flat_atomic_and_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) { 1691; GCN1-LABEL: flat_atomic_and_i32_noret_offset__amdgpu_no_remote_memory: 1692; GCN1: ; %bb.0: 1693; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1694; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 1695; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1696; GCN1-NEXT: flat_atomic_and v[0:1], v2 1697; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1698; GCN1-NEXT: buffer_wbinvl1_vol 1699; GCN1-NEXT: s_setpc_b64 s[30:31] 1700; 1701; GCN2-LABEL: flat_atomic_and_i32_noret_offset__amdgpu_no_remote_memory: 1702; GCN2: ; %bb.0: 1703; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1704; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 1705; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1706; GCN2-NEXT: flat_atomic_and v[0:1], v2 1707; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1708; GCN2-NEXT: buffer_wbinvl1_vol 1709; GCN2-NEXT: s_setpc_b64 s[30:31] 1710; 1711; GCN3-LABEL: flat_atomic_and_i32_noret_offset__amdgpu_no_remote_memory: 1712; GCN3: ; %bb.0: 1713; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1714; GCN3-NEXT: flat_atomic_and v[0:1], v2 offset:16 1715; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1716; GCN3-NEXT: buffer_wbinvl1_vol 1717; GCN3-NEXT: s_setpc_b64 s[30:31] 1718 %gep = getelementptr i32, ptr %out, i64 4 1719 %tmp0 = atomicrmw and ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 1720 ret void 1721} 1722 1723define i32 @flat_atomic_and_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) { 1724; GCN1-LABEL: 
flat_atomic_and_i32_ret_offset__amdgpu_no_remote_memory: 1725; GCN1: ; %bb.0: 1726; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1727; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 1728; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1729; GCN1-NEXT: flat_atomic_and v0, v[0:1], v2 glc 1730; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1731; GCN1-NEXT: buffer_wbinvl1_vol 1732; GCN1-NEXT: s_setpc_b64 s[30:31] 1733; 1734; GCN2-LABEL: flat_atomic_and_i32_ret_offset__amdgpu_no_remote_memory: 1735; GCN2: ; %bb.0: 1736; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1737; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 1738; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1739; GCN2-NEXT: flat_atomic_and v0, v[0:1], v2 glc 1740; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1741; GCN2-NEXT: buffer_wbinvl1_vol 1742; GCN2-NEXT: s_setpc_b64 s[30:31] 1743; 1744; GCN3-LABEL: flat_atomic_and_i32_ret_offset__amdgpu_no_remote_memory: 1745; GCN3: ; %bb.0: 1746; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1747; GCN3-NEXT: flat_atomic_and v0, v[0:1], v2 offset:16 glc 1748; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1749; GCN3-NEXT: buffer_wbinvl1_vol 1750; GCN3-NEXT: s_setpc_b64 s[30:31] 1751 %gep = getelementptr i32, ptr %out, i64 4 1752 %result = atomicrmw and ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 1753 ret i32 %result 1754} 1755 1756; --------------------------------------------------------------------- 1757; atomicrmw nand 1758; --------------------------------------------------------------------- 1759 1760define void @flat_atomic_nand_i32_noret(ptr %ptr, i32 %in) { 1761; GCN1-LABEL: flat_atomic_nand_i32_noret: 1762; GCN1: ; %bb.0: 1763; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1764; GCN1-NEXT: flat_load_dword v4, v[0:1] 1765; GCN1-NEXT: s_mov_b64 s[4:5], 0 1766; GCN1-NEXT: .LBB50_1: ; %atomicrmw.start 1767; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 1768; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1769; GCN1-NEXT: v_and_b32_e32 v3, v4, v2 1770; GCN1-NEXT: 
v_not_b32_e32 v3, v3 1771; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 1772; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1773; GCN1-NEXT: buffer_wbinvl1_vol 1774; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 1775; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1776; GCN1-NEXT: v_mov_b32_e32 v4, v3 1777; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] 1778; GCN1-NEXT: s_cbranch_execnz .LBB50_1 1779; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end 1780; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] 1781; GCN1-NEXT: s_setpc_b64 s[30:31] 1782; 1783; GCN2-LABEL: flat_atomic_nand_i32_noret: 1784; GCN2: ; %bb.0: 1785; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1786; GCN2-NEXT: flat_load_dword v4, v[0:1] 1787; GCN2-NEXT: s_mov_b64 s[4:5], 0 1788; GCN2-NEXT: .LBB50_1: ; %atomicrmw.start 1789; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 1790; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1791; GCN2-NEXT: v_and_b32_e32 v3, v4, v2 1792; GCN2-NEXT: v_not_b32_e32 v3, v3 1793; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 1794; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1795; GCN2-NEXT: buffer_wbinvl1_vol 1796; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 1797; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1798; GCN2-NEXT: v_mov_b32_e32 v4, v3 1799; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] 1800; GCN2-NEXT: s_cbranch_execnz .LBB50_1 1801; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end 1802; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] 1803; GCN2-NEXT: s_setpc_b64 s[30:31] 1804; 1805; GCN3-LABEL: flat_atomic_nand_i32_noret: 1806; GCN3: ; %bb.0: 1807; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1808; GCN3-NEXT: flat_load_dword v4, v[0:1] 1809; GCN3-NEXT: s_mov_b64 s[4:5], 0 1810; GCN3-NEXT: .LBB50_1: ; %atomicrmw.start 1811; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 1812; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1813; GCN3-NEXT: v_and_b32_e32 v3, v4, v2 1814; GCN3-NEXT: v_not_b32_e32 v3, v3 1815; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 1816; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1817; GCN3-NEXT: 
buffer_wbinvl1_vol 1818; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 1819; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1820; GCN3-NEXT: v_mov_b32_e32 v4, v3 1821; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] 1822; GCN3-NEXT: s_cbranch_execnz .LBB50_1 1823; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end 1824; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] 1825; GCN3-NEXT: s_setpc_b64 s[30:31] 1826 %tmp0 = atomicrmw nand ptr %ptr, i32 %in seq_cst 1827 ret void 1828} 1829 1830define void @flat_atomic_nand_i32_noret_offset(ptr %out, i32 %in) { 1831; GCN1-LABEL: flat_atomic_nand_i32_noret_offset: 1832; GCN1: ; %bb.0: 1833; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1834; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 1835; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1836; GCN1-NEXT: flat_load_dword v4, v[0:1] 1837; GCN1-NEXT: s_mov_b64 s[4:5], 0 1838; GCN1-NEXT: .LBB51_1: ; %atomicrmw.start 1839; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 1840; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1841; GCN1-NEXT: v_and_b32_e32 v3, v4, v2 1842; GCN1-NEXT: v_not_b32_e32 v3, v3 1843; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 1844; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1845; GCN1-NEXT: buffer_wbinvl1_vol 1846; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 1847; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1848; GCN1-NEXT: v_mov_b32_e32 v4, v3 1849; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] 1850; GCN1-NEXT: s_cbranch_execnz .LBB51_1 1851; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end 1852; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] 1853; GCN1-NEXT: s_setpc_b64 s[30:31] 1854; 1855; GCN2-LABEL: flat_atomic_nand_i32_noret_offset: 1856; GCN2: ; %bb.0: 1857; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1858; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 1859; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1860; GCN2-NEXT: flat_load_dword v4, v[0:1] 1861; GCN2-NEXT: s_mov_b64 s[4:5], 0 1862; GCN2-NEXT: .LBB51_1: ; %atomicrmw.start 1863; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 1864; GCN2-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) 1865; GCN2-NEXT: v_and_b32_e32 v3, v4, v2 1866; GCN2-NEXT: v_not_b32_e32 v3, v3 1867; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 1868; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1869; GCN2-NEXT: buffer_wbinvl1_vol 1870; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 1871; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1872; GCN2-NEXT: v_mov_b32_e32 v4, v3 1873; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] 1874; GCN2-NEXT: s_cbranch_execnz .LBB51_1 1875; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end 1876; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] 1877; GCN2-NEXT: s_setpc_b64 s[30:31] 1878; 1879; GCN3-LABEL: flat_atomic_nand_i32_noret_offset: 1880; GCN3: ; %bb.0: 1881; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1882; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 1883; GCN3-NEXT: s_mov_b64 s[4:5], 0 1884; GCN3-NEXT: .LBB51_1: ; %atomicrmw.start 1885; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 1886; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1887; GCN3-NEXT: v_and_b32_e32 v3, v4, v2 1888; GCN3-NEXT: v_not_b32_e32 v3, v3 1889; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc 1890; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1891; GCN3-NEXT: buffer_wbinvl1_vol 1892; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 1893; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1894; GCN3-NEXT: v_mov_b32_e32 v4, v3 1895; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] 1896; GCN3-NEXT: s_cbranch_execnz .LBB51_1 1897; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end 1898; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] 1899; GCN3-NEXT: s_setpc_b64 s[30:31] 1900 %gep = getelementptr i32, ptr %out, i32 4 1901 %tmp0 = atomicrmw nand ptr %gep, i32 %in seq_cst 1902 ret void 1903} 1904 1905define i32 @flat_atomic_nand_i32_ret(ptr %ptr, i32 %in) { 1906; GCN1-LABEL: flat_atomic_nand_i32_ret: 1907; GCN1: ; %bb.0: 1908; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1909; GCN1-NEXT: flat_load_dword v3, v[0:1] 1910; GCN1-NEXT: s_mov_b64 s[4:5], 0 1911; GCN1-NEXT: .LBB52_1: ; %atomicrmw.start 1912; GCN1-NEXT: ; =>This 
Inner Loop Header: Depth=1 1913; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1914; GCN1-NEXT: v_mov_b32_e32 v4, v3 1915; GCN1-NEXT: v_and_b32_e32 v3, v4, v2 1916; GCN1-NEXT: v_not_b32_e32 v3, v3 1917; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 1918; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1919; GCN1-NEXT: buffer_wbinvl1_vol 1920; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 1921; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1922; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] 1923; GCN1-NEXT: s_cbranch_execnz .LBB52_1 1924; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end 1925; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] 1926; GCN1-NEXT: v_mov_b32_e32 v0, v3 1927; GCN1-NEXT: s_setpc_b64 s[30:31] 1928; 1929; GCN2-LABEL: flat_atomic_nand_i32_ret: 1930; GCN2: ; %bb.0: 1931; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1932; GCN2-NEXT: flat_load_dword v3, v[0:1] 1933; GCN2-NEXT: s_mov_b64 s[4:5], 0 1934; GCN2-NEXT: .LBB52_1: ; %atomicrmw.start 1935; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 1936; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1937; GCN2-NEXT: v_mov_b32_e32 v4, v3 1938; GCN2-NEXT: v_and_b32_e32 v3, v4, v2 1939; GCN2-NEXT: v_not_b32_e32 v3, v3 1940; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 1941; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1942; GCN2-NEXT: buffer_wbinvl1_vol 1943; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 1944; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1945; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] 1946; GCN2-NEXT: s_cbranch_execnz .LBB52_1 1947; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end 1948; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] 1949; GCN2-NEXT: v_mov_b32_e32 v0, v3 1950; GCN2-NEXT: s_setpc_b64 s[30:31] 1951; 1952; GCN3-LABEL: flat_atomic_nand_i32_ret: 1953; GCN3: ; %bb.0: 1954; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1955; GCN3-NEXT: flat_load_dword v3, v[0:1] 1956; GCN3-NEXT: s_mov_b64 s[4:5], 0 1957; GCN3-NEXT: .LBB52_1: ; %atomicrmw.start 1958; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 1959; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
1960; GCN3-NEXT: v_mov_b32_e32 v4, v3 1961; GCN3-NEXT: v_and_b32_e32 v3, v4, v2 1962; GCN3-NEXT: v_not_b32_e32 v3, v3 1963; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 1964; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1965; GCN3-NEXT: buffer_wbinvl1_vol 1966; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 1967; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1968; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] 1969; GCN3-NEXT: s_cbranch_execnz .LBB52_1 1970; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end 1971; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] 1972; GCN3-NEXT: v_mov_b32_e32 v0, v3 1973; GCN3-NEXT: s_setpc_b64 s[30:31] 1974 %result = atomicrmw nand ptr %ptr, i32 %in seq_cst 1975 ret i32 %result 1976} 1977 1978define i32 @flat_atomic_nand_i32_ret_offset(ptr %out, i32 %in) { 1979; GCN1-LABEL: flat_atomic_nand_i32_ret_offset: 1980; GCN1: ; %bb.0: 1981; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1982; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0 1983; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 1984; GCN1-NEXT: flat_load_dword v0, v[3:4] 1985; GCN1-NEXT: s_mov_b64 s[4:5], 0 1986; GCN1-NEXT: .LBB53_1: ; %atomicrmw.start 1987; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 1988; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1989; GCN1-NEXT: v_mov_b32_e32 v1, v0 1990; GCN1-NEXT: v_and_b32_e32 v0, v1, v2 1991; GCN1-NEXT: v_not_b32_e32 v0, v0 1992; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 1993; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1994; GCN1-NEXT: buffer_wbinvl1_vol 1995; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 1996; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1997; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] 1998; GCN1-NEXT: s_cbranch_execnz .LBB53_1 1999; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end 2000; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] 2001; GCN1-NEXT: s_setpc_b64 s[30:31] 2002; 2003; GCN2-LABEL: flat_atomic_nand_i32_ret_offset: 2004; GCN2: ; %bb.0: 2005; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2006; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0 2007; GCN2-NEXT: 
v_addc_u32_e32 v4, vcc, 0, v1, vcc 2008; GCN2-NEXT: flat_load_dword v0, v[3:4] 2009; GCN2-NEXT: s_mov_b64 s[4:5], 0 2010; GCN2-NEXT: .LBB53_1: ; %atomicrmw.start 2011; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 2012; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2013; GCN2-NEXT: v_mov_b32_e32 v1, v0 2014; GCN2-NEXT: v_and_b32_e32 v0, v1, v2 2015; GCN2-NEXT: v_not_b32_e32 v0, v0 2016; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 2017; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2018; GCN2-NEXT: buffer_wbinvl1_vol 2019; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 2020; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2021; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] 2022; GCN2-NEXT: s_cbranch_execnz .LBB53_1 2023; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end 2024; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] 2025; GCN2-NEXT: s_setpc_b64 s[30:31] 2026; 2027; GCN3-LABEL: flat_atomic_nand_i32_ret_offset: 2028; GCN3: ; %bb.0: 2029; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2030; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 2031; GCN3-NEXT: s_mov_b64 s[4:5], 0 2032; GCN3-NEXT: .LBB53_1: ; %atomicrmw.start 2033; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 2034; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2035; GCN3-NEXT: v_mov_b32_e32 v4, v3 2036; GCN3-NEXT: v_and_b32_e32 v3, v4, v2 2037; GCN3-NEXT: v_not_b32_e32 v3, v3 2038; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc 2039; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2040; GCN3-NEXT: buffer_wbinvl1_vol 2041; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 2042; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2043; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] 2044; GCN3-NEXT: s_cbranch_execnz .LBB53_1 2045; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end 2046; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] 2047; GCN3-NEXT: v_mov_b32_e32 v0, v3 2048; GCN3-NEXT: s_setpc_b64 s[30:31] 2049 %gep = getelementptr i32, ptr %out, i32 4 2050 %result = atomicrmw nand ptr %gep, i32 %in seq_cst 2051 ret i32 %result 2052} 2053 2054define amdgpu_gfx void 
@flat_atomic_nand_i32_noret_scalar(ptr inreg %ptr, i32 inreg %in) { 2055; GCN1-LABEL: flat_atomic_nand_i32_noret_scalar: 2056; GCN1: ; %bb.0: 2057; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2058; GCN1-NEXT: v_mov_b32_e32 v0, s4 2059; GCN1-NEXT: v_mov_b32_e32 v1, s5 2060; GCN1-NEXT: flat_load_dword v3, v[0:1] 2061; GCN1-NEXT: s_mov_b64 s[34:35], 0 2062; GCN1-NEXT: .LBB54_1: ; %atomicrmw.start 2063; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 2064; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2065; GCN1-NEXT: v_and_b32_e32 v2, s6, v3 2066; GCN1-NEXT: v_not_b32_e32 v2, v2 2067; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2068; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2069; GCN1-NEXT: buffer_wbinvl1_vol 2070; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 2071; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 2072; GCN1-NEXT: v_mov_b32_e32 v3, v2 2073; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] 2074; GCN1-NEXT: s_cbranch_execnz .LBB54_1 2075; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end 2076; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] 2077; GCN1-NEXT: s_setpc_b64 s[30:31] 2078; 2079; GCN2-LABEL: flat_atomic_nand_i32_noret_scalar: 2080; GCN2: ; %bb.0: 2081; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2082; GCN2-NEXT: v_mov_b32_e32 v0, s4 2083; GCN2-NEXT: v_mov_b32_e32 v1, s5 2084; GCN2-NEXT: flat_load_dword v3, v[0:1] 2085; GCN2-NEXT: s_mov_b64 s[34:35], 0 2086; GCN2-NEXT: .LBB54_1: ; %atomicrmw.start 2087; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 2088; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2089; GCN2-NEXT: v_and_b32_e32 v2, s6, v3 2090; GCN2-NEXT: v_not_b32_e32 v2, v2 2091; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2092; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2093; GCN2-NEXT: buffer_wbinvl1_vol 2094; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 2095; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 2096; GCN2-NEXT: v_mov_b32_e32 v3, v2 2097; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] 2098; GCN2-NEXT: s_cbranch_execnz .LBB54_1 2099; GCN2-NEXT: ; %bb.2: 
; %atomicrmw.end 2100; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] 2101; GCN2-NEXT: s_setpc_b64 s[30:31] 2102; 2103; GCN3-LABEL: flat_atomic_nand_i32_noret_scalar: 2104; GCN3: ; %bb.0: 2105; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2106; GCN3-NEXT: v_mov_b32_e32 v0, s4 2107; GCN3-NEXT: v_mov_b32_e32 v1, s5 2108; GCN3-NEXT: flat_load_dword v3, v[0:1] 2109; GCN3-NEXT: s_mov_b64 s[34:35], 0 2110; GCN3-NEXT: .LBB54_1: ; %atomicrmw.start 2111; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 2112; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2113; GCN3-NEXT: v_and_b32_e32 v2, s6, v3 2114; GCN3-NEXT: v_not_b32_e32 v2, v2 2115; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2116; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2117; GCN3-NEXT: buffer_wbinvl1_vol 2118; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 2119; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 2120; GCN3-NEXT: v_mov_b32_e32 v3, v2 2121; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] 2122; GCN3-NEXT: s_cbranch_execnz .LBB54_1 2123; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end 2124; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] 2125; GCN3-NEXT: s_setpc_b64 s[30:31] 2126 %tmp0 = atomicrmw nand ptr %ptr, i32 %in seq_cst 2127 ret void 2128} 2129 2130define amdgpu_gfx void @flat_atomic_nand_i32_noret_offset_scalar(ptr inreg %out, i32 inreg %in) { 2131; GCN1-LABEL: flat_atomic_nand_i32_noret_offset_scalar: 2132; GCN1: ; %bb.0: 2133; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2134; GCN1-NEXT: s_add_u32 s34, s4, 16 2135; GCN1-NEXT: s_addc_u32 s35, s5, 0 2136; GCN1-NEXT: v_mov_b32_e32 v0, s34 2137; GCN1-NEXT: v_mov_b32_e32 v1, s35 2138; GCN1-NEXT: flat_load_dword v3, v[0:1] 2139; GCN1-NEXT: s_mov_b64 s[34:35], 0 2140; GCN1-NEXT: .LBB55_1: ; %atomicrmw.start 2141; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 2142; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2143; GCN1-NEXT: v_and_b32_e32 v2, s6, v3 2144; GCN1-NEXT: v_not_b32_e32 v2, v2 2145; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2146; GCN1-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) 2147; GCN1-NEXT: buffer_wbinvl1_vol 2148; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 2149; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 2150; GCN1-NEXT: v_mov_b32_e32 v3, v2 2151; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] 2152; GCN1-NEXT: s_cbranch_execnz .LBB55_1 2153; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end 2154; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] 2155; GCN1-NEXT: s_setpc_b64 s[30:31] 2156; 2157; GCN2-LABEL: flat_atomic_nand_i32_noret_offset_scalar: 2158; GCN2: ; %bb.0: 2159; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2160; GCN2-NEXT: s_add_u32 s34, s4, 16 2161; GCN2-NEXT: s_addc_u32 s35, s5, 0 2162; GCN2-NEXT: v_mov_b32_e32 v0, s34 2163; GCN2-NEXT: v_mov_b32_e32 v1, s35 2164; GCN2-NEXT: flat_load_dword v3, v[0:1] 2165; GCN2-NEXT: s_mov_b64 s[34:35], 0 2166; GCN2-NEXT: .LBB55_1: ; %atomicrmw.start 2167; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 2168; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2169; GCN2-NEXT: v_and_b32_e32 v2, s6, v3 2170; GCN2-NEXT: v_not_b32_e32 v2, v2 2171; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2172; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2173; GCN2-NEXT: buffer_wbinvl1_vol 2174; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 2175; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 2176; GCN2-NEXT: v_mov_b32_e32 v3, v2 2177; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] 2178; GCN2-NEXT: s_cbranch_execnz .LBB55_1 2179; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end 2180; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] 2181; GCN2-NEXT: s_setpc_b64 s[30:31] 2182; 2183; GCN3-LABEL: flat_atomic_nand_i32_noret_offset_scalar: 2184; GCN3: ; %bb.0: 2185; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2186; GCN3-NEXT: v_mov_b32_e32 v0, s4 2187; GCN3-NEXT: v_mov_b32_e32 v1, s5 2188; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 2189; GCN3-NEXT: s_mov_b64 s[34:35], 0 2190; GCN3-NEXT: .LBB55_1: ; %atomicrmw.start 2191; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 2192; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2193; GCN3-NEXT: 
v_and_b32_e32 v2, s6, v3 2194; GCN3-NEXT: v_not_b32_e32 v2, v2 2195; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 2196; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2197; GCN3-NEXT: buffer_wbinvl1_vol 2198; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 2199; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 2200; GCN3-NEXT: v_mov_b32_e32 v3, v2 2201; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] 2202; GCN3-NEXT: s_cbranch_execnz .LBB55_1 2203; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end 2204; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] 2205; GCN3-NEXT: s_setpc_b64 s[30:31] 2206 %gep = getelementptr i32, ptr %out, i32 4 2207 %tmp0 = atomicrmw nand ptr %gep, i32 %in seq_cst 2208 ret void 2209} 2210 2211define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_scalar(ptr inreg %ptr, i32 inreg %in) { 2212; GCN1-LABEL: flat_atomic_nand_i32_ret_scalar: 2213; GCN1: ; %bb.0: 2214; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2215; GCN1-NEXT: v_mov_b32_e32 v0, s4 2216; GCN1-NEXT: v_mov_b32_e32 v1, s5 2217; GCN1-NEXT: flat_load_dword v0, v[0:1] 2218; GCN1-NEXT: v_mov_b32_e32 v1, s4 2219; GCN1-NEXT: s_mov_b64 s[34:35], 0 2220; GCN1-NEXT: v_mov_b32_e32 v2, s5 2221; GCN1-NEXT: .LBB56_1: ; %atomicrmw.start 2222; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 2223; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2224; GCN1-NEXT: v_mov_b32_e32 v4, v0 2225; GCN1-NEXT: v_and_b32_e32 v0, s6, v4 2226; GCN1-NEXT: v_not_b32_e32 v3, v0 2227; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc 2228; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2229; GCN1-NEXT: buffer_wbinvl1_vol 2230; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 2231; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 2232; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] 2233; GCN1-NEXT: s_cbranch_execnz .LBB56_1 2234; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end 2235; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] 2236; GCN1-NEXT: s_setpc_b64 s[30:31] 2237; 2238; GCN2-LABEL: flat_atomic_nand_i32_ret_scalar: 2239; GCN2: ; %bb.0: 2240; GCN2-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) 2241; GCN2-NEXT: v_mov_b32_e32 v0, s4 2242; GCN2-NEXT: v_mov_b32_e32 v1, s5 2243; GCN2-NEXT: flat_load_dword v0, v[0:1] 2244; GCN2-NEXT: v_mov_b32_e32 v1, s4 2245; GCN2-NEXT: s_mov_b64 s[34:35], 0 2246; GCN2-NEXT: v_mov_b32_e32 v2, s5 2247; GCN2-NEXT: .LBB56_1: ; %atomicrmw.start 2248; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 2249; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2250; GCN2-NEXT: v_mov_b32_e32 v4, v0 2251; GCN2-NEXT: v_and_b32_e32 v0, s6, v4 2252; GCN2-NEXT: v_not_b32_e32 v3, v0 2253; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc 2254; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2255; GCN2-NEXT: buffer_wbinvl1_vol 2256; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 2257; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 2258; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] 2259; GCN2-NEXT: s_cbranch_execnz .LBB56_1 2260; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end 2261; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] 2262; GCN2-NEXT: s_setpc_b64 s[30:31] 2263; 2264; GCN3-LABEL: flat_atomic_nand_i32_ret_scalar: 2265; GCN3: ; %bb.0: 2266; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2267; GCN3-NEXT: v_mov_b32_e32 v0, s4 2268; GCN3-NEXT: v_mov_b32_e32 v1, s5 2269; GCN3-NEXT: flat_load_dword v0, v[0:1] 2270; GCN3-NEXT: v_mov_b32_e32 v1, s4 2271; GCN3-NEXT: s_mov_b64 s[34:35], 0 2272; GCN3-NEXT: v_mov_b32_e32 v2, s5 2273; GCN3-NEXT: .LBB56_1: ; %atomicrmw.start 2274; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 2275; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2276; GCN3-NEXT: v_mov_b32_e32 v4, v0 2277; GCN3-NEXT: v_and_b32_e32 v0, s6, v4 2278; GCN3-NEXT: v_not_b32_e32 v3, v0 2279; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc 2280; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2281; GCN3-NEXT: buffer_wbinvl1_vol 2282; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 2283; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 2284; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] 2285; GCN3-NEXT: s_cbranch_execnz .LBB56_1 2286; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end 2287; 
GCN3-NEXT: s_or_b64 exec, exec, s[34:35] 2288; GCN3-NEXT: s_setpc_b64 s[30:31] 2289 %result = atomicrmw nand ptr %ptr, i32 %in seq_cst 2290 ret i32 %result 2291} 2292 2293define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_offset_scalar(ptr inreg %out, i32 inreg %in) { 2294; GCN1-LABEL: flat_atomic_nand_i32_ret_offset_scalar: 2295; GCN1: ; %bb.0: 2296; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2297; GCN1-NEXT: s_add_u32 s34, s4, 16 2298; GCN1-NEXT: s_addc_u32 s35, s5, 0 2299; GCN1-NEXT: v_mov_b32_e32 v1, s34 2300; GCN1-NEXT: v_mov_b32_e32 v2, s35 2301; GCN1-NEXT: flat_load_dword v0, v[1:2] 2302; GCN1-NEXT: s_mov_b64 s[34:35], 0 2303; GCN1-NEXT: .LBB57_1: ; %atomicrmw.start 2304; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 2305; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2306; GCN1-NEXT: v_mov_b32_e32 v4, v0 2307; GCN1-NEXT: v_and_b32_e32 v0, s6, v4 2308; GCN1-NEXT: v_not_b32_e32 v3, v0 2309; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc 2310; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2311; GCN1-NEXT: buffer_wbinvl1_vol 2312; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 2313; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 2314; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] 2315; GCN1-NEXT: s_cbranch_execnz .LBB57_1 2316; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end 2317; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] 2318; GCN1-NEXT: s_setpc_b64 s[30:31] 2319; 2320; GCN2-LABEL: flat_atomic_nand_i32_ret_offset_scalar: 2321; GCN2: ; %bb.0: 2322; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2323; GCN2-NEXT: s_add_u32 s34, s4, 16 2324; GCN2-NEXT: s_addc_u32 s35, s5, 0 2325; GCN2-NEXT: v_mov_b32_e32 v1, s34 2326; GCN2-NEXT: v_mov_b32_e32 v2, s35 2327; GCN2-NEXT: flat_load_dword v0, v[1:2] 2328; GCN2-NEXT: s_mov_b64 s[34:35], 0 2329; GCN2-NEXT: .LBB57_1: ; %atomicrmw.start 2330; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 2331; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2332; GCN2-NEXT: v_mov_b32_e32 v4, v0 2333; GCN2-NEXT: v_and_b32_e32 v0, s6, v4 2334; GCN2-NEXT: 
v_not_b32_e32 v3, v0 2335; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc 2336; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2337; GCN2-NEXT: buffer_wbinvl1_vol 2338; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 2339; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 2340; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] 2341; GCN2-NEXT: s_cbranch_execnz .LBB57_1 2342; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end 2343; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] 2344; GCN2-NEXT: s_setpc_b64 s[30:31] 2345; 2346; GCN3-LABEL: flat_atomic_nand_i32_ret_offset_scalar: 2347; GCN3: ; %bb.0: 2348; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2349; GCN3-NEXT: v_mov_b32_e32 v0, s4 2350; GCN3-NEXT: v_mov_b32_e32 v1, s5 2351; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 2352; GCN3-NEXT: v_mov_b32_e32 v1, s4 2353; GCN3-NEXT: s_mov_b64 s[34:35], 0 2354; GCN3-NEXT: v_mov_b32_e32 v2, s5 2355; GCN3-NEXT: .LBB57_1: ; %atomicrmw.start 2356; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 2357; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2358; GCN3-NEXT: v_mov_b32_e32 v4, v0 2359; GCN3-NEXT: v_and_b32_e32 v0, s6, v4 2360; GCN3-NEXT: v_not_b32_e32 v3, v0 2361; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:16 glc 2362; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2363; GCN3-NEXT: buffer_wbinvl1_vol 2364; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 2365; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 2366; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] 2367; GCN3-NEXT: s_cbranch_execnz .LBB57_1 2368; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end 2369; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] 2370; GCN3-NEXT: s_setpc_b64 s[30:31] 2371 %gep = getelementptr i32, ptr %out, i32 4 2372 %result = atomicrmw nand ptr %gep, i32 %in seq_cst 2373 ret i32 %result 2374} 2375 2376define void @flat_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) { 2377; GCN1-LABEL: flat_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory: 2378; GCN1: ; %bb.0: 2379; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2380; 
GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 2381; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2382; GCN1-NEXT: flat_load_dword v4, v[0:1] 2383; GCN1-NEXT: s_mov_b64 s[4:5], 0 2384; GCN1-NEXT: .LBB58_1: ; %atomicrmw.start 2385; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 2386; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2387; GCN1-NEXT: v_and_b32_e32 v3, v4, v2 2388; GCN1-NEXT: v_not_b32_e32 v3, v3 2389; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 2390; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2391; GCN1-NEXT: buffer_wbinvl1_vol 2392; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 2393; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2394; GCN1-NEXT: v_mov_b32_e32 v4, v3 2395; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] 2396; GCN1-NEXT: s_cbranch_execnz .LBB58_1 2397; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end 2398; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] 2399; GCN1-NEXT: s_setpc_b64 s[30:31] 2400; 2401; GCN2-LABEL: flat_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory: 2402; GCN2: ; %bb.0: 2403; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2404; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 2405; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2406; GCN2-NEXT: flat_load_dword v4, v[0:1] 2407; GCN2-NEXT: s_mov_b64 s[4:5], 0 2408; GCN2-NEXT: .LBB58_1: ; %atomicrmw.start 2409; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 2410; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2411; GCN2-NEXT: v_and_b32_e32 v3, v4, v2 2412; GCN2-NEXT: v_not_b32_e32 v3, v3 2413; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 2414; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2415; GCN2-NEXT: buffer_wbinvl1_vol 2416; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 2417; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2418; GCN2-NEXT: v_mov_b32_e32 v4, v3 2419; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] 2420; GCN2-NEXT: s_cbranch_execnz .LBB58_1 2421; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end 2422; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] 2423; GCN2-NEXT: s_setpc_b64 s[30:31] 2424; 2425; GCN3-LABEL: 
flat_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory: 2426; GCN3: ; %bb.0: 2427; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2428; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 2429; GCN3-NEXT: s_mov_b64 s[4:5], 0 2430; GCN3-NEXT: .LBB58_1: ; %atomicrmw.start 2431; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 2432; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2433; GCN3-NEXT: v_and_b32_e32 v3, v4, v2 2434; GCN3-NEXT: v_not_b32_e32 v3, v3 2435; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc 2436; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2437; GCN3-NEXT: buffer_wbinvl1_vol 2438; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 2439; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2440; GCN3-NEXT: v_mov_b32_e32 v4, v3 2441; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] 2442; GCN3-NEXT: s_cbranch_execnz .LBB58_1 2443; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end 2444; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] 2445; GCN3-NEXT: s_setpc_b64 s[30:31] 2446 %gep = getelementptr i32, ptr %out, i64 4 2447 %tmp0 = atomicrmw nand ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 2448 ret void 2449} 2450 2451define i32 @flat_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) { 2452; GCN1-LABEL: flat_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory: 2453; GCN1: ; %bb.0: 2454; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2455; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0 2456; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 2457; GCN1-NEXT: flat_load_dword v0, v[3:4] 2458; GCN1-NEXT: s_mov_b64 s[4:5], 0 2459; GCN1-NEXT: .LBB59_1: ; %atomicrmw.start 2460; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 2461; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2462; GCN1-NEXT: v_mov_b32_e32 v1, v0 2463; GCN1-NEXT: v_and_b32_e32 v0, v1, v2 2464; GCN1-NEXT: v_not_b32_e32 v0, v0 2465; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 2466; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2467; GCN1-NEXT: buffer_wbinvl1_vol 2468; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 2469; 
GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2470; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] 2471; GCN1-NEXT: s_cbranch_execnz .LBB59_1 2472; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end 2473; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] 2474; GCN1-NEXT: s_setpc_b64 s[30:31] 2475; 2476; GCN2-LABEL: flat_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory: 2477; GCN2: ; %bb.0: 2478; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2479; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0 2480; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 2481; GCN2-NEXT: flat_load_dword v0, v[3:4] 2482; GCN2-NEXT: s_mov_b64 s[4:5], 0 2483; GCN2-NEXT: .LBB59_1: ; %atomicrmw.start 2484; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 2485; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2486; GCN2-NEXT: v_mov_b32_e32 v1, v0 2487; GCN2-NEXT: v_and_b32_e32 v0, v1, v2 2488; GCN2-NEXT: v_not_b32_e32 v0, v0 2489; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 2490; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2491; GCN2-NEXT: buffer_wbinvl1_vol 2492; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 2493; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2494; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] 2495; GCN2-NEXT: s_cbranch_execnz .LBB59_1 2496; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end 2497; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] 2498; GCN2-NEXT: s_setpc_b64 s[30:31] 2499; 2500; GCN3-LABEL: flat_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory: 2501; GCN3: ; %bb.0: 2502; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2503; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 2504; GCN3-NEXT: s_mov_b64 s[4:5], 0 2505; GCN3-NEXT: .LBB59_1: ; %atomicrmw.start 2506; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 2507; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2508; GCN3-NEXT: v_mov_b32_e32 v4, v3 2509; GCN3-NEXT: v_and_b32_e32 v3, v4, v2 2510; GCN3-NEXT: v_not_b32_e32 v3, v3 2511; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc 2512; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2513; GCN3-NEXT: buffer_wbinvl1_vol 2514; GCN3-NEXT: 
v_cmp_eq_u32_e32 vcc, v3, v4 2515; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2516; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] 2517; GCN3-NEXT: s_cbranch_execnz .LBB59_1 2518; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end 2519; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] 2520; GCN3-NEXT: v_mov_b32_e32 v0, v3 2521; GCN3-NEXT: s_setpc_b64 s[30:31] 2522 %gep = getelementptr i32, ptr %out, i64 4 2523 %result = atomicrmw nand ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 2524 ret i32 %result 2525} 2526 2527; --------------------------------------------------------------------- 2528; atomicrmw or 2529; --------------------------------------------------------------------- 2530 2531define void @flat_atomic_or_i32_noret(ptr %ptr, i32 %in) { 2532; GCN1-LABEL: flat_atomic_or_i32_noret: 2533; GCN1: ; %bb.0: 2534; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2535; GCN1-NEXT: flat_atomic_or v[0:1], v2 2536; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2537; GCN1-NEXT: buffer_wbinvl1_vol 2538; GCN1-NEXT: s_setpc_b64 s[30:31] 2539; 2540; GCN2-LABEL: flat_atomic_or_i32_noret: 2541; GCN2: ; %bb.0: 2542; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2543; GCN2-NEXT: flat_atomic_or v[0:1], v2 2544; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2545; GCN2-NEXT: buffer_wbinvl1_vol 2546; GCN2-NEXT: s_setpc_b64 s[30:31] 2547; 2548; GCN3-LABEL: flat_atomic_or_i32_noret: 2549; GCN3: ; %bb.0: 2550; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2551; GCN3-NEXT: flat_atomic_or v[0:1], v2 2552; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2553; GCN3-NEXT: buffer_wbinvl1_vol 2554; GCN3-NEXT: s_setpc_b64 s[30:31] 2555 %tmp0 = atomicrmw or ptr %ptr, i32 %in seq_cst 2556 ret void 2557} 2558 2559define void @flat_atomic_or_i32_noret_offset(ptr %out, i32 %in) { 2560; GCN1-LABEL: flat_atomic_or_i32_noret_offset: 2561; GCN1: ; %bb.0: 2562; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2563; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 2564; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2565; GCN1-NEXT: 
flat_atomic_or v[0:1], v2 2566; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2567; GCN1-NEXT: buffer_wbinvl1_vol 2568; GCN1-NEXT: s_setpc_b64 s[30:31] 2569; 2570; GCN2-LABEL: flat_atomic_or_i32_noret_offset: 2571; GCN2: ; %bb.0: 2572; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2573; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 2574; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2575; GCN2-NEXT: flat_atomic_or v[0:1], v2 2576; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2577; GCN2-NEXT: buffer_wbinvl1_vol 2578; GCN2-NEXT: s_setpc_b64 s[30:31] 2579; 2580; GCN3-LABEL: flat_atomic_or_i32_noret_offset: 2581; GCN3: ; %bb.0: 2582; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2583; GCN3-NEXT: flat_atomic_or v[0:1], v2 offset:16 2584; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2585; GCN3-NEXT: buffer_wbinvl1_vol 2586; GCN3-NEXT: s_setpc_b64 s[30:31] 2587 %gep = getelementptr i32, ptr %out, i32 4 2588 %tmp0 = atomicrmw or ptr %gep, i32 %in seq_cst 2589 ret void 2590} 2591 2592define i32 @flat_atomic_or_i32_ret(ptr %ptr, i32 %in) { 2593; GCN1-LABEL: flat_atomic_or_i32_ret: 2594; GCN1: ; %bb.0: 2595; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2596; GCN1-NEXT: flat_atomic_or v0, v[0:1], v2 glc 2597; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2598; GCN1-NEXT: buffer_wbinvl1_vol 2599; GCN1-NEXT: s_setpc_b64 s[30:31] 2600; 2601; GCN2-LABEL: flat_atomic_or_i32_ret: 2602; GCN2: ; %bb.0: 2603; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2604; GCN2-NEXT: flat_atomic_or v0, v[0:1], v2 glc 2605; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2606; GCN2-NEXT: buffer_wbinvl1_vol 2607; GCN2-NEXT: s_setpc_b64 s[30:31] 2608; 2609; GCN3-LABEL: flat_atomic_or_i32_ret: 2610; GCN3: ; %bb.0: 2611; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2612; GCN3-NEXT: flat_atomic_or v0, v[0:1], v2 glc 2613; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2614; GCN3-NEXT: buffer_wbinvl1_vol 2615; GCN3-NEXT: s_setpc_b64 s[30:31] 2616 %result = atomicrmw or ptr %ptr, i32 %in seq_cst 2617 ret i32 
%result 2618} 2619 2620define i32 @flat_atomic_or_i32_ret_offset(ptr %out, i32 %in) { 2621; GCN1-LABEL: flat_atomic_or_i32_ret_offset: 2622; GCN1: ; %bb.0: 2623; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2624; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 2625; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2626; GCN1-NEXT: flat_atomic_or v0, v[0:1], v2 glc 2627; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2628; GCN1-NEXT: buffer_wbinvl1_vol 2629; GCN1-NEXT: s_setpc_b64 s[30:31] 2630; 2631; GCN2-LABEL: flat_atomic_or_i32_ret_offset: 2632; GCN2: ; %bb.0: 2633; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2634; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 2635; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2636; GCN2-NEXT: flat_atomic_or v0, v[0:1], v2 glc 2637; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2638; GCN2-NEXT: buffer_wbinvl1_vol 2639; GCN2-NEXT: s_setpc_b64 s[30:31] 2640; 2641; GCN3-LABEL: flat_atomic_or_i32_ret_offset: 2642; GCN3: ; %bb.0: 2643; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2644; GCN3-NEXT: flat_atomic_or v0, v[0:1], v2 offset:16 glc 2645; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2646; GCN3-NEXT: buffer_wbinvl1_vol 2647; GCN3-NEXT: s_setpc_b64 s[30:31] 2648 %gep = getelementptr i32, ptr %out, i32 4 2649 %result = atomicrmw or ptr %gep, i32 %in seq_cst 2650 ret i32 %result 2651} 2652 2653define amdgpu_gfx void @flat_atomic_or_i32_noret_scalar(ptr inreg %ptr, i32 inreg %in) { 2654; GCN1-LABEL: flat_atomic_or_i32_noret_scalar: 2655; GCN1: ; %bb.0: 2656; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2657; GCN1-NEXT: v_mov_b32_e32 v0, s4 2658; GCN1-NEXT: v_mov_b32_e32 v1, s5 2659; GCN1-NEXT: v_mov_b32_e32 v2, s6 2660; GCN1-NEXT: flat_atomic_or v[0:1], v2 2661; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2662; GCN1-NEXT: buffer_wbinvl1_vol 2663; GCN1-NEXT: s_setpc_b64 s[30:31] 2664; 2665; GCN2-LABEL: flat_atomic_or_i32_noret_scalar: 2666; GCN2: ; %bb.0: 2667; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2668; GCN2-NEXT: 
v_mov_b32_e32 v0, s4 2669; GCN2-NEXT: v_mov_b32_e32 v1, s5 2670; GCN2-NEXT: v_mov_b32_e32 v2, s6 2671; GCN2-NEXT: flat_atomic_or v[0:1], v2 2672; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2673; GCN2-NEXT: buffer_wbinvl1_vol 2674; GCN2-NEXT: s_setpc_b64 s[30:31] 2675; 2676; GCN3-LABEL: flat_atomic_or_i32_noret_scalar: 2677; GCN3: ; %bb.0: 2678; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2679; GCN3-NEXT: v_mov_b32_e32 v0, s4 2680; GCN3-NEXT: v_mov_b32_e32 v1, s5 2681; GCN3-NEXT: v_mov_b32_e32 v2, s6 2682; GCN3-NEXT: flat_atomic_or v[0:1], v2 2683; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2684; GCN3-NEXT: buffer_wbinvl1_vol 2685; GCN3-NEXT: s_setpc_b64 s[30:31] 2686 %tmp0 = atomicrmw or ptr %ptr, i32 %in seq_cst 2687 ret void 2688} 2689 2690define amdgpu_gfx void @flat_atomic_or_i32_noret_offset_scalar(ptr inreg %out, i32 inreg %in) { 2691; GCN1-LABEL: flat_atomic_or_i32_noret_offset_scalar: 2692; GCN1: ; %bb.0: 2693; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2694; GCN1-NEXT: s_add_u32 s34, s4, 16 2695; GCN1-NEXT: s_addc_u32 s35, s5, 0 2696; GCN1-NEXT: v_mov_b32_e32 v0, s34 2697; GCN1-NEXT: v_mov_b32_e32 v1, s35 2698; GCN1-NEXT: v_mov_b32_e32 v2, s6 2699; GCN1-NEXT: flat_atomic_or v[0:1], v2 2700; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2701; GCN1-NEXT: buffer_wbinvl1_vol 2702; GCN1-NEXT: s_setpc_b64 s[30:31] 2703; 2704; GCN2-LABEL: flat_atomic_or_i32_noret_offset_scalar: 2705; GCN2: ; %bb.0: 2706; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2707; GCN2-NEXT: s_add_u32 s34, s4, 16 2708; GCN2-NEXT: s_addc_u32 s35, s5, 0 2709; GCN2-NEXT: v_mov_b32_e32 v0, s34 2710; GCN2-NEXT: v_mov_b32_e32 v1, s35 2711; GCN2-NEXT: v_mov_b32_e32 v2, s6 2712; GCN2-NEXT: flat_atomic_or v[0:1], v2 2713; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2714; GCN2-NEXT: buffer_wbinvl1_vol 2715; GCN2-NEXT: s_setpc_b64 s[30:31] 2716; 2717; GCN3-LABEL: flat_atomic_or_i32_noret_offset_scalar: 2718; GCN3: ; %bb.0: 2719; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2720; 
GCN3-NEXT: v_mov_b32_e32 v0, s4 2721; GCN3-NEXT: v_mov_b32_e32 v1, s5 2722; GCN3-NEXT: v_mov_b32_e32 v2, s6 2723; GCN3-NEXT: flat_atomic_or v[0:1], v2 offset:16 2724; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2725; GCN3-NEXT: buffer_wbinvl1_vol 2726; GCN3-NEXT: s_setpc_b64 s[30:31] 2727 %gep = getelementptr i32, ptr %out, i32 4 2728 %tmp0 = atomicrmw or ptr %gep, i32 %in seq_cst 2729 ret void 2730} 2731 2732define amdgpu_gfx i32 @flat_atomic_or_i32_ret_scalar(ptr inreg %ptr, i32 inreg %in) { 2733; GCN1-LABEL: flat_atomic_or_i32_ret_scalar: 2734; GCN1: ; %bb.0: 2735; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2736; GCN1-NEXT: v_mov_b32_e32 v0, s4 2737; GCN1-NEXT: v_mov_b32_e32 v1, s5 2738; GCN1-NEXT: v_mov_b32_e32 v2, s6 2739; GCN1-NEXT: flat_atomic_or v0, v[0:1], v2 glc 2740; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2741; GCN1-NEXT: buffer_wbinvl1_vol 2742; GCN1-NEXT: s_setpc_b64 s[30:31] 2743; 2744; GCN2-LABEL: flat_atomic_or_i32_ret_scalar: 2745; GCN2: ; %bb.0: 2746; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2747; GCN2-NEXT: v_mov_b32_e32 v0, s4 2748; GCN2-NEXT: v_mov_b32_e32 v1, s5 2749; GCN2-NEXT: v_mov_b32_e32 v2, s6 2750; GCN2-NEXT: flat_atomic_or v0, v[0:1], v2 glc 2751; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2752; GCN2-NEXT: buffer_wbinvl1_vol 2753; GCN2-NEXT: s_setpc_b64 s[30:31] 2754; 2755; GCN3-LABEL: flat_atomic_or_i32_ret_scalar: 2756; GCN3: ; %bb.0: 2757; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2758; GCN3-NEXT: v_mov_b32_e32 v0, s4 2759; GCN3-NEXT: v_mov_b32_e32 v1, s5 2760; GCN3-NEXT: v_mov_b32_e32 v2, s6 2761; GCN3-NEXT: flat_atomic_or v0, v[0:1], v2 glc 2762; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2763; GCN3-NEXT: buffer_wbinvl1_vol 2764; GCN3-NEXT: s_setpc_b64 s[30:31] 2765 %result = atomicrmw or ptr %ptr, i32 %in seq_cst 2766 ret i32 %result 2767} 2768 2769define amdgpu_gfx i32 @flat_atomic_or_i32_ret_offset_scalar(ptr inreg %out, i32 inreg %in) { 2770; GCN1-LABEL: flat_atomic_or_i32_ret_offset_scalar: 2771; GCN1: ; 
%bb.0: 2772; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2773; GCN1-NEXT: s_add_u32 s34, s4, 16 2774; GCN1-NEXT: s_addc_u32 s35, s5, 0 2775; GCN1-NEXT: v_mov_b32_e32 v0, s34 2776; GCN1-NEXT: v_mov_b32_e32 v1, s35 2777; GCN1-NEXT: v_mov_b32_e32 v2, s6 2778; GCN1-NEXT: flat_atomic_or v0, v[0:1], v2 glc 2779; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2780; GCN1-NEXT: buffer_wbinvl1_vol 2781; GCN1-NEXT: s_setpc_b64 s[30:31] 2782; 2783; GCN2-LABEL: flat_atomic_or_i32_ret_offset_scalar: 2784; GCN2: ; %bb.0: 2785; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2786; GCN2-NEXT: s_add_u32 s34, s4, 16 2787; GCN2-NEXT: s_addc_u32 s35, s5, 0 2788; GCN2-NEXT: v_mov_b32_e32 v0, s34 2789; GCN2-NEXT: v_mov_b32_e32 v1, s35 2790; GCN2-NEXT: v_mov_b32_e32 v2, s6 2791; GCN2-NEXT: flat_atomic_or v0, v[0:1], v2 glc 2792; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2793; GCN2-NEXT: buffer_wbinvl1_vol 2794; GCN2-NEXT: s_setpc_b64 s[30:31] 2795; 2796; GCN3-LABEL: flat_atomic_or_i32_ret_offset_scalar: 2797; GCN3: ; %bb.0: 2798; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2799; GCN3-NEXT: v_mov_b32_e32 v0, s4 2800; GCN3-NEXT: v_mov_b32_e32 v1, s5 2801; GCN3-NEXT: v_mov_b32_e32 v2, s6 2802; GCN3-NEXT: flat_atomic_or v0, v[0:1], v2 offset:16 glc 2803; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2804; GCN3-NEXT: buffer_wbinvl1_vol 2805; GCN3-NEXT: s_setpc_b64 s[30:31] 2806 %gep = getelementptr i32, ptr %out, i32 4 2807 %result = atomicrmw or ptr %gep, i32 %in seq_cst 2808 ret i32 %result 2809} 2810 2811define void @flat_atomic_or_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) { 2812; GCN1-LABEL: flat_atomic_or_i32_noret_offset__amdgpu_no_remote_memory: 2813; GCN1: ; %bb.0: 2814; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2815; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 2816; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2817; GCN1-NEXT: flat_atomic_or v[0:1], v2 2818; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2819; GCN1-NEXT: buffer_wbinvl1_vol 2820; GCN1-NEXT: 
s_setpc_b64 s[30:31] 2821; 2822; GCN2-LABEL: flat_atomic_or_i32_noret_offset__amdgpu_no_remote_memory: 2823; GCN2: ; %bb.0: 2824; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2825; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 2826; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2827; GCN2-NEXT: flat_atomic_or v[0:1], v2 2828; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2829; GCN2-NEXT: buffer_wbinvl1_vol 2830; GCN2-NEXT: s_setpc_b64 s[30:31] 2831; 2832; GCN3-LABEL: flat_atomic_or_i32_noret_offset__amdgpu_no_remote_memory: 2833; GCN3: ; %bb.0: 2834; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2835; GCN3-NEXT: flat_atomic_or v[0:1], v2 offset:16 2836; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2837; GCN3-NEXT: buffer_wbinvl1_vol 2838; GCN3-NEXT: s_setpc_b64 s[30:31] 2839 %gep = getelementptr i32, ptr %out, i64 4 2840 %tmp0 = atomicrmw or ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 2841 ret void 2842} 2843 2844define i32 @flat_atomic_or_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) { 2845; GCN1-LABEL: flat_atomic_or_i32_ret_offset__amdgpu_no_remote_memory: 2846; GCN1: ; %bb.0: 2847; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2848; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 2849; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2850; GCN1-NEXT: flat_atomic_or v0, v[0:1], v2 glc 2851; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2852; GCN1-NEXT: buffer_wbinvl1_vol 2853; GCN1-NEXT: s_setpc_b64 s[30:31] 2854; 2855; GCN2-LABEL: flat_atomic_or_i32_ret_offset__amdgpu_no_remote_memory: 2856; GCN2: ; %bb.0: 2857; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2858; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 2859; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2860; GCN2-NEXT: flat_atomic_or v0, v[0:1], v2 glc 2861; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2862; GCN2-NEXT: buffer_wbinvl1_vol 2863; GCN2-NEXT: s_setpc_b64 s[30:31] 2864; 2865; GCN3-LABEL: flat_atomic_or_i32_ret_offset__amdgpu_no_remote_memory: 2866; GCN3: ; %bb.0: 2867; GCN3-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2868; GCN3-NEXT: flat_atomic_or v0, v[0:1], v2 offset:16 glc 2869; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2870; GCN3-NEXT: buffer_wbinvl1_vol 2871; GCN3-NEXT: s_setpc_b64 s[30:31] 2872 %gep = getelementptr i32, ptr %out, i64 4 2873 %result = atomicrmw or ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 2874 ret i32 %result 2875} 2876 2877; --------------------------------------------------------------------- 2878; atomicrmw xor 2879; --------------------------------------------------------------------- 2880 2881define void @flat_atomic_xor_i32_noret(ptr %ptr, i32 %in) { 2882; GCN1-LABEL: flat_atomic_xor_i32_noret: 2883; GCN1: ; %bb.0: 2884; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2885; GCN1-NEXT: flat_atomic_xor v[0:1], v2 2886; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2887; GCN1-NEXT: buffer_wbinvl1_vol 2888; GCN1-NEXT: s_setpc_b64 s[30:31] 2889; 2890; GCN2-LABEL: flat_atomic_xor_i32_noret: 2891; GCN2: ; %bb.0: 2892; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2893; GCN2-NEXT: flat_atomic_xor v[0:1], v2 2894; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2895; GCN2-NEXT: buffer_wbinvl1_vol 2896; GCN2-NEXT: s_setpc_b64 s[30:31] 2897; 2898; GCN3-LABEL: flat_atomic_xor_i32_noret: 2899; GCN3: ; %bb.0: 2900; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2901; GCN3-NEXT: flat_atomic_xor v[0:1], v2 2902; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2903; GCN3-NEXT: buffer_wbinvl1_vol 2904; GCN3-NEXT: s_setpc_b64 s[30:31] 2905 %tmp0 = atomicrmw xor ptr %ptr, i32 %in seq_cst 2906 ret void 2907} 2908 2909define void @flat_atomic_xor_i32_noret_offset(ptr %out, i32 %in) { 2910; GCN1-LABEL: flat_atomic_xor_i32_noret_offset: 2911; GCN1: ; %bb.0: 2912; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2913; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 2914; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2915; GCN1-NEXT: flat_atomic_xor v[0:1], v2 2916; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2917; GCN1-NEXT: 
buffer_wbinvl1_vol 2918; GCN1-NEXT: s_setpc_b64 s[30:31] 2919; 2920; GCN2-LABEL: flat_atomic_xor_i32_noret_offset: 2921; GCN2: ; %bb.0: 2922; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2923; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 2924; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2925; GCN2-NEXT: flat_atomic_xor v[0:1], v2 2926; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2927; GCN2-NEXT: buffer_wbinvl1_vol 2928; GCN2-NEXT: s_setpc_b64 s[30:31] 2929; 2930; GCN3-LABEL: flat_atomic_xor_i32_noret_offset: 2931; GCN3: ; %bb.0: 2932; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2933; GCN3-NEXT: flat_atomic_xor v[0:1], v2 offset:16 2934; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2935; GCN3-NEXT: buffer_wbinvl1_vol 2936; GCN3-NEXT: s_setpc_b64 s[30:31] 2937 %gep = getelementptr i32, ptr %out, i32 4 2938 %tmp0 = atomicrmw xor ptr %gep, i32 %in seq_cst 2939 ret void 2940} 2941 2942define i32 @flat_atomic_xor_i32_ret(ptr %ptr, i32 %in) { 2943; GCN1-LABEL: flat_atomic_xor_i32_ret: 2944; GCN1: ; %bb.0: 2945; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2946; GCN1-NEXT: flat_atomic_xor v0, v[0:1], v2 glc 2947; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2948; GCN1-NEXT: buffer_wbinvl1_vol 2949; GCN1-NEXT: s_setpc_b64 s[30:31] 2950; 2951; GCN2-LABEL: flat_atomic_xor_i32_ret: 2952; GCN2: ; %bb.0: 2953; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2954; GCN2-NEXT: flat_atomic_xor v0, v[0:1], v2 glc 2955; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2956; GCN2-NEXT: buffer_wbinvl1_vol 2957; GCN2-NEXT: s_setpc_b64 s[30:31] 2958; 2959; GCN3-LABEL: flat_atomic_xor_i32_ret: 2960; GCN3: ; %bb.0: 2961; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2962; GCN3-NEXT: flat_atomic_xor v0, v[0:1], v2 glc 2963; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2964; GCN3-NEXT: buffer_wbinvl1_vol 2965; GCN3-NEXT: s_setpc_b64 s[30:31] 2966 %result = atomicrmw xor ptr %ptr, i32 %in seq_cst 2967 ret i32 %result 2968} 2969 2970define i32 @flat_atomic_xor_i32_ret_offset(ptr %out, i32 
%in) { 2971; GCN1-LABEL: flat_atomic_xor_i32_ret_offset: 2972; GCN1: ; %bb.0: 2973; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2974; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 2975; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2976; GCN1-NEXT: flat_atomic_xor v0, v[0:1], v2 glc 2977; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2978; GCN1-NEXT: buffer_wbinvl1_vol 2979; GCN1-NEXT: s_setpc_b64 s[30:31] 2980; 2981; GCN2-LABEL: flat_atomic_xor_i32_ret_offset: 2982; GCN2: ; %bb.0: 2983; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2984; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 2985; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2986; GCN2-NEXT: flat_atomic_xor v0, v[0:1], v2 glc 2987; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2988; GCN2-NEXT: buffer_wbinvl1_vol 2989; GCN2-NEXT: s_setpc_b64 s[30:31] 2990; 2991; GCN3-LABEL: flat_atomic_xor_i32_ret_offset: 2992; GCN3: ; %bb.0: 2993; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2994; GCN3-NEXT: flat_atomic_xor v0, v[0:1], v2 offset:16 glc 2995; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2996; GCN3-NEXT: buffer_wbinvl1_vol 2997; GCN3-NEXT: s_setpc_b64 s[30:31] 2998 %gep = getelementptr i32, ptr %out, i32 4 2999 %result = atomicrmw xor ptr %gep, i32 %in seq_cst 3000 ret i32 %result 3001} 3002 3003define amdgpu_gfx void @flat_atomic_xor_i32_noret_scalar(ptr inreg %ptr, i32 inreg %in) { 3004; GCN1-LABEL: flat_atomic_xor_i32_noret_scalar: 3005; GCN1: ; %bb.0: 3006; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3007; GCN1-NEXT: v_mov_b32_e32 v0, s4 3008; GCN1-NEXT: v_mov_b32_e32 v1, s5 3009; GCN1-NEXT: v_mov_b32_e32 v2, s6 3010; GCN1-NEXT: flat_atomic_xor v[0:1], v2 3011; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3012; GCN1-NEXT: buffer_wbinvl1_vol 3013; GCN1-NEXT: s_setpc_b64 s[30:31] 3014; 3015; GCN2-LABEL: flat_atomic_xor_i32_noret_scalar: 3016; GCN2: ; %bb.0: 3017; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3018; GCN2-NEXT: v_mov_b32_e32 v0, s4 3019; GCN2-NEXT: v_mov_b32_e32 v1, s5 3020; GCN2-NEXT: 
v_mov_b32_e32 v2, s6 3021; GCN2-NEXT: flat_atomic_xor v[0:1], v2 3022; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3023; GCN2-NEXT: buffer_wbinvl1_vol 3024; GCN2-NEXT: s_setpc_b64 s[30:31] 3025; 3026; GCN3-LABEL: flat_atomic_xor_i32_noret_scalar: 3027; GCN3: ; %bb.0: 3028; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3029; GCN3-NEXT: v_mov_b32_e32 v0, s4 3030; GCN3-NEXT: v_mov_b32_e32 v1, s5 3031; GCN3-NEXT: v_mov_b32_e32 v2, s6 3032; GCN3-NEXT: flat_atomic_xor v[0:1], v2 3033; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3034; GCN3-NEXT: buffer_wbinvl1_vol 3035; GCN3-NEXT: s_setpc_b64 s[30:31] 3036 %tmp0 = atomicrmw xor ptr %ptr, i32 %in seq_cst 3037 ret void 3038} 3039 3040define amdgpu_gfx void @flat_atomic_xor_i32_noret_offset_scalar(ptr inreg %out, i32 inreg %in) { 3041; GCN1-LABEL: flat_atomic_xor_i32_noret_offset_scalar: 3042; GCN1: ; %bb.0: 3043; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3044; GCN1-NEXT: s_add_u32 s34, s4, 16 3045; GCN1-NEXT: s_addc_u32 s35, s5, 0 3046; GCN1-NEXT: v_mov_b32_e32 v0, s34 3047; GCN1-NEXT: v_mov_b32_e32 v1, s35 3048; GCN1-NEXT: v_mov_b32_e32 v2, s6 3049; GCN1-NEXT: flat_atomic_xor v[0:1], v2 3050; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3051; GCN1-NEXT: buffer_wbinvl1_vol 3052; GCN1-NEXT: s_setpc_b64 s[30:31] 3053; 3054; GCN2-LABEL: flat_atomic_xor_i32_noret_offset_scalar: 3055; GCN2: ; %bb.0: 3056; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3057; GCN2-NEXT: s_add_u32 s34, s4, 16 3058; GCN2-NEXT: s_addc_u32 s35, s5, 0 3059; GCN2-NEXT: v_mov_b32_e32 v0, s34 3060; GCN2-NEXT: v_mov_b32_e32 v1, s35 3061; GCN2-NEXT: v_mov_b32_e32 v2, s6 3062; GCN2-NEXT: flat_atomic_xor v[0:1], v2 3063; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3064; GCN2-NEXT: buffer_wbinvl1_vol 3065; GCN2-NEXT: s_setpc_b64 s[30:31] 3066; 3067; GCN3-LABEL: flat_atomic_xor_i32_noret_offset_scalar: 3068; GCN3: ; %bb.0: 3069; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3070; GCN3-NEXT: v_mov_b32_e32 v0, s4 3071; GCN3-NEXT: v_mov_b32_e32 v1, s5 
3072; GCN3-NEXT: v_mov_b32_e32 v2, s6 3073; GCN3-NEXT: flat_atomic_xor v[0:1], v2 offset:16 3074; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3075; GCN3-NEXT: buffer_wbinvl1_vol 3076; GCN3-NEXT: s_setpc_b64 s[30:31] 3077 %gep = getelementptr i32, ptr %out, i32 4 3078 %tmp0 = atomicrmw xor ptr %gep, i32 %in seq_cst 3079 ret void 3080} 3081 3082define amdgpu_gfx i32 @flat_atomic_xor_i32_ret_scalar(ptr inreg %ptr, i32 inreg %in) { 3083; GCN1-LABEL: flat_atomic_xor_i32_ret_scalar: 3084; GCN1: ; %bb.0: 3085; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3086; GCN1-NEXT: v_mov_b32_e32 v0, s4 3087; GCN1-NEXT: v_mov_b32_e32 v1, s5 3088; GCN1-NEXT: v_mov_b32_e32 v2, s6 3089; GCN1-NEXT: flat_atomic_xor v0, v[0:1], v2 glc 3090; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3091; GCN1-NEXT: buffer_wbinvl1_vol 3092; GCN1-NEXT: s_setpc_b64 s[30:31] 3093; 3094; GCN2-LABEL: flat_atomic_xor_i32_ret_scalar: 3095; GCN2: ; %bb.0: 3096; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3097; GCN2-NEXT: v_mov_b32_e32 v0, s4 3098; GCN2-NEXT: v_mov_b32_e32 v1, s5 3099; GCN2-NEXT: v_mov_b32_e32 v2, s6 3100; GCN2-NEXT: flat_atomic_xor v0, v[0:1], v2 glc 3101; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3102; GCN2-NEXT: buffer_wbinvl1_vol 3103; GCN2-NEXT: s_setpc_b64 s[30:31] 3104; 3105; GCN3-LABEL: flat_atomic_xor_i32_ret_scalar: 3106; GCN3: ; %bb.0: 3107; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3108; GCN3-NEXT: v_mov_b32_e32 v0, s4 3109; GCN3-NEXT: v_mov_b32_e32 v1, s5 3110; GCN3-NEXT: v_mov_b32_e32 v2, s6 3111; GCN3-NEXT: flat_atomic_xor v0, v[0:1], v2 glc 3112; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3113; GCN3-NEXT: buffer_wbinvl1_vol 3114; GCN3-NEXT: s_setpc_b64 s[30:31] 3115 %result = atomicrmw xor ptr %ptr, i32 %in seq_cst 3116 ret i32 %result 3117} 3118 3119define amdgpu_gfx i32 @flat_atomic_xor_i32_ret_offset_scalar(ptr inreg %out, i32 inreg %in) { 3120; GCN1-LABEL: flat_atomic_xor_i32_ret_offset_scalar: 3121; GCN1: ; %bb.0: 3122; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s34, s4, 16
; GCN1-NEXT: s_addc_u32 s35, s5, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s34
; GCN1-NEXT: v_mov_b32_e32 v1, s35
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_xor_i32_ret_offset_scalar:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s34, s4, 16
; GCN2-NEXT: s_addc_u32 s35, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s34
; GCN2-NEXT: v_mov_b32_e32 v1, s35
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_xor_i32_ret_offset_scalar:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_atomic_xor v0, v[0:1], v2 offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i32, ptr %out, i32 4
  %result = atomicrmw xor ptr %gep, i32 %in seq_cst
  ret i32 %result
}

; Non-returning seq_cst `atomicrmw xor` through a flat pointer, addressed at
; i32 element 4 of %out and tagged with !amdgpu.no.remote.memory. The result
; (%tmp0) is deliberately unused so the no-return form of the instruction is
; exercised. For all three check prefixes the expectation is a single
; flat_atomic_xor with no compare-and-swap loop; GCN1/GCN2 materialise the
; 16-byte displacement with an explicit 64-bit add, GCN3 uses `offset:16`.
; The name follows the flat_atomic_<op>_i32_{noret,ret}_offset__<metadata>
; scheme used by the sibling tests; keep the LABEL lines in sync with it.
define void @flat_atomic_xor_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
; GCN1-LABEL: flat_atomic_xor_i32_noret_offset__amdgpu_no_remote_memory:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN1-NEXT: flat_atomic_xor v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_xor_i32_noret_offset__amdgpu_no_remote_memory:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN2-NEXT: flat_atomic_xor v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_xor_i32_noret_offset__amdgpu_no_remote_memory:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_xor v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i32, ptr %out, i64 4
  %tmp0 = atomicrmw xor ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
  ret void
}

define i32 @flat_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
; GCN1-LABEL: flat_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN1-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN2-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
;
GCN3-NEXT: flat_atomic_xor v0, v[0:1], v2 offset:16 glc 3219; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3220; GCN3-NEXT: buffer_wbinvl1_vol 3221; GCN3-NEXT: s_setpc_b64 s[30:31] 3222 %gep = getelementptr i32, ptr %out, i64 4 3223 %result = atomicrmw xor ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 3224 ret i32 %result 3225} 3226 3227; --------------------------------------------------------------------- 3228; atomicrmw max 3229; --------------------------------------------------------------------- 3230 3231define void @flat_atomic_max_i32_noret(ptr %ptr, i32 %in) { 3232; GCN1-LABEL: flat_atomic_max_i32_noret: 3233; GCN1: ; %bb.0: 3234; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3235; GCN1-NEXT: flat_load_dword v4, v[0:1] 3236; GCN1-NEXT: s_mov_b64 s[4:5], 0 3237; GCN1-NEXT: .LBB80_1: ; %atomicrmw.start 3238; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 3239; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3240; GCN1-NEXT: v_max_i32_e32 v3, v4, v2 3241; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 3242; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3243; GCN1-NEXT: buffer_wbinvl1_vol 3244; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 3245; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 3246; GCN1-NEXT: v_mov_b32_e32 v4, v3 3247; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] 3248; GCN1-NEXT: s_cbranch_execnz .LBB80_1 3249; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end 3250; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] 3251; GCN1-NEXT: s_setpc_b64 s[30:31] 3252; 3253; GCN2-LABEL: flat_atomic_max_i32_noret: 3254; GCN2: ; %bb.0: 3255; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3256; GCN2-NEXT: flat_load_dword v4, v[0:1] 3257; GCN2-NEXT: s_mov_b64 s[4:5], 0 3258; GCN2-NEXT: .LBB80_1: ; %atomicrmw.start 3259; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 3260; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3261; GCN2-NEXT: v_max_i32_e32 v3, v4, v2 3262; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 3263; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3264; GCN2-NEXT: 
buffer_wbinvl1_vol 3265; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 3266; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 3267; GCN2-NEXT: v_mov_b32_e32 v4, v3 3268; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] 3269; GCN2-NEXT: s_cbranch_execnz .LBB80_1 3270; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end 3271; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] 3272; GCN2-NEXT: s_setpc_b64 s[30:31] 3273; 3274; GCN3-LABEL: flat_atomic_max_i32_noret: 3275; GCN3: ; %bb.0: 3276; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3277; GCN3-NEXT: flat_load_dword v4, v[0:1] 3278; GCN3-NEXT: s_mov_b64 s[4:5], 0 3279; GCN3-NEXT: .LBB80_1: ; %atomicrmw.start 3280; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 3281; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3282; GCN3-NEXT: v_max_i32_e32 v3, v4, v2 3283; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 3284; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3285; GCN3-NEXT: buffer_wbinvl1_vol 3286; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 3287; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 3288; GCN3-NEXT: v_mov_b32_e32 v4, v3 3289; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] 3290; GCN3-NEXT: s_cbranch_execnz .LBB80_1 3291; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end 3292; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] 3293; GCN3-NEXT: s_setpc_b64 s[30:31] 3294 %tmp0 = atomicrmw max ptr %ptr, i32 %in seq_cst 3295 ret void 3296} 3297 3298define void @flat_atomic_max_i32_noret_offset(ptr %out, i32 %in) { 3299; GCN1-LABEL: flat_atomic_max_i32_noret_offset: 3300; GCN1: ; %bb.0: 3301; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3302; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 3303; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3304; GCN1-NEXT: flat_load_dword v4, v[0:1] 3305; GCN1-NEXT: s_mov_b64 s[4:5], 0 3306; GCN1-NEXT: .LBB81_1: ; %atomicrmw.start 3307; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 3308; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3309; GCN1-NEXT: v_max_i32_e32 v3, v4, v2 3310; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 3311; GCN1-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) 3312; GCN1-NEXT: buffer_wbinvl1_vol 3313; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 3314; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 3315; GCN1-NEXT: v_mov_b32_e32 v4, v3 3316; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] 3317; GCN1-NEXT: s_cbranch_execnz .LBB81_1 3318; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end 3319; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] 3320; GCN1-NEXT: s_setpc_b64 s[30:31] 3321; 3322; GCN2-LABEL: flat_atomic_max_i32_noret_offset: 3323; GCN2: ; %bb.0: 3324; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3325; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 3326; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3327; GCN2-NEXT: flat_load_dword v4, v[0:1] 3328; GCN2-NEXT: s_mov_b64 s[4:5], 0 3329; GCN2-NEXT: .LBB81_1: ; %atomicrmw.start 3330; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 3331; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3332; GCN2-NEXT: v_max_i32_e32 v3, v4, v2 3333; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 3334; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3335; GCN2-NEXT: buffer_wbinvl1_vol 3336; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 3337; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 3338; GCN2-NEXT: v_mov_b32_e32 v4, v3 3339; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] 3340; GCN2-NEXT: s_cbranch_execnz .LBB81_1 3341; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end 3342; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] 3343; GCN2-NEXT: s_setpc_b64 s[30:31] 3344; 3345; GCN3-LABEL: flat_atomic_max_i32_noret_offset: 3346; GCN3: ; %bb.0: 3347; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3348; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 3349; GCN3-NEXT: s_mov_b64 s[4:5], 0 3350; GCN3-NEXT: .LBB81_1: ; %atomicrmw.start 3351; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 3352; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3353; GCN3-NEXT: v_max_i32_e32 v3, v4, v2 3354; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc 3355; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3356; GCN3-NEXT: buffer_wbinvl1_vol 3357; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, 
v4 3358; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 3359; GCN3-NEXT: v_mov_b32_e32 v4, v3 3360; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] 3361; GCN3-NEXT: s_cbranch_execnz .LBB81_1 3362; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end 3363; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] 3364; GCN3-NEXT: s_setpc_b64 s[30:31] 3365 %gep = getelementptr i32, ptr %out, i32 4 3366 %tmp0 = atomicrmw max ptr %gep, i32 %in seq_cst 3367 ret void 3368} 3369 3370define i32 @flat_atomic_max_i32_ret(ptr %ptr, i32 %in) { 3371; GCN1-LABEL: flat_atomic_max_i32_ret: 3372; GCN1: ; %bb.0: 3373; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3374; GCN1-NEXT: flat_load_dword v3, v[0:1] 3375; GCN1-NEXT: s_mov_b64 s[4:5], 0 3376; GCN1-NEXT: .LBB82_1: ; %atomicrmw.start 3377; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 3378; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3379; GCN1-NEXT: v_mov_b32_e32 v4, v3 3380; GCN1-NEXT: v_max_i32_e32 v3, v4, v2 3381; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 3382; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3383; GCN1-NEXT: buffer_wbinvl1_vol 3384; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 3385; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 3386; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] 3387; GCN1-NEXT: s_cbranch_execnz .LBB82_1 3388; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end 3389; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] 3390; GCN1-NEXT: v_mov_b32_e32 v0, v3 3391; GCN1-NEXT: s_setpc_b64 s[30:31] 3392; 3393; GCN2-LABEL: flat_atomic_max_i32_ret: 3394; GCN2: ; %bb.0: 3395; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3396; GCN2-NEXT: flat_load_dword v3, v[0:1] 3397; GCN2-NEXT: s_mov_b64 s[4:5], 0 3398; GCN2-NEXT: .LBB82_1: ; %atomicrmw.start 3399; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 3400; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3401; GCN2-NEXT: v_mov_b32_e32 v4, v3 3402; GCN2-NEXT: v_max_i32_e32 v3, v4, v2 3403; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 3404; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3405; GCN2-NEXT: buffer_wbinvl1_vol 3406; 
GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 3407; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 3408; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] 3409; GCN2-NEXT: s_cbranch_execnz .LBB82_1 3410; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end 3411; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] 3412; GCN2-NEXT: v_mov_b32_e32 v0, v3 3413; GCN2-NEXT: s_setpc_b64 s[30:31] 3414; 3415; GCN3-LABEL: flat_atomic_max_i32_ret: 3416; GCN3: ; %bb.0: 3417; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3418; GCN3-NEXT: flat_load_dword v3, v[0:1] 3419; GCN3-NEXT: s_mov_b64 s[4:5], 0 3420; GCN3-NEXT: .LBB82_1: ; %atomicrmw.start 3421; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 3422; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3423; GCN3-NEXT: v_mov_b32_e32 v4, v3 3424; GCN3-NEXT: v_max_i32_e32 v3, v4, v2 3425; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 3426; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3427; GCN3-NEXT: buffer_wbinvl1_vol 3428; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 3429; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 3430; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] 3431; GCN3-NEXT: s_cbranch_execnz .LBB82_1 3432; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end 3433; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] 3434; GCN3-NEXT: v_mov_b32_e32 v0, v3 3435; GCN3-NEXT: s_setpc_b64 s[30:31] 3436 %result = atomicrmw max ptr %ptr, i32 %in seq_cst 3437 ret i32 %result 3438} 3439 3440define i32 @flat_atomic_max_i32_ret_offset(ptr %out, i32 %in) { 3441; GCN1-LABEL: flat_atomic_max_i32_ret_offset: 3442; GCN1: ; %bb.0: 3443; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3444; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0 3445; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 3446; GCN1-NEXT: flat_load_dword v0, v[3:4] 3447; GCN1-NEXT: s_mov_b64 s[4:5], 0 3448; GCN1-NEXT: .LBB83_1: ; %atomicrmw.start 3449; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 3450; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3451; GCN1-NEXT: v_mov_b32_e32 v1, v0 3452; GCN1-NEXT: v_max_i32_e32 v0, v1, v2 3453; GCN1-NEXT: flat_atomic_cmpswap v0, 
v[3:4], v[0:1] glc 3454; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3455; GCN1-NEXT: buffer_wbinvl1_vol 3456; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 3457; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 3458; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] 3459; GCN1-NEXT: s_cbranch_execnz .LBB83_1 3460; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end 3461; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] 3462; GCN1-NEXT: s_setpc_b64 s[30:31] 3463; 3464; GCN2-LABEL: flat_atomic_max_i32_ret_offset: 3465; GCN2: ; %bb.0: 3466; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3467; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0 3468; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 3469; GCN2-NEXT: flat_load_dword v0, v[3:4] 3470; GCN2-NEXT: s_mov_b64 s[4:5], 0 3471; GCN2-NEXT: .LBB83_1: ; %atomicrmw.start 3472; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 3473; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3474; GCN2-NEXT: v_mov_b32_e32 v1, v0 3475; GCN2-NEXT: v_max_i32_e32 v0, v1, v2 3476; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 3477; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3478; GCN2-NEXT: buffer_wbinvl1_vol 3479; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 3480; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 3481; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] 3482; GCN2-NEXT: s_cbranch_execnz .LBB83_1 3483; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end 3484; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] 3485; GCN2-NEXT: s_setpc_b64 s[30:31] 3486; 3487; GCN3-LABEL: flat_atomic_max_i32_ret_offset: 3488; GCN3: ; %bb.0: 3489; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3490; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 3491; GCN3-NEXT: s_mov_b64 s[4:5], 0 3492; GCN3-NEXT: .LBB83_1: ; %atomicrmw.start 3493; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 3494; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3495; GCN3-NEXT: v_mov_b32_e32 v4, v3 3496; GCN3-NEXT: v_max_i32_e32 v3, v4, v2 3497; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc 3498; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3499; GCN3-NEXT: 
buffer_wbinvl1_vol 3500; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 3501; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 3502; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] 3503; GCN3-NEXT: s_cbranch_execnz .LBB83_1 3504; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end 3505; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] 3506; GCN3-NEXT: v_mov_b32_e32 v0, v3 3507; GCN3-NEXT: s_setpc_b64 s[30:31] 3508 %gep = getelementptr i32, ptr %out, i32 4 3509 %result = atomicrmw max ptr %gep, i32 %in seq_cst 3510 ret i32 %result 3511} 3512 3513define amdgpu_gfx void @flat_atomic_max_i32_noret_scalar(ptr inreg %ptr, i32 inreg %in) { 3514; GCN1-LABEL: flat_atomic_max_i32_noret_scalar: 3515; GCN1: ; %bb.0: 3516; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3517; GCN1-NEXT: v_mov_b32_e32 v0, s4 3518; GCN1-NEXT: v_mov_b32_e32 v1, s5 3519; GCN1-NEXT: flat_load_dword v3, v[0:1] 3520; GCN1-NEXT: s_mov_b64 s[34:35], 0 3521; GCN1-NEXT: .LBB84_1: ; %atomicrmw.start 3522; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 3523; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3524; GCN1-NEXT: v_max_i32_e32 v2, s6, v3 3525; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3526; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3527; GCN1-NEXT: buffer_wbinvl1_vol 3528; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 3529; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 3530; GCN1-NEXT: v_mov_b32_e32 v3, v2 3531; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] 3532; GCN1-NEXT: s_cbranch_execnz .LBB84_1 3533; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end 3534; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] 3535; GCN1-NEXT: s_setpc_b64 s[30:31] 3536; 3537; GCN2-LABEL: flat_atomic_max_i32_noret_scalar: 3538; GCN2: ; %bb.0: 3539; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3540; GCN2-NEXT: v_mov_b32_e32 v0, s4 3541; GCN2-NEXT: v_mov_b32_e32 v1, s5 3542; GCN2-NEXT: flat_load_dword v3, v[0:1] 3543; GCN2-NEXT: s_mov_b64 s[34:35], 0 3544; GCN2-NEXT: .LBB84_1: ; %atomicrmw.start 3545; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 3546; GCN2-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) 3547; GCN2-NEXT: v_max_i32_e32 v2, s6, v3 3548; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3549; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3550; GCN2-NEXT: buffer_wbinvl1_vol 3551; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 3552; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 3553; GCN2-NEXT: v_mov_b32_e32 v3, v2 3554; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] 3555; GCN2-NEXT: s_cbranch_execnz .LBB84_1 3556; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end 3557; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] 3558; GCN2-NEXT: s_setpc_b64 s[30:31] 3559; 3560; GCN3-LABEL: flat_atomic_max_i32_noret_scalar: 3561; GCN3: ; %bb.0: 3562; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3563; GCN3-NEXT: v_mov_b32_e32 v0, s4 3564; GCN3-NEXT: v_mov_b32_e32 v1, s5 3565; GCN3-NEXT: flat_load_dword v3, v[0:1] 3566; GCN3-NEXT: s_mov_b64 s[34:35], 0 3567; GCN3-NEXT: .LBB84_1: ; %atomicrmw.start 3568; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 3569; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3570; GCN3-NEXT: v_max_i32_e32 v2, s6, v3 3571; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3572; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3573; GCN3-NEXT: buffer_wbinvl1_vol 3574; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 3575; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 3576; GCN3-NEXT: v_mov_b32_e32 v3, v2 3577; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] 3578; GCN3-NEXT: s_cbranch_execnz .LBB84_1 3579; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end 3580; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] 3581; GCN3-NEXT: s_setpc_b64 s[30:31] 3582 %tmp0 = atomicrmw max ptr %ptr, i32 %in seq_cst 3583 ret void 3584} 3585 3586define amdgpu_gfx void @flat_atomic_max_i32_noret_offset_scalar(ptr inreg %out, i32 inreg %in) { 3587; GCN1-LABEL: flat_atomic_max_i32_noret_offset_scalar: 3588; GCN1: ; %bb.0: 3589; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3590; GCN1-NEXT: s_add_u32 s34, s4, 16 3591; GCN1-NEXT: s_addc_u32 s35, s5, 0 3592; GCN1-NEXT: v_mov_b32_e32 v0, s34 3593; GCN1-NEXT: v_mov_b32_e32 v1, 
s35 3594; GCN1-NEXT: flat_load_dword v3, v[0:1] 3595; GCN1-NEXT: s_mov_b64 s[34:35], 0 3596; GCN1-NEXT: .LBB85_1: ; %atomicrmw.start 3597; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 3598; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3599; GCN1-NEXT: v_max_i32_e32 v2, s6, v3 3600; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3601; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3602; GCN1-NEXT: buffer_wbinvl1_vol 3603; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 3604; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 3605; GCN1-NEXT: v_mov_b32_e32 v3, v2 3606; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] 3607; GCN1-NEXT: s_cbranch_execnz .LBB85_1 3608; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end 3609; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] 3610; GCN1-NEXT: s_setpc_b64 s[30:31] 3611; 3612; GCN2-LABEL: flat_atomic_max_i32_noret_offset_scalar: 3613; GCN2: ; %bb.0: 3614; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3615; GCN2-NEXT: s_add_u32 s34, s4, 16 3616; GCN2-NEXT: s_addc_u32 s35, s5, 0 3617; GCN2-NEXT: v_mov_b32_e32 v0, s34 3618; GCN2-NEXT: v_mov_b32_e32 v1, s35 3619; GCN2-NEXT: flat_load_dword v3, v[0:1] 3620; GCN2-NEXT: s_mov_b64 s[34:35], 0 3621; GCN2-NEXT: .LBB85_1: ; %atomicrmw.start 3622; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 3623; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3624; GCN2-NEXT: v_max_i32_e32 v2, s6, v3 3625; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3626; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3627; GCN2-NEXT: buffer_wbinvl1_vol 3628; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 3629; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 3630; GCN2-NEXT: v_mov_b32_e32 v3, v2 3631; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] 3632; GCN2-NEXT: s_cbranch_execnz .LBB85_1 3633; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end 3634; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] 3635; GCN2-NEXT: s_setpc_b64 s[30:31] 3636; 3637; GCN3-LABEL: flat_atomic_max_i32_noret_offset_scalar: 3638; GCN3: ; %bb.0: 3639; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3640; 
GCN3-NEXT: v_mov_b32_e32 v0, s4 3641; GCN3-NEXT: v_mov_b32_e32 v1, s5 3642; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 3643; GCN3-NEXT: s_mov_b64 s[34:35], 0 3644; GCN3-NEXT: .LBB85_1: ; %atomicrmw.start 3645; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 3646; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3647; GCN3-NEXT: v_max_i32_e32 v2, s6, v3 3648; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 3649; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3650; GCN3-NEXT: buffer_wbinvl1_vol 3651; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 3652; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 3653; GCN3-NEXT: v_mov_b32_e32 v3, v2 3654; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] 3655; GCN3-NEXT: s_cbranch_execnz .LBB85_1 3656; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end 3657; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] 3658; GCN3-NEXT: s_setpc_b64 s[30:31] 3659 %gep = getelementptr i32, ptr %out, i32 4 3660 %tmp0 = atomicrmw max ptr %gep, i32 %in seq_cst 3661 ret void 3662} 3663 3664define amdgpu_gfx i32 @flat_atomic_max_i32_ret_scalar(ptr inreg %ptr, i32 inreg %in) { 3665; GCN1-LABEL: flat_atomic_max_i32_ret_scalar: 3666; GCN1: ; %bb.0: 3667; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3668; GCN1-NEXT: v_mov_b32_e32 v0, s4 3669; GCN1-NEXT: v_mov_b32_e32 v1, s5 3670; GCN1-NEXT: flat_load_dword v0, v[0:1] 3671; GCN1-NEXT: v_mov_b32_e32 v1, s4 3672; GCN1-NEXT: s_mov_b64 s[34:35], 0 3673; GCN1-NEXT: v_mov_b32_e32 v2, s5 3674; GCN1-NEXT: .LBB86_1: ; %atomicrmw.start 3675; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 3676; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3677; GCN1-NEXT: v_mov_b32_e32 v4, v0 3678; GCN1-NEXT: v_max_i32_e32 v3, s6, v4 3679; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc 3680; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3681; GCN1-NEXT: buffer_wbinvl1_vol 3682; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 3683; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 3684; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] 3685; GCN1-NEXT: s_cbranch_execnz .LBB86_1 
3686; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end 3687; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] 3688; GCN1-NEXT: s_setpc_b64 s[30:31] 3689; 3690; GCN2-LABEL: flat_atomic_max_i32_ret_scalar: 3691; GCN2: ; %bb.0: 3692; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3693; GCN2-NEXT: v_mov_b32_e32 v0, s4 3694; GCN2-NEXT: v_mov_b32_e32 v1, s5 3695; GCN2-NEXT: flat_load_dword v0, v[0:1] 3696; GCN2-NEXT: v_mov_b32_e32 v1, s4 3697; GCN2-NEXT: s_mov_b64 s[34:35], 0 3698; GCN2-NEXT: v_mov_b32_e32 v2, s5 3699; GCN2-NEXT: .LBB86_1: ; %atomicrmw.start 3700; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 3701; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3702; GCN2-NEXT: v_mov_b32_e32 v4, v0 3703; GCN2-NEXT: v_max_i32_e32 v3, s6, v4 3704; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc 3705; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3706; GCN2-NEXT: buffer_wbinvl1_vol 3707; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 3708; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 3709; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] 3710; GCN2-NEXT: s_cbranch_execnz .LBB86_1 3711; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end 3712; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] 3713; GCN2-NEXT: s_setpc_b64 s[30:31] 3714; 3715; GCN3-LABEL: flat_atomic_max_i32_ret_scalar: 3716; GCN3: ; %bb.0: 3717; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3718; GCN3-NEXT: v_mov_b32_e32 v0, s4 3719; GCN3-NEXT: v_mov_b32_e32 v1, s5 3720; GCN3-NEXT: flat_load_dword v0, v[0:1] 3721; GCN3-NEXT: v_mov_b32_e32 v1, s4 3722; GCN3-NEXT: s_mov_b64 s[34:35], 0 3723; GCN3-NEXT: v_mov_b32_e32 v2, s5 3724; GCN3-NEXT: .LBB86_1: ; %atomicrmw.start 3725; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 3726; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3727; GCN3-NEXT: v_mov_b32_e32 v4, v0 3728; GCN3-NEXT: v_max_i32_e32 v3, s6, v4 3729; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc 3730; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3731; GCN3-NEXT: buffer_wbinvl1_vol 3732; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 3733; GCN3-NEXT: s_or_b64 
s[34:35], vcc, s[34:35] 3734; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] 3735; GCN3-NEXT: s_cbranch_execnz .LBB86_1 3736; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end 3737; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] 3738; GCN3-NEXT: s_setpc_b64 s[30:31] 3739 %result = atomicrmw max ptr %ptr, i32 %in seq_cst 3740 ret i32 %result 3741} 3742 3743define amdgpu_gfx i32 @flat_atomic_max_i32_ret_offset_scalar(ptr inreg %out, i32 inreg %in) { 3744; GCN1-LABEL: flat_atomic_max_i32_ret_offset_scalar: 3745; GCN1: ; %bb.0: 3746; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3747; GCN1-NEXT: s_add_u32 s34, s4, 16 3748; GCN1-NEXT: s_addc_u32 s35, s5, 0 3749; GCN1-NEXT: v_mov_b32_e32 v1, s34 3750; GCN1-NEXT: v_mov_b32_e32 v2, s35 3751; GCN1-NEXT: flat_load_dword v0, v[1:2] 3752; GCN1-NEXT: s_mov_b64 s[34:35], 0 3753; GCN1-NEXT: .LBB87_1: ; %atomicrmw.start 3754; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 3755; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3756; GCN1-NEXT: v_mov_b32_e32 v4, v0 3757; GCN1-NEXT: v_max_i32_e32 v3, s6, v4 3758; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc 3759; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3760; GCN1-NEXT: buffer_wbinvl1_vol 3761; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 3762; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 3763; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] 3764; GCN1-NEXT: s_cbranch_execnz .LBB87_1 3765; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end 3766; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] 3767; GCN1-NEXT: s_setpc_b64 s[30:31] 3768; 3769; GCN2-LABEL: flat_atomic_max_i32_ret_offset_scalar: 3770; GCN2: ; %bb.0: 3771; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3772; GCN2-NEXT: s_add_u32 s34, s4, 16 3773; GCN2-NEXT: s_addc_u32 s35, s5, 0 3774; GCN2-NEXT: v_mov_b32_e32 v1, s34 3775; GCN2-NEXT: v_mov_b32_e32 v2, s35 3776; GCN2-NEXT: flat_load_dword v0, v[1:2] 3777; GCN2-NEXT: s_mov_b64 s[34:35], 0 3778; GCN2-NEXT: .LBB87_1: ; %atomicrmw.start 3779; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 3780; GCN2-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) 3781; GCN2-NEXT: v_mov_b32_e32 v4, v0 3782; GCN2-NEXT: v_max_i32_e32 v3, s6, v4 3783; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc 3784; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3785; GCN2-NEXT: buffer_wbinvl1_vol 3786; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 3787; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 3788; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] 3789; GCN2-NEXT: s_cbranch_execnz .LBB87_1 3790; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end 3791; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] 3792; GCN2-NEXT: s_setpc_b64 s[30:31] 3793; 3794; GCN3-LABEL: flat_atomic_max_i32_ret_offset_scalar: 3795; GCN3: ; %bb.0: 3796; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3797; GCN3-NEXT: v_mov_b32_e32 v0, s4 3798; GCN3-NEXT: v_mov_b32_e32 v1, s5 3799; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 3800; GCN3-NEXT: v_mov_b32_e32 v1, s4 3801; GCN3-NEXT: s_mov_b64 s[34:35], 0 3802; GCN3-NEXT: v_mov_b32_e32 v2, s5 3803; GCN3-NEXT: .LBB87_1: ; %atomicrmw.start 3804; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 3805; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3806; GCN3-NEXT: v_mov_b32_e32 v4, v0 3807; GCN3-NEXT: v_max_i32_e32 v3, s6, v4 3808; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:16 glc 3809; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3810; GCN3-NEXT: buffer_wbinvl1_vol 3811; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 3812; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 3813; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] 3814; GCN3-NEXT: s_cbranch_execnz .LBB87_1 3815; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end 3816; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] 3817; GCN3-NEXT: s_setpc_b64 s[30:31] 3818 %gep = getelementptr i32, ptr %out, i32 4 3819 %result = atomicrmw max ptr %gep, i32 %in seq_cst 3820 ret i32 %result 3821} 3822 3823define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 %index) { 3824; GCN1-LABEL: atomic_max_i32_addr64_offset: 3825; GCN1: ; %bb.0: ; %entry 3826; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 
3827; GCN1-NEXT: s_waitcnt lgkmcnt(0) 3828; GCN1-NEXT: s_ashr_i32 s5, s3, 31 3829; GCN1-NEXT: s_mov_b32 s4, s3 3830; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 3831; GCN1-NEXT: s_add_u32 s0, s0, s4 3832; GCN1-NEXT: s_addc_u32 s1, s1, s5 3833; GCN1-NEXT: s_add_u32 s0, s0, 16 3834; GCN1-NEXT: s_addc_u32 s1, s1, 0 3835; GCN1-NEXT: v_mov_b32_e32 v0, s0 3836; GCN1-NEXT: v_mov_b32_e32 v1, s1 3837; GCN1-NEXT: flat_load_dword v3, v[0:1] 3838; GCN1-NEXT: s_mov_b64 s[0:1], 0 3839; GCN1-NEXT: .LBB88_1: ; %atomicrmw.start 3840; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 3841; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3842; GCN1-NEXT: v_max_i32_e32 v2, s2, v3 3843; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3844; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3845; GCN1-NEXT: buffer_wbinvl1_vol 3846; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 3847; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 3848; GCN1-NEXT: v_mov_b32_e32 v3, v2 3849; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] 3850; GCN1-NEXT: s_cbranch_execnz .LBB88_1 3851; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end 3852; GCN1-NEXT: s_endpgm 3853; 3854; GCN2-LABEL: atomic_max_i32_addr64_offset: 3855; GCN2: ; %bb.0: ; %entry 3856; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3857; GCN2-NEXT: s_waitcnt lgkmcnt(0) 3858; GCN2-NEXT: s_ashr_i32 s5, s3, 31 3859; GCN2-NEXT: s_mov_b32 s4, s3 3860; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 3861; GCN2-NEXT: s_add_u32 s0, s0, s4 3862; GCN2-NEXT: s_addc_u32 s1, s1, s5 3863; GCN2-NEXT: s_add_u32 s0, s0, 16 3864; GCN2-NEXT: s_addc_u32 s1, s1, 0 3865; GCN2-NEXT: v_mov_b32_e32 v0, s0 3866; GCN2-NEXT: v_mov_b32_e32 v1, s1 3867; GCN2-NEXT: flat_load_dword v3, v[0:1] 3868; GCN2-NEXT: s_mov_b64 s[0:1], 0 3869; GCN2-NEXT: .LBB88_1: ; %atomicrmw.start 3870; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 3871; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3872; GCN2-NEXT: v_max_i32_e32 v2, s2, v3 3873; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3874; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3875; 
GCN2-NEXT: buffer_wbinvl1_vol 3876; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 3877; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 3878; GCN2-NEXT: v_mov_b32_e32 v3, v2 3879; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] 3880; GCN2-NEXT: s_cbranch_execnz .LBB88_1 3881; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end 3882; GCN2-NEXT: s_endpgm 3883; 3884; GCN3-LABEL: atomic_max_i32_addr64_offset: 3885; GCN3: ; %bb.0: ; %entry 3886; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3887; GCN3-NEXT: s_waitcnt lgkmcnt(0) 3888; GCN3-NEXT: s_ashr_i32 s5, s3, 31 3889; GCN3-NEXT: s_mov_b32 s4, s3 3890; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 3891; GCN3-NEXT: s_add_u32 s0, s0, s4 3892; GCN3-NEXT: s_addc_u32 s1, s1, s5 3893; GCN3-NEXT: v_mov_b32_e32 v0, s0 3894; GCN3-NEXT: v_mov_b32_e32 v1, s1 3895; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 3896; GCN3-NEXT: s_mov_b64 s[0:1], 0 3897; GCN3-NEXT: .LBB88_1: ; %atomicrmw.start 3898; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 3899; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3900; GCN3-NEXT: v_max_i32_e32 v2, s2, v3 3901; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 3902; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3903; GCN3-NEXT: buffer_wbinvl1_vol 3904; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 3905; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 3906; GCN3-NEXT: v_mov_b32_e32 v3, v2 3907; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] 3908; GCN3-NEXT: s_cbranch_execnz .LBB88_1 3909; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end 3910; GCN3-NEXT: s_endpgm 3911entry: 3912 %ptr = getelementptr i32, ptr %out, i32 %index 3913 %gep = getelementptr i32, ptr %ptr, i32 4 3914 %tmp0 = atomicrmw max ptr %gep, i32 %in seq_cst 3915 ret void 3916} 3917 3918define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i32 %index) { 3919; GCN1-LABEL: atomic_max_i32_ret_addr64_offset: 3920; GCN1: ; %bb.0: ; %entry 3921; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd 3922; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 3923; GCN1-NEXT: s_waitcnt 
lgkmcnt(0) 3924; GCN1-NEXT: s_ashr_i32 s5, s7, 31 3925; GCN1-NEXT: s_mov_b32 s4, s7 3926; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 3927; GCN1-NEXT: s_add_u32 s0, s0, s4 3928; GCN1-NEXT: s_addc_u32 s1, s1, s5 3929; GCN1-NEXT: s_add_u32 s0, s0, 16 3930; GCN1-NEXT: s_addc_u32 s1, s1, 0 3931; GCN1-NEXT: v_mov_b32_e32 v0, s0 3932; GCN1-NEXT: v_mov_b32_e32 v1, s1 3933; GCN1-NEXT: flat_load_dword v2, v[0:1] 3934; GCN1-NEXT: s_mov_b64 s[0:1], 0 3935; GCN1-NEXT: .LBB89_1: ; %atomicrmw.start 3936; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 3937; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3938; GCN1-NEXT: v_mov_b32_e32 v3, v2 3939; GCN1-NEXT: v_max_i32_e32 v2, s6, v3 3940; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3941; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3942; GCN1-NEXT: buffer_wbinvl1_vol 3943; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 3944; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 3945; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] 3946; GCN1-NEXT: s_cbranch_execnz .LBB89_1 3947; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end 3948; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] 3949; GCN1-NEXT: v_mov_b32_e32 v0, s2 3950; GCN1-NEXT: v_mov_b32_e32 v1, s3 3951; GCN1-NEXT: flat_store_dword v[0:1], v2 3952; GCN1-NEXT: s_endpgm 3953; 3954; GCN2-LABEL: atomic_max_i32_ret_addr64_offset: 3955; GCN2: ; %bb.0: ; %entry 3956; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 3957; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3958; GCN2-NEXT: s_waitcnt lgkmcnt(0) 3959; GCN2-NEXT: s_ashr_i32 s5, s7, 31 3960; GCN2-NEXT: s_mov_b32 s4, s7 3961; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 3962; GCN2-NEXT: s_add_u32 s0, s0, s4 3963; GCN2-NEXT: s_addc_u32 s1, s1, s5 3964; GCN2-NEXT: s_add_u32 s0, s0, 16 3965; GCN2-NEXT: s_addc_u32 s1, s1, 0 3966; GCN2-NEXT: v_mov_b32_e32 v0, s0 3967; GCN2-NEXT: v_mov_b32_e32 v1, s1 3968; GCN2-NEXT: flat_load_dword v2, v[0:1] 3969; GCN2-NEXT: s_mov_b64 s[0:1], 0 3970; GCN2-NEXT: .LBB89_1: ; %atomicrmw.start 3971; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 3972; 
GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3973; GCN2-NEXT: v_mov_b32_e32 v3, v2 3974; GCN2-NEXT: v_max_i32_e32 v2, s6, v3 3975; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3976; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3977; GCN2-NEXT: buffer_wbinvl1_vol 3978; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 3979; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 3980; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] 3981; GCN2-NEXT: s_cbranch_execnz .LBB89_1 3982; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end 3983; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] 3984; GCN2-NEXT: v_mov_b32_e32 v0, s2 3985; GCN2-NEXT: v_mov_b32_e32 v1, s3 3986; GCN2-NEXT: flat_store_dword v[0:1], v2 3987; GCN2-NEXT: s_endpgm 3988; 3989; GCN3-LABEL: atomic_max_i32_ret_addr64_offset: 3990; GCN3: ; %bb.0: ; %entry 3991; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 3992; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3993; GCN3-NEXT: s_waitcnt lgkmcnt(0) 3994; GCN3-NEXT: s_ashr_i32 s5, s7, 31 3995; GCN3-NEXT: s_mov_b32 s4, s7 3996; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 3997; GCN3-NEXT: s_add_u32 s0, s0, s4 3998; GCN3-NEXT: s_addc_u32 s1, s1, s5 3999; GCN3-NEXT: v_mov_b32_e32 v0, s0 4000; GCN3-NEXT: v_mov_b32_e32 v1, s1 4001; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 4002; GCN3-NEXT: s_mov_b64 s[0:1], 0 4003; GCN3-NEXT: .LBB89_1: ; %atomicrmw.start 4004; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 4005; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4006; GCN3-NEXT: v_mov_b32_e32 v3, v2 4007; GCN3-NEXT: v_max_i32_e32 v2, s6, v3 4008; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 4009; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4010; GCN3-NEXT: buffer_wbinvl1_vol 4011; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 4012; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 4013; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] 4014; GCN3-NEXT: s_cbranch_execnz .LBB89_1 4015; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end 4016; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] 4017; GCN3-NEXT: v_mov_b32_e32 v0, s2 4018; GCN3-NEXT: v_mov_b32_e32 v1, 
s3 4019; GCN3-NEXT: flat_store_dword v[0:1], v2 4020; GCN3-NEXT: s_endpgm 4021entry: 4022 %ptr = getelementptr i32, ptr %out, i32 %index 4023 %gep = getelementptr i32, ptr %ptr, i32 4 4024 %tmp0 = atomicrmw max ptr %gep, i32 %in seq_cst 4025 store i32 %tmp0, ptr %out2 4026 ret void 4027} 4028 4029define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) { 4030; GCN1-LABEL: atomic_max_i32_addr64: 4031; GCN1: ; %bb.0: ; %entry 4032; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 4033; GCN1-NEXT: s_waitcnt lgkmcnt(0) 4034; GCN1-NEXT: s_ashr_i32 s5, s3, 31 4035; GCN1-NEXT: s_mov_b32 s4, s3 4036; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 4037; GCN1-NEXT: s_add_u32 s0, s0, s4 4038; GCN1-NEXT: s_addc_u32 s1, s1, s5 4039; GCN1-NEXT: v_mov_b32_e32 v0, s0 4040; GCN1-NEXT: v_mov_b32_e32 v1, s1 4041; GCN1-NEXT: flat_load_dword v3, v[0:1] 4042; GCN1-NEXT: s_mov_b64 s[0:1], 0 4043; GCN1-NEXT: .LBB90_1: ; %atomicrmw.start 4044; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 4045; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4046; GCN1-NEXT: v_max_i32_e32 v2, s2, v3 4047; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4048; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4049; GCN1-NEXT: buffer_wbinvl1_vol 4050; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 4051; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 4052; GCN1-NEXT: v_mov_b32_e32 v3, v2 4053; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] 4054; GCN1-NEXT: s_cbranch_execnz .LBB90_1 4055; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end 4056; GCN1-NEXT: s_endpgm 4057; 4058; GCN2-LABEL: atomic_max_i32_addr64: 4059; GCN2: ; %bb.0: ; %entry 4060; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 4061; GCN2-NEXT: s_waitcnt lgkmcnt(0) 4062; GCN2-NEXT: s_ashr_i32 s5, s3, 31 4063; GCN2-NEXT: s_mov_b32 s4, s3 4064; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 4065; GCN2-NEXT: s_add_u32 s0, s0, s4 4066; GCN2-NEXT: s_addc_u32 s1, s1, s5 4067; GCN2-NEXT: v_mov_b32_e32 v0, s0 4068; GCN2-NEXT: v_mov_b32_e32 v1, s1 4069; GCN2-NEXT: flat_load_dword v3, v[0:1] 
4070; GCN2-NEXT: s_mov_b64 s[0:1], 0 4071; GCN2-NEXT: .LBB90_1: ; %atomicrmw.start 4072; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 4073; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4074; GCN2-NEXT: v_max_i32_e32 v2, s2, v3 4075; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4076; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4077; GCN2-NEXT: buffer_wbinvl1_vol 4078; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 4079; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 4080; GCN2-NEXT: v_mov_b32_e32 v3, v2 4081; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] 4082; GCN2-NEXT: s_cbranch_execnz .LBB90_1 4083; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end 4084; GCN2-NEXT: s_endpgm 4085; 4086; GCN3-LABEL: atomic_max_i32_addr64: 4087; GCN3: ; %bb.0: ; %entry 4088; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 4089; GCN3-NEXT: s_waitcnt lgkmcnt(0) 4090; GCN3-NEXT: s_ashr_i32 s5, s3, 31 4091; GCN3-NEXT: s_mov_b32 s4, s3 4092; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 4093; GCN3-NEXT: s_add_u32 s0, s0, s4 4094; GCN3-NEXT: s_addc_u32 s1, s1, s5 4095; GCN3-NEXT: v_mov_b32_e32 v0, s0 4096; GCN3-NEXT: v_mov_b32_e32 v1, s1 4097; GCN3-NEXT: flat_load_dword v3, v[0:1] 4098; GCN3-NEXT: s_mov_b64 s[0:1], 0 4099; GCN3-NEXT: .LBB90_1: ; %atomicrmw.start 4100; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 4101; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4102; GCN3-NEXT: v_max_i32_e32 v2, s2, v3 4103; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4104; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4105; GCN3-NEXT: buffer_wbinvl1_vol 4106; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 4107; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 4108; GCN3-NEXT: v_mov_b32_e32 v3, v2 4109; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] 4110; GCN3-NEXT: s_cbranch_execnz .LBB90_1 4111; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end 4112; GCN3-NEXT: s_endpgm 4113entry: 4114 %ptr = getelementptr i32, ptr %out, i32 %index 4115 %tmp0 = atomicrmw max ptr %ptr, i32 %in seq_cst 4116 ret void 4117} 4118 4119define amdgpu_kernel void 
@atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i32 %index) { 4120; GCN1-LABEL: atomic_max_i32_ret_addr64: 4121; GCN1: ; %bb.0: ; %entry 4122; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd 4123; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 4124; GCN1-NEXT: s_waitcnt lgkmcnt(0) 4125; GCN1-NEXT: s_ashr_i32 s5, s7, 31 4126; GCN1-NEXT: s_mov_b32 s4, s7 4127; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 4128; GCN1-NEXT: s_add_u32 s0, s0, s4 4129; GCN1-NEXT: s_addc_u32 s1, s1, s5 4130; GCN1-NEXT: v_mov_b32_e32 v0, s0 4131; GCN1-NEXT: v_mov_b32_e32 v1, s1 4132; GCN1-NEXT: flat_load_dword v2, v[0:1] 4133; GCN1-NEXT: s_mov_b64 s[0:1], 0 4134; GCN1-NEXT: .LBB91_1: ; %atomicrmw.start 4135; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 4136; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4137; GCN1-NEXT: v_mov_b32_e32 v3, v2 4138; GCN1-NEXT: v_max_i32_e32 v2, s6, v3 4139; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4140; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4141; GCN1-NEXT: buffer_wbinvl1_vol 4142; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 4143; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 4144; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] 4145; GCN1-NEXT: s_cbranch_execnz .LBB91_1 4146; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end 4147; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] 4148; GCN1-NEXT: v_mov_b32_e32 v0, s2 4149; GCN1-NEXT: v_mov_b32_e32 v1, s3 4150; GCN1-NEXT: flat_store_dword v[0:1], v2 4151; GCN1-NEXT: s_endpgm 4152; 4153; GCN2-LABEL: atomic_max_i32_ret_addr64: 4154; GCN2: ; %bb.0: ; %entry 4155; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 4156; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 4157; GCN2-NEXT: s_waitcnt lgkmcnt(0) 4158; GCN2-NEXT: s_ashr_i32 s5, s7, 31 4159; GCN2-NEXT: s_mov_b32 s4, s7 4160; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 4161; GCN2-NEXT: s_add_u32 s0, s0, s4 4162; GCN2-NEXT: s_addc_u32 s1, s1, s5 4163; GCN2-NEXT: v_mov_b32_e32 v0, s0 4164; GCN2-NEXT: v_mov_b32_e32 v1, s1 4165; GCN2-NEXT: flat_load_dword v2, v[0:1] 4166; GCN2-NEXT: s_mov_b64 
s[0:1], 0 4167; GCN2-NEXT: .LBB91_1: ; %atomicrmw.start 4168; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 4169; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4170; GCN2-NEXT: v_mov_b32_e32 v3, v2 4171; GCN2-NEXT: v_max_i32_e32 v2, s6, v3 4172; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4173; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4174; GCN2-NEXT: buffer_wbinvl1_vol 4175; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 4176; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 4177; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] 4178; GCN2-NEXT: s_cbranch_execnz .LBB91_1 4179; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end 4180; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] 4181; GCN2-NEXT: v_mov_b32_e32 v0, s2 4182; GCN2-NEXT: v_mov_b32_e32 v1, s3 4183; GCN2-NEXT: flat_store_dword v[0:1], v2 4184; GCN2-NEXT: s_endpgm 4185; 4186; GCN3-LABEL: atomic_max_i32_ret_addr64: 4187; GCN3: ; %bb.0: ; %entry 4188; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 4189; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 4190; GCN3-NEXT: s_waitcnt lgkmcnt(0) 4191; GCN3-NEXT: s_ashr_i32 s5, s7, 31 4192; GCN3-NEXT: s_mov_b32 s4, s7 4193; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 4194; GCN3-NEXT: s_add_u32 s0, s0, s4 4195; GCN3-NEXT: s_addc_u32 s1, s1, s5 4196; GCN3-NEXT: v_mov_b32_e32 v0, s0 4197; GCN3-NEXT: v_mov_b32_e32 v1, s1 4198; GCN3-NEXT: flat_load_dword v2, v[0:1] 4199; GCN3-NEXT: s_mov_b64 s[0:1], 0 4200; GCN3-NEXT: .LBB91_1: ; %atomicrmw.start 4201; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 4202; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4203; GCN3-NEXT: v_mov_b32_e32 v3, v2 4204; GCN3-NEXT: v_max_i32_e32 v2, s6, v3 4205; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4206; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4207; GCN3-NEXT: buffer_wbinvl1_vol 4208; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 4209; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 4210; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] 4211; GCN3-NEXT: s_cbranch_execnz .LBB91_1 4212; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end 4213; GCN3-NEXT: s_or_b64 
exec, exec, s[0:1] 4214; GCN3-NEXT: v_mov_b32_e32 v0, s2 4215; GCN3-NEXT: v_mov_b32_e32 v1, s3 4216; GCN3-NEXT: flat_store_dword v[0:1], v2 4217; GCN3-NEXT: s_endpgm 4218entry: 4219 %ptr = getelementptr i32, ptr %out, i32 %index 4220 %tmp0 = atomicrmw max ptr %ptr, i32 %in seq_cst 4221 store i32 %tmp0, ptr %out2 4222 ret void 4223} 4224 4225define void @flat_max_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) { 4226; GCN1-LABEL: flat_max_i32_noret_offset__amdgpu_no_remote_memory: 4227; GCN1: ; %bb.0: 4228; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4229; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 4230; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 4231; GCN1-NEXT: flat_load_dword v4, v[0:1] 4232; GCN1-NEXT: s_mov_b64 s[4:5], 0 4233; GCN1-NEXT: .LBB92_1: ; %atomicrmw.start 4234; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 4235; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4236; GCN1-NEXT: v_max_i32_e32 v3, v4, v2 4237; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 4238; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4239; GCN1-NEXT: buffer_wbinvl1_vol 4240; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 4241; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 4242; GCN1-NEXT: v_mov_b32_e32 v4, v3 4243; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] 4244; GCN1-NEXT: s_cbranch_execnz .LBB92_1 4245; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end 4246; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] 4247; GCN1-NEXT: s_setpc_b64 s[30:31] 4248; 4249; GCN2-LABEL: flat_max_i32_noret_offset__amdgpu_no_remote_memory: 4250; GCN2: ; %bb.0: 4251; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4252; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 4253; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 4254; GCN2-NEXT: flat_load_dword v4, v[0:1] 4255; GCN2-NEXT: s_mov_b64 s[4:5], 0 4256; GCN2-NEXT: .LBB92_1: ; %atomicrmw.start 4257; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 4258; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4259; GCN2-NEXT: v_max_i32_e32 v3, v4, v2 4260; GCN2-NEXT: flat_atomic_cmpswap v3, 
v[0:1], v[3:4] glc 4261; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4262; GCN2-NEXT: buffer_wbinvl1_vol 4263; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 4264; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 4265; GCN2-NEXT: v_mov_b32_e32 v4, v3 4266; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] 4267; GCN2-NEXT: s_cbranch_execnz .LBB92_1 4268; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end 4269; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] 4270; GCN2-NEXT: s_setpc_b64 s[30:31] 4271; 4272; GCN3-LABEL: flat_max_i32_noret_offset__amdgpu_no_remote_memory: 4273; GCN3: ; %bb.0: 4274; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4275; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 4276; GCN3-NEXT: s_mov_b64 s[4:5], 0 4277; GCN3-NEXT: .LBB92_1: ; %atomicrmw.start 4278; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 4279; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4280; GCN3-NEXT: v_max_i32_e32 v3, v4, v2 4281; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc 4282; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4283; GCN3-NEXT: buffer_wbinvl1_vol 4284; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 4285; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 4286; GCN3-NEXT: v_mov_b32_e32 v4, v3 4287; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] 4288; GCN3-NEXT: s_cbranch_execnz .LBB92_1 4289; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end 4290; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] 4291; GCN3-NEXT: s_setpc_b64 s[30:31] 4292 %gep = getelementptr i32, ptr %out, i64 4 4293 %tmp0 = atomicrmw max ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 4294 ret void 4295} 4296 4297define i32 @flat_atomic_max_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) { 4298; GCN1-LABEL: flat_atomic_max_i32_ret_offset__amdgpu_no_remote_memory: 4299; GCN1: ; %bb.0: 4300; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4301; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0 4302; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 4303; GCN1-NEXT: flat_load_dword v0, v[3:4] 4304; GCN1-NEXT: s_mov_b64 s[4:5], 0 4305; GCN1-NEXT: .LBB93_1: ; 
%atomicrmw.start 4306; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 4307; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4308; GCN1-NEXT: v_mov_b32_e32 v1, v0 4309; GCN1-NEXT: v_max_i32_e32 v0, v1, v2 4310; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 4311; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4312; GCN1-NEXT: buffer_wbinvl1_vol 4313; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 4314; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 4315; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] 4316; GCN1-NEXT: s_cbranch_execnz .LBB93_1 4317; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end 4318; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] 4319; GCN1-NEXT: s_setpc_b64 s[30:31] 4320; 4321; GCN2-LABEL: flat_atomic_max_i32_ret_offset__amdgpu_no_remote_memory: 4322; GCN2: ; %bb.0: 4323; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4324; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0 4325; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 4326; GCN2-NEXT: flat_load_dword v0, v[3:4] 4327; GCN2-NEXT: s_mov_b64 s[4:5], 0 4328; GCN2-NEXT: .LBB93_1: ; %atomicrmw.start 4329; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 4330; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4331; GCN2-NEXT: v_mov_b32_e32 v1, v0 4332; GCN2-NEXT: v_max_i32_e32 v0, v1, v2 4333; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 4334; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4335; GCN2-NEXT: buffer_wbinvl1_vol 4336; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 4337; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 4338; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] 4339; GCN2-NEXT: s_cbranch_execnz .LBB93_1 4340; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end 4341; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] 4342; GCN2-NEXT: s_setpc_b64 s[30:31] 4343; 4344; GCN3-LABEL: flat_atomic_max_i32_ret_offset__amdgpu_no_remote_memory: 4345; GCN3: ; %bb.0: 4346; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4347; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 4348; GCN3-NEXT: s_mov_b64 s[4:5], 0 4349; GCN3-NEXT: .LBB93_1: ; %atomicrmw.start 4350; GCN3-NEXT: ; =>This Inner Loop 
Header: Depth=1 4351; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4352; GCN3-NEXT: v_mov_b32_e32 v4, v3 4353; GCN3-NEXT: v_max_i32_e32 v3, v4, v2 4354; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc 4355; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4356; GCN3-NEXT: buffer_wbinvl1_vol 4357; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 4358; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 4359; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] 4360; GCN3-NEXT: s_cbranch_execnz .LBB93_1 4361; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end 4362; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] 4363; GCN3-NEXT: v_mov_b32_e32 v0, v3 4364; GCN3-NEXT: s_setpc_b64 s[30:31] 4365 %gep = getelementptr i32, ptr %out, i64 4 4366 %result = atomicrmw max ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 4367 ret i32 %result 4368} 4369 4370; --------------------------------------------------------------------- 4371; atomicrmw umax 4372; --------------------------------------------------------------------- 4373 4374define void @flat_atomic_umax_i32_noret(ptr %ptr, i32 %in) { 4375; GCN1-LABEL: flat_atomic_umax_i32_noret: 4376; GCN1: ; %bb.0: 4377; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4378; GCN1-NEXT: flat_load_dword v4, v[0:1] 4379; GCN1-NEXT: s_mov_b64 s[4:5], 0 4380; GCN1-NEXT: .LBB94_1: ; %atomicrmw.start 4381; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 4382; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4383; GCN1-NEXT: v_max_u32_e32 v3, v4, v2 4384; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 4385; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4386; GCN1-NEXT: buffer_wbinvl1_vol 4387; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 4388; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 4389; GCN1-NEXT: v_mov_b32_e32 v4, v3 4390; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] 4391; GCN1-NEXT: s_cbranch_execnz .LBB94_1 4392; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end 4393; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] 4394; GCN1-NEXT: s_setpc_b64 s[30:31] 4395; 4396; GCN2-LABEL: flat_atomic_umax_i32_noret: 4397; GCN2: ; 
%bb.0: 4398; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4399; GCN2-NEXT: flat_load_dword v4, v[0:1] 4400; GCN2-NEXT: s_mov_b64 s[4:5], 0 4401; GCN2-NEXT: .LBB94_1: ; %atomicrmw.start 4402; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 4403; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4404; GCN2-NEXT: v_max_u32_e32 v3, v4, v2 4405; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 4406; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4407; GCN2-NEXT: buffer_wbinvl1_vol 4408; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 4409; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 4410; GCN2-NEXT: v_mov_b32_e32 v4, v3 4411; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] 4412; GCN2-NEXT: s_cbranch_execnz .LBB94_1 4413; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end 4414; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] 4415; GCN2-NEXT: s_setpc_b64 s[30:31] 4416; 4417; GCN3-LABEL: flat_atomic_umax_i32_noret: 4418; GCN3: ; %bb.0: 4419; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4420; GCN3-NEXT: flat_load_dword v4, v[0:1] 4421; GCN3-NEXT: s_mov_b64 s[4:5], 0 4422; GCN3-NEXT: .LBB94_1: ; %atomicrmw.start 4423; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 4424; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4425; GCN3-NEXT: v_max_u32_e32 v3, v4, v2 4426; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 4427; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4428; GCN3-NEXT: buffer_wbinvl1_vol 4429; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 4430; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 4431; GCN3-NEXT: v_mov_b32_e32 v4, v3 4432; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] 4433; GCN3-NEXT: s_cbranch_execnz .LBB94_1 4434; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end 4435; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] 4436; GCN3-NEXT: s_setpc_b64 s[30:31] 4437 %tmp0 = atomicrmw umax ptr %ptr, i32 %in seq_cst 4438 ret void 4439} 4440 4441define void @flat_atomic_umax_i32_noret_offset(ptr %out, i32 %in) { 4442; GCN1-LABEL: flat_atomic_umax_i32_noret_offset: 4443; GCN1: ; %bb.0: 4444; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4445; 
GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 4446; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 4447; GCN1-NEXT: flat_load_dword v4, v[0:1] 4448; GCN1-NEXT: s_mov_b64 s[4:5], 0 4449; GCN1-NEXT: .LBB95_1: ; %atomicrmw.start 4450; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 4451; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4452; GCN1-NEXT: v_max_u32_e32 v3, v4, v2 4453; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 4454; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4455; GCN1-NEXT: buffer_wbinvl1_vol 4456; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 4457; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 4458; GCN1-NEXT: v_mov_b32_e32 v4, v3 4459; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] 4460; GCN1-NEXT: s_cbranch_execnz .LBB95_1 4461; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end 4462; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] 4463; GCN1-NEXT: s_setpc_b64 s[30:31] 4464; 4465; GCN2-LABEL: flat_atomic_umax_i32_noret_offset: 4466; GCN2: ; %bb.0: 4467; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4468; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 4469; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 4470; GCN2-NEXT: flat_load_dword v4, v[0:1] 4471; GCN2-NEXT: s_mov_b64 s[4:5], 0 4472; GCN2-NEXT: .LBB95_1: ; %atomicrmw.start 4473; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 4474; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4475; GCN2-NEXT: v_max_u32_e32 v3, v4, v2 4476; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 4477; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4478; GCN2-NEXT: buffer_wbinvl1_vol 4479; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 4480; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 4481; GCN2-NEXT: v_mov_b32_e32 v4, v3 4482; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] 4483; GCN2-NEXT: s_cbranch_execnz .LBB95_1 4484; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end 4485; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] 4486; GCN2-NEXT: s_setpc_b64 s[30:31] 4487; 4488; GCN3-LABEL: flat_atomic_umax_i32_noret_offset: 4489; GCN3: ; %bb.0: 4490; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4491; 
GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 4492; GCN3-NEXT: s_mov_b64 s[4:5], 0 4493; GCN3-NEXT: .LBB95_1: ; %atomicrmw.start 4494; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 4495; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4496; GCN3-NEXT: v_max_u32_e32 v3, v4, v2 4497; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc 4498; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4499; GCN3-NEXT: buffer_wbinvl1_vol 4500; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 4501; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 4502; GCN3-NEXT: v_mov_b32_e32 v4, v3 4503; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] 4504; GCN3-NEXT: s_cbranch_execnz .LBB95_1 4505; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end 4506; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] 4507; GCN3-NEXT: s_setpc_b64 s[30:31] 4508 %gep = getelementptr i32, ptr %out, i32 4 4509 %tmp0 = atomicrmw umax ptr %gep, i32 %in seq_cst 4510 ret void 4511} 4512 4513define i32 @flat_atomic_umax_i32_ret(ptr %ptr, i32 %in) { 4514; GCN1-LABEL: flat_atomic_umax_i32_ret: 4515; GCN1: ; %bb.0: 4516; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4517; GCN1-NEXT: flat_load_dword v3, v[0:1] 4518; GCN1-NEXT: s_mov_b64 s[4:5], 0 4519; GCN1-NEXT: .LBB96_1: ; %atomicrmw.start 4520; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 4521; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4522; GCN1-NEXT: v_mov_b32_e32 v4, v3 4523; GCN1-NEXT: v_max_u32_e32 v3, v4, v2 4524; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 4525; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4526; GCN1-NEXT: buffer_wbinvl1_vol 4527; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 4528; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 4529; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] 4530; GCN1-NEXT: s_cbranch_execnz .LBB96_1 4531; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end 4532; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] 4533; GCN1-NEXT: v_mov_b32_e32 v0, v3 4534; GCN1-NEXT: s_setpc_b64 s[30:31] 4535; 4536; GCN2-LABEL: flat_atomic_umax_i32_ret: 4537; GCN2: ; %bb.0: 4538; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) 4539; GCN2-NEXT: flat_load_dword v3, v[0:1] 4540; GCN2-NEXT: s_mov_b64 s[4:5], 0 4541; GCN2-NEXT: .LBB96_1: ; %atomicrmw.start 4542; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 4543; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4544; GCN2-NEXT: v_mov_b32_e32 v4, v3 4545; GCN2-NEXT: v_max_u32_e32 v3, v4, v2 4546; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 4547; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4548; GCN2-NEXT: buffer_wbinvl1_vol 4549; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 4550; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 4551; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] 4552; GCN2-NEXT: s_cbranch_execnz .LBB96_1 4553; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end 4554; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] 4555; GCN2-NEXT: v_mov_b32_e32 v0, v3 4556; GCN2-NEXT: s_setpc_b64 s[30:31] 4557; 4558; GCN3-LABEL: flat_atomic_umax_i32_ret: 4559; GCN3: ; %bb.0: 4560; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4561; GCN3-NEXT: flat_load_dword v3, v[0:1] 4562; GCN3-NEXT: s_mov_b64 s[4:5], 0 4563; GCN3-NEXT: .LBB96_1: ; %atomicrmw.start 4564; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 4565; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4566; GCN3-NEXT: v_mov_b32_e32 v4, v3 4567; GCN3-NEXT: v_max_u32_e32 v3, v4, v2 4568; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 4569; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4570; GCN3-NEXT: buffer_wbinvl1_vol 4571; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 4572; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 4573; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] 4574; GCN3-NEXT: s_cbranch_execnz .LBB96_1 4575; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end 4576; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] 4577; GCN3-NEXT: v_mov_b32_e32 v0, v3 4578; GCN3-NEXT: s_setpc_b64 s[30:31] 4579 %result = atomicrmw umax ptr %ptr, i32 %in seq_cst 4580 ret i32 %result 4581} 4582 4583define i32 @flat_atomic_umax_i32_ret_offset(ptr %out, i32 %in) { 4584; GCN1-LABEL: flat_atomic_umax_i32_ret_offset: 4585; GCN1: ; %bb.0: 4586; GCN1-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) 4587; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0 4588; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 4589; GCN1-NEXT: flat_load_dword v0, v[3:4] 4590; GCN1-NEXT: s_mov_b64 s[4:5], 0 4591; GCN1-NEXT: .LBB97_1: ; %atomicrmw.start 4592; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 4593; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4594; GCN1-NEXT: v_mov_b32_e32 v1, v0 4595; GCN1-NEXT: v_max_u32_e32 v0, v1, v2 4596; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 4597; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4598; GCN1-NEXT: buffer_wbinvl1_vol 4599; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 4600; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 4601; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] 4602; GCN1-NEXT: s_cbranch_execnz .LBB97_1 4603; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end 4604; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] 4605; GCN1-NEXT: s_setpc_b64 s[30:31] 4606; 4607; GCN2-LABEL: flat_atomic_umax_i32_ret_offset: 4608; GCN2: ; %bb.0: 4609; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4610; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0 4611; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 4612; GCN2-NEXT: flat_load_dword v0, v[3:4] 4613; GCN2-NEXT: s_mov_b64 s[4:5], 0 4614; GCN2-NEXT: .LBB97_1: ; %atomicrmw.start 4615; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 4616; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4617; GCN2-NEXT: v_mov_b32_e32 v1, v0 4618; GCN2-NEXT: v_max_u32_e32 v0, v1, v2 4619; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 4620; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4621; GCN2-NEXT: buffer_wbinvl1_vol 4622; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 4623; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 4624; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] 4625; GCN2-NEXT: s_cbranch_execnz .LBB97_1 4626; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end 4627; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] 4628; GCN2-NEXT: s_setpc_b64 s[30:31] 4629; 4630; GCN3-LABEL: flat_atomic_umax_i32_ret_offset: 4631; GCN3: ; %bb.0: 4632; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) 4633; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 4634; GCN3-NEXT: s_mov_b64 s[4:5], 0 4635; GCN3-NEXT: .LBB97_1: ; %atomicrmw.start 4636; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 4637; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4638; GCN3-NEXT: v_mov_b32_e32 v4, v3 4639; GCN3-NEXT: v_max_u32_e32 v3, v4, v2 4640; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc 4641; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4642; GCN3-NEXT: buffer_wbinvl1_vol 4643; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 4644; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 4645; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] 4646; GCN3-NEXT: s_cbranch_execnz .LBB97_1 4647; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end 4648; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] 4649; GCN3-NEXT: v_mov_b32_e32 v0, v3 4650; GCN3-NEXT: s_setpc_b64 s[30:31] 4651 %gep = getelementptr i32, ptr %out, i32 4 4652 %result = atomicrmw umax ptr %gep, i32 %in seq_cst 4653 ret i32 %result 4654} 4655 4656define amdgpu_gfx void @flat_atomic_umax_i32_noret_scalar(ptr inreg %ptr, i32 inreg %in) { 4657; GCN1-LABEL: flat_atomic_umax_i32_noret_scalar: 4658; GCN1: ; %bb.0: 4659; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4660; GCN1-NEXT: v_mov_b32_e32 v0, s4 4661; GCN1-NEXT: v_mov_b32_e32 v1, s5 4662; GCN1-NEXT: flat_load_dword v3, v[0:1] 4663; GCN1-NEXT: s_mov_b64 s[34:35], 0 4664; GCN1-NEXT: .LBB98_1: ; %atomicrmw.start 4665; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 4666; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4667; GCN1-NEXT: v_max_u32_e32 v2, s6, v3 4668; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4669; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4670; GCN1-NEXT: buffer_wbinvl1_vol 4671; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 4672; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 4673; GCN1-NEXT: v_mov_b32_e32 v3, v2 4674; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] 4675; GCN1-NEXT: s_cbranch_execnz .LBB98_1 4676; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end 4677; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] 
4678; GCN1-NEXT: s_setpc_b64 s[30:31] 4679; 4680; GCN2-LABEL: flat_atomic_umax_i32_noret_scalar: 4681; GCN2: ; %bb.0: 4682; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4683; GCN2-NEXT: v_mov_b32_e32 v0, s4 4684; GCN2-NEXT: v_mov_b32_e32 v1, s5 4685; GCN2-NEXT: flat_load_dword v3, v[0:1] 4686; GCN2-NEXT: s_mov_b64 s[34:35], 0 4687; GCN2-NEXT: .LBB98_1: ; %atomicrmw.start 4688; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 4689; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4690; GCN2-NEXT: v_max_u32_e32 v2, s6, v3 4691; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4692; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4693; GCN2-NEXT: buffer_wbinvl1_vol 4694; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 4695; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 4696; GCN2-NEXT: v_mov_b32_e32 v3, v2 4697; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] 4698; GCN2-NEXT: s_cbranch_execnz .LBB98_1 4699; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end 4700; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] 4701; GCN2-NEXT: s_setpc_b64 s[30:31] 4702; 4703; GCN3-LABEL: flat_atomic_umax_i32_noret_scalar: 4704; GCN3: ; %bb.0: 4705; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4706; GCN3-NEXT: v_mov_b32_e32 v0, s4 4707; GCN3-NEXT: v_mov_b32_e32 v1, s5 4708; GCN3-NEXT: flat_load_dword v3, v[0:1] 4709; GCN3-NEXT: s_mov_b64 s[34:35], 0 4710; GCN3-NEXT: .LBB98_1: ; %atomicrmw.start 4711; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 4712; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4713; GCN3-NEXT: v_max_u32_e32 v2, s6, v3 4714; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4715; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4716; GCN3-NEXT: buffer_wbinvl1_vol 4717; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 4718; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 4719; GCN3-NEXT: v_mov_b32_e32 v3, v2 4720; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] 4721; GCN3-NEXT: s_cbranch_execnz .LBB98_1 4722; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end 4723; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] 4724; GCN3-NEXT: s_setpc_b64 
s[30:31] 4725 %tmp0 = atomicrmw umax ptr %ptr, i32 %in seq_cst 4726 ret void 4727} 4728 4729define amdgpu_gfx void @flat_atomic_umax_i32_noret_offset_scalar(ptr inreg %out, i32 inreg %in) { 4730; GCN1-LABEL: flat_atomic_umax_i32_noret_offset_scalar: 4731; GCN1: ; %bb.0: 4732; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4733; GCN1-NEXT: s_add_u32 s34, s4, 16 4734; GCN1-NEXT: s_addc_u32 s35, s5, 0 4735; GCN1-NEXT: v_mov_b32_e32 v0, s34 4736; GCN1-NEXT: v_mov_b32_e32 v1, s35 4737; GCN1-NEXT: flat_load_dword v3, v[0:1] 4738; GCN1-NEXT: s_mov_b64 s[34:35], 0 4739; GCN1-NEXT: .LBB99_1: ; %atomicrmw.start 4740; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 4741; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4742; GCN1-NEXT: v_max_u32_e32 v2, s6, v3 4743; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4744; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4745; GCN1-NEXT: buffer_wbinvl1_vol 4746; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 4747; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 4748; GCN1-NEXT: v_mov_b32_e32 v3, v2 4749; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] 4750; GCN1-NEXT: s_cbranch_execnz .LBB99_1 4751; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end 4752; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] 4753; GCN1-NEXT: s_setpc_b64 s[30:31] 4754; 4755; GCN2-LABEL: flat_atomic_umax_i32_noret_offset_scalar: 4756; GCN2: ; %bb.0: 4757; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4758; GCN2-NEXT: s_add_u32 s34, s4, 16 4759; GCN2-NEXT: s_addc_u32 s35, s5, 0 4760; GCN2-NEXT: v_mov_b32_e32 v0, s34 4761; GCN2-NEXT: v_mov_b32_e32 v1, s35 4762; GCN2-NEXT: flat_load_dword v3, v[0:1] 4763; GCN2-NEXT: s_mov_b64 s[34:35], 0 4764; GCN2-NEXT: .LBB99_1: ; %atomicrmw.start 4765; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 4766; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4767; GCN2-NEXT: v_max_u32_e32 v2, s6, v3 4768; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4769; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4770; GCN2-NEXT: buffer_wbinvl1_vol 4771; GCN2-NEXT: 
v_cmp_eq_u32_e32 vcc, v2, v3 4772; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 4773; GCN2-NEXT: v_mov_b32_e32 v3, v2 4774; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] 4775; GCN2-NEXT: s_cbranch_execnz .LBB99_1 4776; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end 4777; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] 4778; GCN2-NEXT: s_setpc_b64 s[30:31] 4779; 4780; GCN3-LABEL: flat_atomic_umax_i32_noret_offset_scalar: 4781; GCN3: ; %bb.0: 4782; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4783; GCN3-NEXT: v_mov_b32_e32 v0, s4 4784; GCN3-NEXT: v_mov_b32_e32 v1, s5 4785; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 4786; GCN3-NEXT: s_mov_b64 s[34:35], 0 4787; GCN3-NEXT: .LBB99_1: ; %atomicrmw.start 4788; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 4789; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4790; GCN3-NEXT: v_max_u32_e32 v2, s6, v3 4791; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 4792; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4793; GCN3-NEXT: buffer_wbinvl1_vol 4794; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 4795; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 4796; GCN3-NEXT: v_mov_b32_e32 v3, v2 4797; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] 4798; GCN3-NEXT: s_cbranch_execnz .LBB99_1 4799; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end 4800; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] 4801; GCN3-NEXT: s_setpc_b64 s[30:31] 4802 %gep = getelementptr i32, ptr %out, i32 4 4803 %tmp0 = atomicrmw umax ptr %gep, i32 %in seq_cst 4804 ret void 4805} 4806 4807define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_scalar(ptr inreg %ptr, i32 inreg %in) { 4808; GCN1-LABEL: flat_atomic_umax_i32_ret_scalar: 4809; GCN1: ; %bb.0: 4810; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4811; GCN1-NEXT: v_mov_b32_e32 v0, s4 4812; GCN1-NEXT: v_mov_b32_e32 v1, s5 4813; GCN1-NEXT: flat_load_dword v0, v[0:1] 4814; GCN1-NEXT: v_mov_b32_e32 v1, s4 4815; GCN1-NEXT: s_mov_b64 s[34:35], 0 4816; GCN1-NEXT: v_mov_b32_e32 v2, s5 4817; GCN1-NEXT: .LBB100_1: ; %atomicrmw.start 4818; GCN1-NEXT: ; 
=>This Inner Loop Header: Depth=1 4819; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4820; GCN1-NEXT: v_mov_b32_e32 v4, v0 4821; GCN1-NEXT: v_max_u32_e32 v3, s6, v4 4822; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc 4823; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4824; GCN1-NEXT: buffer_wbinvl1_vol 4825; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 4826; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 4827; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] 4828; GCN1-NEXT: s_cbranch_execnz .LBB100_1 4829; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end 4830; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] 4831; GCN1-NEXT: s_setpc_b64 s[30:31] 4832; 4833; GCN2-LABEL: flat_atomic_umax_i32_ret_scalar: 4834; GCN2: ; %bb.0: 4835; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4836; GCN2-NEXT: v_mov_b32_e32 v0, s4 4837; GCN2-NEXT: v_mov_b32_e32 v1, s5 4838; GCN2-NEXT: flat_load_dword v0, v[0:1] 4839; GCN2-NEXT: v_mov_b32_e32 v1, s4 4840; GCN2-NEXT: s_mov_b64 s[34:35], 0 4841; GCN2-NEXT: v_mov_b32_e32 v2, s5 4842; GCN2-NEXT: .LBB100_1: ; %atomicrmw.start 4843; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 4844; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4845; GCN2-NEXT: v_mov_b32_e32 v4, v0 4846; GCN2-NEXT: v_max_u32_e32 v3, s6, v4 4847; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc 4848; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4849; GCN2-NEXT: buffer_wbinvl1_vol 4850; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 4851; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 4852; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] 4853; GCN2-NEXT: s_cbranch_execnz .LBB100_1 4854; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end 4855; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] 4856; GCN2-NEXT: s_setpc_b64 s[30:31] 4857; 4858; GCN3-LABEL: flat_atomic_umax_i32_ret_scalar: 4859; GCN3: ; %bb.0: 4860; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4861; GCN3-NEXT: v_mov_b32_e32 v0, s4 4862; GCN3-NEXT: v_mov_b32_e32 v1, s5 4863; GCN3-NEXT: flat_load_dword v0, v[0:1] 4864; GCN3-NEXT: v_mov_b32_e32 v1, s4 4865; GCN3-NEXT: s_mov_b64 
s[34:35], 0 4866; GCN3-NEXT: v_mov_b32_e32 v2, s5 4867; GCN3-NEXT: .LBB100_1: ; %atomicrmw.start 4868; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 4869; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4870; GCN3-NEXT: v_mov_b32_e32 v4, v0 4871; GCN3-NEXT: v_max_u32_e32 v3, s6, v4 4872; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc 4873; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4874; GCN3-NEXT: buffer_wbinvl1_vol 4875; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 4876; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 4877; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] 4878; GCN3-NEXT: s_cbranch_execnz .LBB100_1 4879; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end 4880; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] 4881; GCN3-NEXT: s_setpc_b64 s[30:31] 4882 %result = atomicrmw umax ptr %ptr, i32 %in seq_cst 4883 ret i32 %result 4884} 4885 4886define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_offset_scalar(ptr inreg %out, i32 inreg %in) { 4887; GCN1-LABEL: flat_atomic_umax_i32_ret_offset_scalar: 4888; GCN1: ; %bb.0: 4889; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4890; GCN1-NEXT: s_add_u32 s34, s4, 16 4891; GCN1-NEXT: s_addc_u32 s35, s5, 0 4892; GCN1-NEXT: v_mov_b32_e32 v1, s34 4893; GCN1-NEXT: v_mov_b32_e32 v2, s35 4894; GCN1-NEXT: flat_load_dword v0, v[1:2] 4895; GCN1-NEXT: s_mov_b64 s[34:35], 0 4896; GCN1-NEXT: .LBB101_1: ; %atomicrmw.start 4897; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 4898; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4899; GCN1-NEXT: v_mov_b32_e32 v4, v0 4900; GCN1-NEXT: v_max_u32_e32 v3, s6, v4 4901; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc 4902; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4903; GCN1-NEXT: buffer_wbinvl1_vol 4904; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 4905; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 4906; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] 4907; GCN1-NEXT: s_cbranch_execnz .LBB101_1 4908; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end 4909; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] 4910; GCN1-NEXT: s_setpc_b64 s[30:31] 4911; 4912; 
GCN2-LABEL: flat_atomic_umax_i32_ret_offset_scalar: 4913; GCN2: ; %bb.0: 4914; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4915; GCN2-NEXT: s_add_u32 s34, s4, 16 4916; GCN2-NEXT: s_addc_u32 s35, s5, 0 4917; GCN2-NEXT: v_mov_b32_e32 v1, s34 4918; GCN2-NEXT: v_mov_b32_e32 v2, s35 4919; GCN2-NEXT: flat_load_dword v0, v[1:2] 4920; GCN2-NEXT: s_mov_b64 s[34:35], 0 4921; GCN2-NEXT: .LBB101_1: ; %atomicrmw.start 4922; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 4923; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4924; GCN2-NEXT: v_mov_b32_e32 v4, v0 4925; GCN2-NEXT: v_max_u32_e32 v3, s6, v4 4926; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc 4927; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4928; GCN2-NEXT: buffer_wbinvl1_vol 4929; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 4930; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 4931; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] 4932; GCN2-NEXT: s_cbranch_execnz .LBB101_1 4933; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end 4934; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] 4935; GCN2-NEXT: s_setpc_b64 s[30:31] 4936; 4937; GCN3-LABEL: flat_atomic_umax_i32_ret_offset_scalar: 4938; GCN3: ; %bb.0: 4939; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4940; GCN3-NEXT: v_mov_b32_e32 v0, s4 4941; GCN3-NEXT: v_mov_b32_e32 v1, s5 4942; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 4943; GCN3-NEXT: v_mov_b32_e32 v1, s4 4944; GCN3-NEXT: s_mov_b64 s[34:35], 0 4945; GCN3-NEXT: v_mov_b32_e32 v2, s5 4946; GCN3-NEXT: .LBB101_1: ; %atomicrmw.start 4947; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 4948; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4949; GCN3-NEXT: v_mov_b32_e32 v4, v0 4950; GCN3-NEXT: v_max_u32_e32 v3, s6, v4 4951; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:16 glc 4952; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4953; GCN3-NEXT: buffer_wbinvl1_vol 4954; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 4955; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 4956; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] 4957; GCN3-NEXT: 
s_cbranch_execnz .LBB101_1 4958; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end 4959; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] 4960; GCN3-NEXT: s_setpc_b64 s[30:31] 4961 %gep = getelementptr i32, ptr %out, i32 4 4962 %result = atomicrmw umax ptr %gep, i32 %in seq_cst 4963 ret i32 %result 4964} 4965 4966define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 %index) { 4967; GCN1-LABEL: atomic_umax_i32_addr64_offset: 4968; GCN1: ; %bb.0: ; %entry 4969; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 4970; GCN1-NEXT: s_waitcnt lgkmcnt(0) 4971; GCN1-NEXT: s_ashr_i32 s5, s3, 31 4972; GCN1-NEXT: s_mov_b32 s4, s3 4973; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 4974; GCN1-NEXT: s_add_u32 s0, s0, s4 4975; GCN1-NEXT: s_addc_u32 s1, s1, s5 4976; GCN1-NEXT: s_add_u32 s0, s0, 16 4977; GCN1-NEXT: s_addc_u32 s1, s1, 0 4978; GCN1-NEXT: v_mov_b32_e32 v0, s0 4979; GCN1-NEXT: v_mov_b32_e32 v1, s1 4980; GCN1-NEXT: flat_load_dword v3, v[0:1] 4981; GCN1-NEXT: s_mov_b64 s[0:1], 0 4982; GCN1-NEXT: .LBB102_1: ; %atomicrmw.start 4983; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 4984; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4985; GCN1-NEXT: v_max_u32_e32 v2, s2, v3 4986; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4987; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4988; GCN1-NEXT: buffer_wbinvl1_vol 4989; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 4990; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 4991; GCN1-NEXT: v_mov_b32_e32 v3, v2 4992; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] 4993; GCN1-NEXT: s_cbranch_execnz .LBB102_1 4994; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end 4995; GCN1-NEXT: s_endpgm 4996; 4997; GCN2-LABEL: atomic_umax_i32_addr64_offset: 4998; GCN2: ; %bb.0: ; %entry 4999; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 5000; GCN2-NEXT: s_waitcnt lgkmcnt(0) 5001; GCN2-NEXT: s_ashr_i32 s5, s3, 31 5002; GCN2-NEXT: s_mov_b32 s4, s3 5003; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 5004; GCN2-NEXT: s_add_u32 s0, s0, s4 5005; GCN2-NEXT: s_addc_u32 s1, s1, s5 5006; GCN2-NEXT: 
s_add_u32 s0, s0, 16 5007; GCN2-NEXT: s_addc_u32 s1, s1, 0 5008; GCN2-NEXT: v_mov_b32_e32 v0, s0 5009; GCN2-NEXT: v_mov_b32_e32 v1, s1 5010; GCN2-NEXT: flat_load_dword v3, v[0:1] 5011; GCN2-NEXT: s_mov_b64 s[0:1], 0 5012; GCN2-NEXT: .LBB102_1: ; %atomicrmw.start 5013; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 5014; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5015; GCN2-NEXT: v_max_u32_e32 v2, s2, v3 5016; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5017; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5018; GCN2-NEXT: buffer_wbinvl1_vol 5019; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 5020; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 5021; GCN2-NEXT: v_mov_b32_e32 v3, v2 5022; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] 5023; GCN2-NEXT: s_cbranch_execnz .LBB102_1 5024; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end 5025; GCN2-NEXT: s_endpgm 5026; 5027; GCN3-LABEL: atomic_umax_i32_addr64_offset: 5028; GCN3: ; %bb.0: ; %entry 5029; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 5030; GCN3-NEXT: s_waitcnt lgkmcnt(0) 5031; GCN3-NEXT: s_ashr_i32 s5, s3, 31 5032; GCN3-NEXT: s_mov_b32 s4, s3 5033; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 5034; GCN3-NEXT: s_add_u32 s0, s0, s4 5035; GCN3-NEXT: s_addc_u32 s1, s1, s5 5036; GCN3-NEXT: v_mov_b32_e32 v0, s0 5037; GCN3-NEXT: v_mov_b32_e32 v1, s1 5038; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 5039; GCN3-NEXT: s_mov_b64 s[0:1], 0 5040; GCN3-NEXT: .LBB102_1: ; %atomicrmw.start 5041; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 5042; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5043; GCN3-NEXT: v_max_u32_e32 v2, s2, v3 5044; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 5045; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5046; GCN3-NEXT: buffer_wbinvl1_vol 5047; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 5048; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 5049; GCN3-NEXT: v_mov_b32_e32 v3, v2 5050; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] 5051; GCN3-NEXT: s_cbranch_execnz .LBB102_1 5052; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end 5053; 
GCN3-NEXT: s_endpgm 5054entry: 5055 %ptr = getelementptr i32, ptr %out, i32 %index 5056 %gep = getelementptr i32, ptr %ptr, i32 4 5057 %tmp0 = atomicrmw umax ptr %gep, i32 %in seq_cst 5058 ret void 5059} 5060 5061define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i32 %index) { 5062; GCN1-LABEL: atomic_umax_i32_ret_addr64_offset: 5063; GCN1: ; %bb.0: ; %entry 5064; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd 5065; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 5066; GCN1-NEXT: s_waitcnt lgkmcnt(0) 5067; GCN1-NEXT: s_ashr_i32 s5, s7, 31 5068; GCN1-NEXT: s_mov_b32 s4, s7 5069; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 5070; GCN1-NEXT: s_add_u32 s0, s0, s4 5071; GCN1-NEXT: s_addc_u32 s1, s1, s5 5072; GCN1-NEXT: s_add_u32 s0, s0, 16 5073; GCN1-NEXT: s_addc_u32 s1, s1, 0 5074; GCN1-NEXT: v_mov_b32_e32 v0, s0 5075; GCN1-NEXT: v_mov_b32_e32 v1, s1 5076; GCN1-NEXT: flat_load_dword v2, v[0:1] 5077; GCN1-NEXT: s_mov_b64 s[0:1], 0 5078; GCN1-NEXT: .LBB103_1: ; %atomicrmw.start 5079; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 5080; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5081; GCN1-NEXT: v_mov_b32_e32 v3, v2 5082; GCN1-NEXT: v_max_u32_e32 v2, s6, v3 5083; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5084; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5085; GCN1-NEXT: buffer_wbinvl1_vol 5086; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 5087; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 5088; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] 5089; GCN1-NEXT: s_cbranch_execnz .LBB103_1 5090; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end 5091; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] 5092; GCN1-NEXT: v_mov_b32_e32 v0, s2 5093; GCN1-NEXT: v_mov_b32_e32 v1, s3 5094; GCN1-NEXT: flat_store_dword v[0:1], v2 5095; GCN1-NEXT: s_endpgm 5096; 5097; GCN2-LABEL: atomic_umax_i32_ret_addr64_offset: 5098; GCN2: ; %bb.0: ; %entry 5099; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 5100; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 5101; GCN2-NEXT: s_waitcnt lgkmcnt(0) 5102; 
GCN2-NEXT: s_ashr_i32 s5, s7, 31 5103; GCN2-NEXT: s_mov_b32 s4, s7 5104; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 5105; GCN2-NEXT: s_add_u32 s0, s0, s4 5106; GCN2-NEXT: s_addc_u32 s1, s1, s5 5107; GCN2-NEXT: s_add_u32 s0, s0, 16 5108; GCN2-NEXT: s_addc_u32 s1, s1, 0 5109; GCN2-NEXT: v_mov_b32_e32 v0, s0 5110; GCN2-NEXT: v_mov_b32_e32 v1, s1 5111; GCN2-NEXT: flat_load_dword v2, v[0:1] 5112; GCN2-NEXT: s_mov_b64 s[0:1], 0 5113; GCN2-NEXT: .LBB103_1: ; %atomicrmw.start 5114; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 5115; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5116; GCN2-NEXT: v_mov_b32_e32 v3, v2 5117; GCN2-NEXT: v_max_u32_e32 v2, s6, v3 5118; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5119; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5120; GCN2-NEXT: buffer_wbinvl1_vol 5121; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 5122; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 5123; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] 5124; GCN2-NEXT: s_cbranch_execnz .LBB103_1 5125; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end 5126; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] 5127; GCN2-NEXT: v_mov_b32_e32 v0, s2 5128; GCN2-NEXT: v_mov_b32_e32 v1, s3 5129; GCN2-NEXT: flat_store_dword v[0:1], v2 5130; GCN2-NEXT: s_endpgm 5131; 5132; GCN3-LABEL: atomic_umax_i32_ret_addr64_offset: 5133; GCN3: ; %bb.0: ; %entry 5134; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 5135; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 5136; GCN3-NEXT: s_waitcnt lgkmcnt(0) 5137; GCN3-NEXT: s_ashr_i32 s5, s7, 31 5138; GCN3-NEXT: s_mov_b32 s4, s7 5139; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 5140; GCN3-NEXT: s_add_u32 s0, s0, s4 5141; GCN3-NEXT: s_addc_u32 s1, s1, s5 5142; GCN3-NEXT: v_mov_b32_e32 v0, s0 5143; GCN3-NEXT: v_mov_b32_e32 v1, s1 5144; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 5145; GCN3-NEXT: s_mov_b64 s[0:1], 0 5146; GCN3-NEXT: .LBB103_1: ; %atomicrmw.start 5147; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 5148; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5149; GCN3-NEXT: v_mov_b32_e32 v3, v2 5150; 
GCN3-NEXT: v_max_u32_e32 v2, s6, v3 5151; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 5152; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5153; GCN3-NEXT: buffer_wbinvl1_vol 5154; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 5155; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 5156; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] 5157; GCN3-NEXT: s_cbranch_execnz .LBB103_1 5158; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end 5159; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] 5160; GCN3-NEXT: v_mov_b32_e32 v0, s2 5161; GCN3-NEXT: v_mov_b32_e32 v1, s3 5162; GCN3-NEXT: flat_store_dword v[0:1], v2 5163; GCN3-NEXT: s_endpgm 5164entry: 5165 %ptr = getelementptr i32, ptr %out, i32 %index 5166 %gep = getelementptr i32, ptr %ptr, i32 4 5167 %tmp0 = atomicrmw umax ptr %gep, i32 %in seq_cst 5168 store i32 %tmp0, ptr %out2 5169 ret void 5170} 5171 5172define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i32 %index) { 5173; GCN1-LABEL: atomic_umax_i32_ret_addr64: 5174; GCN1: ; %bb.0: ; %entry 5175; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd 5176; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 5177; GCN1-NEXT: s_waitcnt lgkmcnt(0) 5178; GCN1-NEXT: s_ashr_i32 s5, s7, 31 5179; GCN1-NEXT: s_mov_b32 s4, s7 5180; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 5181; GCN1-NEXT: s_add_u32 s0, s0, s4 5182; GCN1-NEXT: s_addc_u32 s1, s1, s5 5183; GCN1-NEXT: v_mov_b32_e32 v0, s0 5184; GCN1-NEXT: v_mov_b32_e32 v1, s1 5185; GCN1-NEXT: flat_load_dword v2, v[0:1] 5186; GCN1-NEXT: s_mov_b64 s[0:1], 0 5187; GCN1-NEXT: .LBB104_1: ; %atomicrmw.start 5188; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 5189; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5190; GCN1-NEXT: v_mov_b32_e32 v3, v2 5191; GCN1-NEXT: v_max_u32_e32 v2, s6, v3 5192; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5193; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5194; GCN1-NEXT: buffer_wbinvl1_vol 5195; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 5196; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 5197; GCN1-NEXT: s_andn2_b64 exec, 
exec, s[0:1] 5198; GCN1-NEXT: s_cbranch_execnz .LBB104_1 5199; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end 5200; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] 5201; GCN1-NEXT: v_mov_b32_e32 v0, s2 5202; GCN1-NEXT: v_mov_b32_e32 v1, s3 5203; GCN1-NEXT: flat_store_dword v[0:1], v2 5204; GCN1-NEXT: s_endpgm 5205; 5206; GCN2-LABEL: atomic_umax_i32_ret_addr64: 5207; GCN2: ; %bb.0: ; %entry 5208; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 5209; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 5210; GCN2-NEXT: s_waitcnt lgkmcnt(0) 5211; GCN2-NEXT: s_ashr_i32 s5, s7, 31 5212; GCN2-NEXT: s_mov_b32 s4, s7 5213; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 5214; GCN2-NEXT: s_add_u32 s0, s0, s4 5215; GCN2-NEXT: s_addc_u32 s1, s1, s5 5216; GCN2-NEXT: v_mov_b32_e32 v0, s0 5217; GCN2-NEXT: v_mov_b32_e32 v1, s1 5218; GCN2-NEXT: flat_load_dword v2, v[0:1] 5219; GCN2-NEXT: s_mov_b64 s[0:1], 0 5220; GCN2-NEXT: .LBB104_1: ; %atomicrmw.start 5221; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 5222; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5223; GCN2-NEXT: v_mov_b32_e32 v3, v2 5224; GCN2-NEXT: v_max_u32_e32 v2, s6, v3 5225; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5226; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5227; GCN2-NEXT: buffer_wbinvl1_vol 5228; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 5229; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 5230; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] 5231; GCN2-NEXT: s_cbranch_execnz .LBB104_1 5232; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end 5233; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] 5234; GCN2-NEXT: v_mov_b32_e32 v0, s2 5235; GCN2-NEXT: v_mov_b32_e32 v1, s3 5236; GCN2-NEXT: flat_store_dword v[0:1], v2 5237; GCN2-NEXT: s_endpgm 5238; 5239; GCN3-LABEL: atomic_umax_i32_ret_addr64: 5240; GCN3: ; %bb.0: ; %entry 5241; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 5242; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 5243; GCN3-NEXT: s_waitcnt lgkmcnt(0) 5244; GCN3-NEXT: s_ashr_i32 s5, s7, 31 5245; GCN3-NEXT: s_mov_b32 s4, s7 5246; GCN3-NEXT: s_lshl_b64 s[4:5], 
s[4:5], 2 5247; GCN3-NEXT: s_add_u32 s0, s0, s4 5248; GCN3-NEXT: s_addc_u32 s1, s1, s5 5249; GCN3-NEXT: v_mov_b32_e32 v0, s0 5250; GCN3-NEXT: v_mov_b32_e32 v1, s1 5251; GCN3-NEXT: flat_load_dword v2, v[0:1] 5252; GCN3-NEXT: s_mov_b64 s[0:1], 0 5253; GCN3-NEXT: .LBB104_1: ; %atomicrmw.start 5254; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 5255; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5256; GCN3-NEXT: v_mov_b32_e32 v3, v2 5257; GCN3-NEXT: v_max_u32_e32 v2, s6, v3 5258; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5259; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5260; GCN3-NEXT: buffer_wbinvl1_vol 5261; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 5262; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 5263; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] 5264; GCN3-NEXT: s_cbranch_execnz .LBB104_1 5265; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end 5266; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] 5267; GCN3-NEXT: v_mov_b32_e32 v0, s2 5268; GCN3-NEXT: v_mov_b32_e32 v1, s3 5269; GCN3-NEXT: flat_store_dword v[0:1], v2 5270; GCN3-NEXT: s_endpgm 5271entry: 5272 %ptr = getelementptr i32, ptr %out, i32 %index 5273 %tmp0 = atomicrmw umax ptr %ptr, i32 %in seq_cst 5274 store i32 %tmp0, ptr %out2 5275 ret void 5276} 5277 5278define void @flat_umax_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) { 5279; GCN1-LABEL: flat_umax_i32_noret_offset__amdgpu_no_remote_memory: 5280; GCN1: ; %bb.0: 5281; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5282; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 5283; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 5284; GCN1-NEXT: flat_load_dword v4, v[0:1] 5285; GCN1-NEXT: s_mov_b64 s[4:5], 0 5286; GCN1-NEXT: .LBB105_1: ; %atomicrmw.start 5287; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 5288; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5289; GCN1-NEXT: v_max_u32_e32 v3, v4, v2 5290; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 5291; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5292; GCN1-NEXT: buffer_wbinvl1_vol 5293; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 
v3, v4 5294; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5295; GCN1-NEXT: v_mov_b32_e32 v4, v3 5296; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] 5297; GCN1-NEXT: s_cbranch_execnz .LBB105_1 5298; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end 5299; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] 5300; GCN1-NEXT: s_setpc_b64 s[30:31] 5301; 5302; GCN2-LABEL: flat_umax_i32_noret_offset__amdgpu_no_remote_memory: 5303; GCN2: ; %bb.0: 5304; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5305; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 5306; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 5307; GCN2-NEXT: flat_load_dword v4, v[0:1] 5308; GCN2-NEXT: s_mov_b64 s[4:5], 0 5309; GCN2-NEXT: .LBB105_1: ; %atomicrmw.start 5310; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 5311; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5312; GCN2-NEXT: v_max_u32_e32 v3, v4, v2 5313; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 5314; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5315; GCN2-NEXT: buffer_wbinvl1_vol 5316; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 5317; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5318; GCN2-NEXT: v_mov_b32_e32 v4, v3 5319; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] 5320; GCN2-NEXT: s_cbranch_execnz .LBB105_1 5321; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end 5322; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] 5323; GCN2-NEXT: s_setpc_b64 s[30:31] 5324; 5325; GCN3-LABEL: flat_umax_i32_noret_offset__amdgpu_no_remote_memory: 5326; GCN3: ; %bb.0: 5327; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5328; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 5329; GCN3-NEXT: s_mov_b64 s[4:5], 0 5330; GCN3-NEXT: .LBB105_1: ; %atomicrmw.start 5331; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 5332; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5333; GCN3-NEXT: v_max_u32_e32 v3, v4, v2 5334; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc 5335; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5336; GCN3-NEXT: buffer_wbinvl1_vol 5337; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 5338; GCN3-NEXT: s_or_b64 s[4:5], vcc, 
s[4:5] 5339; GCN3-NEXT: v_mov_b32_e32 v4, v3 5340; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] 5341; GCN3-NEXT: s_cbranch_execnz .LBB105_1 5342; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end 5343; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] 5344; GCN3-NEXT: s_setpc_b64 s[30:31] 5345 %gep = getelementptr i32, ptr %out, i64 4 5346 %tmp0 = atomicrmw umax ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 5347 ret void 5348} 5349 5350define i32 @flat_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) { 5351; GCN1-LABEL: flat_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory: 5352; GCN1: ; %bb.0: 5353; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5354; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0 5355; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 5356; GCN1-NEXT: flat_load_dword v0, v[3:4] 5357; GCN1-NEXT: s_mov_b64 s[4:5], 0 5358; GCN1-NEXT: .LBB106_1: ; %atomicrmw.start 5359; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 5360; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5361; GCN1-NEXT: v_mov_b32_e32 v1, v0 5362; GCN1-NEXT: v_max_u32_e32 v0, v1, v2 5363; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 5364; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5365; GCN1-NEXT: buffer_wbinvl1_vol 5366; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 5367; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5368; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] 5369; GCN1-NEXT: s_cbranch_execnz .LBB106_1 5370; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end 5371; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] 5372; GCN1-NEXT: s_setpc_b64 s[30:31] 5373; 5374; GCN2-LABEL: flat_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory: 5375; GCN2: ; %bb.0: 5376; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5377; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0 5378; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 5379; GCN2-NEXT: flat_load_dword v0, v[3:4] 5380; GCN2-NEXT: s_mov_b64 s[4:5], 0 5381; GCN2-NEXT: .LBB106_1: ; %atomicrmw.start 5382; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 5383; GCN2-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) 5384; GCN2-NEXT: v_mov_b32_e32 v1, v0 5385; GCN2-NEXT: v_max_u32_e32 v0, v1, v2 5386; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 5387; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5388; GCN2-NEXT: buffer_wbinvl1_vol 5389; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 5390; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5391; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] 5392; GCN2-NEXT: s_cbranch_execnz .LBB106_1 5393; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end 5394; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] 5395; GCN2-NEXT: s_setpc_b64 s[30:31] 5396; 5397; GCN3-LABEL: flat_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory: 5398; GCN3: ; %bb.0: 5399; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5400; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 5401; GCN3-NEXT: s_mov_b64 s[4:5], 0 5402; GCN3-NEXT: .LBB106_1: ; %atomicrmw.start 5403; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 5404; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5405; GCN3-NEXT: v_mov_b32_e32 v4, v3 5406; GCN3-NEXT: v_max_u32_e32 v3, v4, v2 5407; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc 5408; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5409; GCN3-NEXT: buffer_wbinvl1_vol 5410; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 5411; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5412; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] 5413; GCN3-NEXT: s_cbranch_execnz .LBB106_1 5414; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end 5415; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] 5416; GCN3-NEXT: v_mov_b32_e32 v0, v3 5417; GCN3-NEXT: s_setpc_b64 s[30:31] 5418 %gep = getelementptr i32, ptr %out, i64 4 5419 %result = atomicrmw umax ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 5420 ret i32 %result 5421} 5422 5423; --------------------------------------------------------------------- 5424; atomicrmw umin 5425; --------------------------------------------------------------------- 5426 5427define void @flat_atomic_umin_i32_noret(ptr %ptr, i32 %in) { 5428; GCN1-LABEL: flat_atomic_umin_i32_noret: 5429; GCN1: ; 
%bb.0: 5430; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5431; GCN1-NEXT: flat_load_dword v4, v[0:1] 5432; GCN1-NEXT: s_mov_b64 s[4:5], 0 5433; GCN1-NEXT: .LBB107_1: ; %atomicrmw.start 5434; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 5435; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5436; GCN1-NEXT: v_min_u32_e32 v3, v4, v2 5437; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 5438; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5439; GCN1-NEXT: buffer_wbinvl1_vol 5440; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 5441; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5442; GCN1-NEXT: v_mov_b32_e32 v4, v3 5443; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] 5444; GCN1-NEXT: s_cbranch_execnz .LBB107_1 5445; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end 5446; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] 5447; GCN1-NEXT: s_setpc_b64 s[30:31] 5448; 5449; GCN2-LABEL: flat_atomic_umin_i32_noret: 5450; GCN2: ; %bb.0: 5451; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5452; GCN2-NEXT: flat_load_dword v4, v[0:1] 5453; GCN2-NEXT: s_mov_b64 s[4:5], 0 5454; GCN2-NEXT: .LBB107_1: ; %atomicrmw.start 5455; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 5456; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5457; GCN2-NEXT: v_min_u32_e32 v3, v4, v2 5458; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 5459; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5460; GCN2-NEXT: buffer_wbinvl1_vol 5461; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 5462; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5463; GCN2-NEXT: v_mov_b32_e32 v4, v3 5464; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] 5465; GCN2-NEXT: s_cbranch_execnz .LBB107_1 5466; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end 5467; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] 5468; GCN2-NEXT: s_setpc_b64 s[30:31] 5469; 5470; GCN3-LABEL: flat_atomic_umin_i32_noret: 5471; GCN3: ; %bb.0: 5472; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5473; GCN3-NEXT: flat_load_dword v4, v[0:1] 5474; GCN3-NEXT: s_mov_b64 s[4:5], 0 5475; GCN3-NEXT: .LBB107_1: ; %atomicrmw.start 5476; GCN3-NEXT: ; =>This 
Inner Loop Header: Depth=1 5477; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5478; GCN3-NEXT: v_min_u32_e32 v3, v4, v2 5479; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 5480; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5481; GCN3-NEXT: buffer_wbinvl1_vol 5482; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 5483; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5484; GCN3-NEXT: v_mov_b32_e32 v4, v3 5485; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] 5486; GCN3-NEXT: s_cbranch_execnz .LBB107_1 5487; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end 5488; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] 5489; GCN3-NEXT: s_setpc_b64 s[30:31] 5490 %tmp0 = atomicrmw umin ptr %ptr, i32 %in seq_cst 5491 ret void 5492} 5493 5494define void @flat_atomic_umin_i32_noret_offset(ptr %out, i32 %in) { 5495; GCN1-LABEL: flat_atomic_umin_i32_noret_offset: 5496; GCN1: ; %bb.0: 5497; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5498; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 5499; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 5500; GCN1-NEXT: flat_load_dword v4, v[0:1] 5501; GCN1-NEXT: s_mov_b64 s[4:5], 0 5502; GCN1-NEXT: .LBB108_1: ; %atomicrmw.start 5503; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 5504; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5505; GCN1-NEXT: v_min_u32_e32 v3, v4, v2 5506; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 5507; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5508; GCN1-NEXT: buffer_wbinvl1_vol 5509; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 5510; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5511; GCN1-NEXT: v_mov_b32_e32 v4, v3 5512; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] 5513; GCN1-NEXT: s_cbranch_execnz .LBB108_1 5514; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end 5515; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] 5516; GCN1-NEXT: s_setpc_b64 s[30:31] 5517; 5518; GCN2-LABEL: flat_atomic_umin_i32_noret_offset: 5519; GCN2: ; %bb.0: 5520; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5521; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 5522; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 5523; 
GCN2-NEXT: flat_load_dword v4, v[0:1] 5524; GCN2-NEXT: s_mov_b64 s[4:5], 0 5525; GCN2-NEXT: .LBB108_1: ; %atomicrmw.start 5526; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 5527; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5528; GCN2-NEXT: v_min_u32_e32 v3, v4, v2 5529; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 5530; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5531; GCN2-NEXT: buffer_wbinvl1_vol 5532; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 5533; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5534; GCN2-NEXT: v_mov_b32_e32 v4, v3 5535; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] 5536; GCN2-NEXT: s_cbranch_execnz .LBB108_1 5537; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end 5538; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] 5539; GCN2-NEXT: s_setpc_b64 s[30:31] 5540; 5541; GCN3-LABEL: flat_atomic_umin_i32_noret_offset: 5542; GCN3: ; %bb.0: 5543; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5544; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 5545; GCN3-NEXT: s_mov_b64 s[4:5], 0 5546; GCN3-NEXT: .LBB108_1: ; %atomicrmw.start 5547; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 5548; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5549; GCN3-NEXT: v_min_u32_e32 v3, v4, v2 5550; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc 5551; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5552; GCN3-NEXT: buffer_wbinvl1_vol 5553; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 5554; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5555; GCN3-NEXT: v_mov_b32_e32 v4, v3 5556; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] 5557; GCN3-NEXT: s_cbranch_execnz .LBB108_1 5558; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end 5559; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] 5560; GCN3-NEXT: s_setpc_b64 s[30:31] 5561 %gep = getelementptr i32, ptr %out, i32 4 5562 %tmp0 = atomicrmw umin ptr %gep, i32 %in seq_cst 5563 ret void 5564} 5565 5566define i32 @flat_atomic_umin_i32_ret(ptr %ptr, i32 %in) { 5567; GCN1-LABEL: flat_atomic_umin_i32_ret: 5568; GCN1: ; %bb.0: 5569; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5570; GCN1-NEXT: 
flat_load_dword v3, v[0:1] 5571; GCN1-NEXT: s_mov_b64 s[4:5], 0 5572; GCN1-NEXT: .LBB109_1: ; %atomicrmw.start 5573; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 5574; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5575; GCN1-NEXT: v_mov_b32_e32 v4, v3 5576; GCN1-NEXT: v_min_u32_e32 v3, v4, v2 5577; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 5578; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5579; GCN1-NEXT: buffer_wbinvl1_vol 5580; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 5581; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5582; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] 5583; GCN1-NEXT: s_cbranch_execnz .LBB109_1 5584; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end 5585; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] 5586; GCN1-NEXT: v_mov_b32_e32 v0, v3 5587; GCN1-NEXT: s_setpc_b64 s[30:31] 5588; 5589; GCN2-LABEL: flat_atomic_umin_i32_ret: 5590; GCN2: ; %bb.0: 5591; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5592; GCN2-NEXT: flat_load_dword v3, v[0:1] 5593; GCN2-NEXT: s_mov_b64 s[4:5], 0 5594; GCN2-NEXT: .LBB109_1: ; %atomicrmw.start 5595; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 5596; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5597; GCN2-NEXT: v_mov_b32_e32 v4, v3 5598; GCN2-NEXT: v_min_u32_e32 v3, v4, v2 5599; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 5600; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5601; GCN2-NEXT: buffer_wbinvl1_vol 5602; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 5603; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5604; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] 5605; GCN2-NEXT: s_cbranch_execnz .LBB109_1 5606; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end 5607; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] 5608; GCN2-NEXT: v_mov_b32_e32 v0, v3 5609; GCN2-NEXT: s_setpc_b64 s[30:31] 5610; 5611; GCN3-LABEL: flat_atomic_umin_i32_ret: 5612; GCN3: ; %bb.0: 5613; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5614; GCN3-NEXT: flat_load_dword v3, v[0:1] 5615; GCN3-NEXT: s_mov_b64 s[4:5], 0 5616; GCN3-NEXT: .LBB109_1: ; %atomicrmw.start 5617; GCN3-NEXT: ; =>This Inner 
Loop Header: Depth=1 5618; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5619; GCN3-NEXT: v_mov_b32_e32 v4, v3 5620; GCN3-NEXT: v_min_u32_e32 v3, v4, v2 5621; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 5622; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5623; GCN3-NEXT: buffer_wbinvl1_vol 5624; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 5625; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5626; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] 5627; GCN3-NEXT: s_cbranch_execnz .LBB109_1 5628; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end 5629; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] 5630; GCN3-NEXT: v_mov_b32_e32 v0, v3 5631; GCN3-NEXT: s_setpc_b64 s[30:31] 5632 %result = atomicrmw umin ptr %ptr, i32 %in seq_cst 5633 ret i32 %result 5634} 5635 5636define i32 @flat_atomic_umin_i32_ret_offset(ptr %out, i32 %in) { 5637; GCN1-LABEL: flat_atomic_umin_i32_ret_offset: 5638; GCN1: ; %bb.0: 5639; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5640; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0 5641; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 5642; GCN1-NEXT: flat_load_dword v0, v[3:4] 5643; GCN1-NEXT: s_mov_b64 s[4:5], 0 5644; GCN1-NEXT: .LBB110_1: ; %atomicrmw.start 5645; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 5646; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5647; GCN1-NEXT: v_mov_b32_e32 v1, v0 5648; GCN1-NEXT: v_min_u32_e32 v0, v1, v2 5649; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 5650; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5651; GCN1-NEXT: buffer_wbinvl1_vol 5652; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 5653; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5654; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] 5655; GCN1-NEXT: s_cbranch_execnz .LBB110_1 5656; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end 5657; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] 5658; GCN1-NEXT: s_setpc_b64 s[30:31] 5659; 5660; GCN2-LABEL: flat_atomic_umin_i32_ret_offset: 5661; GCN2: ; %bb.0: 5662; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5663; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0 5664; GCN2-NEXT: v_addc_u32_e32 
v4, vcc, 0, v1, vcc 5665; GCN2-NEXT: flat_load_dword v0, v[3:4] 5666; GCN2-NEXT: s_mov_b64 s[4:5], 0 5667; GCN2-NEXT: .LBB110_1: ; %atomicrmw.start 5668; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 5669; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5670; GCN2-NEXT: v_mov_b32_e32 v1, v0 5671; GCN2-NEXT: v_min_u32_e32 v0, v1, v2 5672; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 5673; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5674; GCN2-NEXT: buffer_wbinvl1_vol 5675; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 5676; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5677; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] 5678; GCN2-NEXT: s_cbranch_execnz .LBB110_1 5679; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end 5680; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] 5681; GCN2-NEXT: s_setpc_b64 s[30:31] 5682; 5683; GCN3-LABEL: flat_atomic_umin_i32_ret_offset: 5684; GCN3: ; %bb.0: 5685; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5686; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 5687; GCN3-NEXT: s_mov_b64 s[4:5], 0 5688; GCN3-NEXT: .LBB110_1: ; %atomicrmw.start 5689; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 5690; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5691; GCN3-NEXT: v_mov_b32_e32 v4, v3 5692; GCN3-NEXT: v_min_u32_e32 v3, v4, v2 5693; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc 5694; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5695; GCN3-NEXT: buffer_wbinvl1_vol 5696; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 5697; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5698; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] 5699; GCN3-NEXT: s_cbranch_execnz .LBB110_1 5700; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end 5701; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] 5702; GCN3-NEXT: v_mov_b32_e32 v0, v3 5703; GCN3-NEXT: s_setpc_b64 s[30:31] 5704 %gep = getelementptr i32, ptr %out, i32 4 5705 %result = atomicrmw umin ptr %gep, i32 %in seq_cst 5706 ret i32 %result 5707} 5708 5709define amdgpu_gfx void @flat_atomic_umin_i32_noret_scalar(ptr inreg %ptr, i32 inreg %in) { 5710; GCN1-LABEL: 
flat_atomic_umin_i32_noret_scalar: 5711; GCN1: ; %bb.0: 5712; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5713; GCN1-NEXT: v_mov_b32_e32 v0, s4 5714; GCN1-NEXT: v_mov_b32_e32 v1, s5 5715; GCN1-NEXT: flat_load_dword v3, v[0:1] 5716; GCN1-NEXT: s_mov_b64 s[34:35], 0 5717; GCN1-NEXT: .LBB111_1: ; %atomicrmw.start 5718; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 5719; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5720; GCN1-NEXT: v_min_u32_e32 v2, s6, v3 5721; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5722; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5723; GCN1-NEXT: buffer_wbinvl1_vol 5724; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 5725; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 5726; GCN1-NEXT: v_mov_b32_e32 v3, v2 5727; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] 5728; GCN1-NEXT: s_cbranch_execnz .LBB111_1 5729; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end 5730; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] 5731; GCN1-NEXT: s_setpc_b64 s[30:31] 5732; 5733; GCN2-LABEL: flat_atomic_umin_i32_noret_scalar: 5734; GCN2: ; %bb.0: 5735; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5736; GCN2-NEXT: v_mov_b32_e32 v0, s4 5737; GCN2-NEXT: v_mov_b32_e32 v1, s5 5738; GCN2-NEXT: flat_load_dword v3, v[0:1] 5739; GCN2-NEXT: s_mov_b64 s[34:35], 0 5740; GCN2-NEXT: .LBB111_1: ; %atomicrmw.start 5741; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 5742; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5743; GCN2-NEXT: v_min_u32_e32 v2, s6, v3 5744; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5745; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5746; GCN2-NEXT: buffer_wbinvl1_vol 5747; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 5748; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 5749; GCN2-NEXT: v_mov_b32_e32 v3, v2 5750; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] 5751; GCN2-NEXT: s_cbranch_execnz .LBB111_1 5752; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end 5753; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] 5754; GCN2-NEXT: s_setpc_b64 s[30:31] 5755; 5756; GCN3-LABEL: 
flat_atomic_umin_i32_noret_scalar: 5757; GCN3: ; %bb.0: 5758; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5759; GCN3-NEXT: v_mov_b32_e32 v0, s4 5760; GCN3-NEXT: v_mov_b32_e32 v1, s5 5761; GCN3-NEXT: flat_load_dword v3, v[0:1] 5762; GCN3-NEXT: s_mov_b64 s[34:35], 0 5763; GCN3-NEXT: .LBB111_1: ; %atomicrmw.start 5764; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 5765; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5766; GCN3-NEXT: v_min_u32_e32 v2, s6, v3 5767; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5768; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5769; GCN3-NEXT: buffer_wbinvl1_vol 5770; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 5771; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 5772; GCN3-NEXT: v_mov_b32_e32 v3, v2 5773; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] 5774; GCN3-NEXT: s_cbranch_execnz .LBB111_1 5775; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end 5776; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] 5777; GCN3-NEXT: s_setpc_b64 s[30:31] 5778 %tmp0 = atomicrmw umin ptr %ptr, i32 %in seq_cst 5779 ret void 5780} 5781 5782define amdgpu_gfx void @flat_atomic_umin_i32_noret_offset_scalar(ptr inreg %out, i32 inreg %in) { 5783; GCN1-LABEL: flat_atomic_umin_i32_noret_offset_scalar: 5784; GCN1: ; %bb.0: 5785; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5786; GCN1-NEXT: s_add_u32 s34, s4, 16 5787; GCN1-NEXT: s_addc_u32 s35, s5, 0 5788; GCN1-NEXT: v_mov_b32_e32 v0, s34 5789; GCN1-NEXT: v_mov_b32_e32 v1, s35 5790; GCN1-NEXT: flat_load_dword v3, v[0:1] 5791; GCN1-NEXT: s_mov_b64 s[34:35], 0 5792; GCN1-NEXT: .LBB112_1: ; %atomicrmw.start 5793; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 5794; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5795; GCN1-NEXT: v_min_u32_e32 v2, s6, v3 5796; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5797; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5798; GCN1-NEXT: buffer_wbinvl1_vol 5799; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 5800; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 5801; GCN1-NEXT: v_mov_b32_e32 v3, v2 5802; 
GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] 5803; GCN1-NEXT: s_cbranch_execnz .LBB112_1 5804; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end 5805; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] 5806; GCN1-NEXT: s_setpc_b64 s[30:31] 5807; 5808; GCN2-LABEL: flat_atomic_umin_i32_noret_offset_scalar: 5809; GCN2: ; %bb.0: 5810; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5811; GCN2-NEXT: s_add_u32 s34, s4, 16 5812; GCN2-NEXT: s_addc_u32 s35, s5, 0 5813; GCN2-NEXT: v_mov_b32_e32 v0, s34 5814; GCN2-NEXT: v_mov_b32_e32 v1, s35 5815; GCN2-NEXT: flat_load_dword v3, v[0:1] 5816; GCN2-NEXT: s_mov_b64 s[34:35], 0 5817; GCN2-NEXT: .LBB112_1: ; %atomicrmw.start 5818; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 5819; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5820; GCN2-NEXT: v_min_u32_e32 v2, s6, v3 5821; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5822; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5823; GCN2-NEXT: buffer_wbinvl1_vol 5824; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 5825; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 5826; GCN2-NEXT: v_mov_b32_e32 v3, v2 5827; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] 5828; GCN2-NEXT: s_cbranch_execnz .LBB112_1 5829; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end 5830; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] 5831; GCN2-NEXT: s_setpc_b64 s[30:31] 5832; 5833; GCN3-LABEL: flat_atomic_umin_i32_noret_offset_scalar: 5834; GCN3: ; %bb.0: 5835; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5836; GCN3-NEXT: v_mov_b32_e32 v0, s4 5837; GCN3-NEXT: v_mov_b32_e32 v1, s5 5838; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 5839; GCN3-NEXT: s_mov_b64 s[34:35], 0 5840; GCN3-NEXT: .LBB112_1: ; %atomicrmw.start 5841; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 5842; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5843; GCN3-NEXT: v_min_u32_e32 v2, s6, v3 5844; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 5845; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5846; GCN3-NEXT: buffer_wbinvl1_vol 5847; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 5848; 
GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 5849; GCN3-NEXT: v_mov_b32_e32 v3, v2 5850; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] 5851; GCN3-NEXT: s_cbranch_execnz .LBB112_1 5852; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end 5853; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] 5854; GCN3-NEXT: s_setpc_b64 s[30:31] 5855 %gep = getelementptr i32, ptr %out, i32 4 5856 %tmp0 = atomicrmw umin ptr %gep, i32 %in seq_cst 5857 ret void 5858} 5859 5860define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_scalar(ptr inreg %ptr, i32 inreg %in) { 5861; GCN1-LABEL: flat_atomic_umin_i32_ret_scalar: 5862; GCN1: ; %bb.0: 5863; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5864; GCN1-NEXT: v_mov_b32_e32 v0, s4 5865; GCN1-NEXT: v_mov_b32_e32 v1, s5 5866; GCN1-NEXT: flat_load_dword v0, v[0:1] 5867; GCN1-NEXT: v_mov_b32_e32 v1, s4 5868; GCN1-NEXT: s_mov_b64 s[34:35], 0 5869; GCN1-NEXT: v_mov_b32_e32 v2, s5 5870; GCN1-NEXT: .LBB113_1: ; %atomicrmw.start 5871; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 5872; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5873; GCN1-NEXT: v_mov_b32_e32 v4, v0 5874; GCN1-NEXT: v_min_u32_e32 v3, s6, v4 5875; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc 5876; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5877; GCN1-NEXT: buffer_wbinvl1_vol 5878; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 5879; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 5880; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] 5881; GCN1-NEXT: s_cbranch_execnz .LBB113_1 5882; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end 5883; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] 5884; GCN1-NEXT: s_setpc_b64 s[30:31] 5885; 5886; GCN2-LABEL: flat_atomic_umin_i32_ret_scalar: 5887; GCN2: ; %bb.0: 5888; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5889; GCN2-NEXT: v_mov_b32_e32 v0, s4 5890; GCN2-NEXT: v_mov_b32_e32 v1, s5 5891; GCN2-NEXT: flat_load_dword v0, v[0:1] 5892; GCN2-NEXT: v_mov_b32_e32 v1, s4 5893; GCN2-NEXT: s_mov_b64 s[34:35], 0 5894; GCN2-NEXT: v_mov_b32_e32 v2, s5 5895; GCN2-NEXT: .LBB113_1: ; %atomicrmw.start 5896; 
GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 5897; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5898; GCN2-NEXT: v_mov_b32_e32 v4, v0 5899; GCN2-NEXT: v_min_u32_e32 v3, s6, v4 5900; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc 5901; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5902; GCN2-NEXT: buffer_wbinvl1_vol 5903; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 5904; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 5905; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] 5906; GCN2-NEXT: s_cbranch_execnz .LBB113_1 5907; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end 5908; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] 5909; GCN2-NEXT: s_setpc_b64 s[30:31] 5910; 5911; GCN3-LABEL: flat_atomic_umin_i32_ret_scalar: 5912; GCN3: ; %bb.0: 5913; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5914; GCN3-NEXT: v_mov_b32_e32 v0, s4 5915; GCN3-NEXT: v_mov_b32_e32 v1, s5 5916; GCN3-NEXT: flat_load_dword v0, v[0:1] 5917; GCN3-NEXT: v_mov_b32_e32 v1, s4 5918; GCN3-NEXT: s_mov_b64 s[34:35], 0 5919; GCN3-NEXT: v_mov_b32_e32 v2, s5 5920; GCN3-NEXT: .LBB113_1: ; %atomicrmw.start 5921; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 5922; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5923; GCN3-NEXT: v_mov_b32_e32 v4, v0 5924; GCN3-NEXT: v_min_u32_e32 v3, s6, v4 5925; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc 5926; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5927; GCN3-NEXT: buffer_wbinvl1_vol 5928; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 5929; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 5930; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] 5931; GCN3-NEXT: s_cbranch_execnz .LBB113_1 5932; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end 5933; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] 5934; GCN3-NEXT: s_setpc_b64 s[30:31] 5935 %result = atomicrmw umin ptr %ptr, i32 %in seq_cst 5936 ret i32 %result 5937} 5938 5939define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_offset_scalar(ptr inreg %out, i32 inreg %in) { 5940; GCN1-LABEL: flat_atomic_umin_i32_ret_offset_scalar: 5941; GCN1: ; %bb.0: 5942; GCN1-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) 5943; GCN1-NEXT: s_add_u32 s34, s4, 16 5944; GCN1-NEXT: s_addc_u32 s35, s5, 0 5945; GCN1-NEXT: v_mov_b32_e32 v1, s34 5946; GCN1-NEXT: v_mov_b32_e32 v2, s35 5947; GCN1-NEXT: flat_load_dword v0, v[1:2] 5948; GCN1-NEXT: s_mov_b64 s[34:35], 0 5949; GCN1-NEXT: .LBB114_1: ; %atomicrmw.start 5950; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 5951; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5952; GCN1-NEXT: v_mov_b32_e32 v4, v0 5953; GCN1-NEXT: v_min_u32_e32 v3, s6, v4 5954; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc 5955; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5956; GCN1-NEXT: buffer_wbinvl1_vol 5957; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 5958; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 5959; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] 5960; GCN1-NEXT: s_cbranch_execnz .LBB114_1 5961; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end 5962; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] 5963; GCN1-NEXT: s_setpc_b64 s[30:31] 5964; 5965; GCN2-LABEL: flat_atomic_umin_i32_ret_offset_scalar: 5966; GCN2: ; %bb.0: 5967; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5968; GCN2-NEXT: s_add_u32 s34, s4, 16 5969; GCN2-NEXT: s_addc_u32 s35, s5, 0 5970; GCN2-NEXT: v_mov_b32_e32 v1, s34 5971; GCN2-NEXT: v_mov_b32_e32 v2, s35 5972; GCN2-NEXT: flat_load_dword v0, v[1:2] 5973; GCN2-NEXT: s_mov_b64 s[34:35], 0 5974; GCN2-NEXT: .LBB114_1: ; %atomicrmw.start 5975; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 5976; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5977; GCN2-NEXT: v_mov_b32_e32 v4, v0 5978; GCN2-NEXT: v_min_u32_e32 v3, s6, v4 5979; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc 5980; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5981; GCN2-NEXT: buffer_wbinvl1_vol 5982; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 5983; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 5984; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] 5985; GCN2-NEXT: s_cbranch_execnz .LBB114_1 5986; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end 5987; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] 5988; GCN2-NEXT: 
s_setpc_b64 s[30:31] 5989; 5990; GCN3-LABEL: flat_atomic_umin_i32_ret_offset_scalar: 5991; GCN3: ; %bb.0: 5992; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5993; GCN3-NEXT: v_mov_b32_e32 v0, s4 5994; GCN3-NEXT: v_mov_b32_e32 v1, s5 5995; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 5996; GCN3-NEXT: v_mov_b32_e32 v1, s4 5997; GCN3-NEXT: s_mov_b64 s[34:35], 0 5998; GCN3-NEXT: v_mov_b32_e32 v2, s5 5999; GCN3-NEXT: .LBB114_1: ; %atomicrmw.start 6000; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 6001; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6002; GCN3-NEXT: v_mov_b32_e32 v4, v0 6003; GCN3-NEXT: v_min_u32_e32 v3, s6, v4 6004; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:16 glc 6005; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6006; GCN3-NEXT: buffer_wbinvl1_vol 6007; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 6008; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 6009; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] 6010; GCN3-NEXT: s_cbranch_execnz .LBB114_1 6011; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end 6012; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] 6013; GCN3-NEXT: s_setpc_b64 s[30:31] 6014 %gep = getelementptr i32, ptr %out, i32 4 6015 %result = atomicrmw umin ptr %gep, i32 %in seq_cst 6016 ret i32 %result 6017} 6018 6019define void @flat_umin_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) { 6020; GCN1-LABEL: flat_umin_i32_noret_offset__amdgpu_no_remote_memory: 6021; GCN1: ; %bb.0: 6022; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6023; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 6024; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 6025; GCN1-NEXT: flat_load_dword v4, v[0:1] 6026; GCN1-NEXT: s_mov_b64 s[4:5], 0 6027; GCN1-NEXT: .LBB115_1: ; %atomicrmw.start 6028; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 6029; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6030; GCN1-NEXT: v_min_u32_e32 v3, v4, v2 6031; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 6032; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6033; GCN1-NEXT: buffer_wbinvl1_vol 6034; 
GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 6035; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6036; GCN1-NEXT: v_mov_b32_e32 v4, v3 6037; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] 6038; GCN1-NEXT: s_cbranch_execnz .LBB115_1 6039; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end 6040; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] 6041; GCN1-NEXT: s_setpc_b64 s[30:31] 6042; 6043; GCN2-LABEL: flat_umin_i32_noret_offset__amdgpu_no_remote_memory: 6044; GCN2: ; %bb.0: 6045; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6046; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 6047; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 6048; GCN2-NEXT: flat_load_dword v4, v[0:1] 6049; GCN2-NEXT: s_mov_b64 s[4:5], 0 6050; GCN2-NEXT: .LBB115_1: ; %atomicrmw.start 6051; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 6052; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6053; GCN2-NEXT: v_min_u32_e32 v3, v4, v2 6054; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 6055; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6056; GCN2-NEXT: buffer_wbinvl1_vol 6057; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 6058; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6059; GCN2-NEXT: v_mov_b32_e32 v4, v3 6060; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] 6061; GCN2-NEXT: s_cbranch_execnz .LBB115_1 6062; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end 6063; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] 6064; GCN2-NEXT: s_setpc_b64 s[30:31] 6065; 6066; GCN3-LABEL: flat_umin_i32_noret_offset__amdgpu_no_remote_memory: 6067; GCN3: ; %bb.0: 6068; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6069; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 6070; GCN3-NEXT: s_mov_b64 s[4:5], 0 6071; GCN3-NEXT: .LBB115_1: ; %atomicrmw.start 6072; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 6073; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6074; GCN3-NEXT: v_min_u32_e32 v3, v4, v2 6075; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc 6076; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6077; GCN3-NEXT: buffer_wbinvl1_vol 6078; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 6079; 
GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6080; GCN3-NEXT: v_mov_b32_e32 v4, v3 6081; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] 6082; GCN3-NEXT: s_cbranch_execnz .LBB115_1 6083; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end 6084; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] 6085; GCN3-NEXT: s_setpc_b64 s[30:31] 6086 %gep = getelementptr i32, ptr %out, i64 4 6087 %tmp0 = atomicrmw umin ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 6088 ret void 6089} 6090 6091define i32 @flat_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) { 6092; GCN1-LABEL: flat_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory: 6093; GCN1: ; %bb.0: 6094; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6095; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0 6096; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 6097; GCN1-NEXT: flat_load_dword v0, v[3:4] 6098; GCN1-NEXT: s_mov_b64 s[4:5], 0 6099; GCN1-NEXT: .LBB116_1: ; %atomicrmw.start 6100; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 6101; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6102; GCN1-NEXT: v_mov_b32_e32 v1, v0 6103; GCN1-NEXT: v_min_u32_e32 v0, v1, v2 6104; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 6105; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6106; GCN1-NEXT: buffer_wbinvl1_vol 6107; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 6108; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6109; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] 6110; GCN1-NEXT: s_cbranch_execnz .LBB116_1 6111; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end 6112; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] 6113; GCN1-NEXT: s_setpc_b64 s[30:31] 6114; 6115; GCN2-LABEL: flat_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory: 6116; GCN2: ; %bb.0: 6117; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6118; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0 6119; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 6120; GCN2-NEXT: flat_load_dword v0, v[3:4] 6121; GCN2-NEXT: s_mov_b64 s[4:5], 0 6122; GCN2-NEXT: .LBB116_1: ; %atomicrmw.start 6123; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 
6124; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6125; GCN2-NEXT: v_mov_b32_e32 v1, v0 6126; GCN2-NEXT: v_min_u32_e32 v0, v1, v2 6127; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 6128; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6129; GCN2-NEXT: buffer_wbinvl1_vol 6130; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 6131; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6132; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] 6133; GCN2-NEXT: s_cbranch_execnz .LBB116_1 6134; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end 6135; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] 6136; GCN2-NEXT: s_setpc_b64 s[30:31] 6137; 6138; GCN3-LABEL: flat_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory: 6139; GCN3: ; %bb.0: 6140; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6141; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 6142; GCN3-NEXT: s_mov_b64 s[4:5], 0 6143; GCN3-NEXT: .LBB116_1: ; %atomicrmw.start 6144; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 6145; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6146; GCN3-NEXT: v_mov_b32_e32 v4, v3 6147; GCN3-NEXT: v_min_u32_e32 v3, v4, v2 6148; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc 6149; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6150; GCN3-NEXT: buffer_wbinvl1_vol 6151; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 6152; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6153; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] 6154; GCN3-NEXT: s_cbranch_execnz .LBB116_1 6155; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end 6156; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] 6157; GCN3-NEXT: v_mov_b32_e32 v0, v3 6158; GCN3-NEXT: s_setpc_b64 s[30:31] 6159 %gep = getelementptr i32, ptr %out, i64 4 6160 %result = atomicrmw umin ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 6161 ret i32 %result 6162} 6163 6164; --------------------------------------------------------------------- 6165; atomicrmw min 6166; --------------------------------------------------------------------- 6167 6168define void @flat_atomic_min_i32_noret(ptr %ptr, i32 %in) { 6169; GCN1-LABEL: 
flat_atomic_min_i32_noret: 6170; GCN1: ; %bb.0: 6171; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6172; GCN1-NEXT: flat_load_dword v4, v[0:1] 6173; GCN1-NEXT: s_mov_b64 s[4:5], 0 6174; GCN1-NEXT: .LBB117_1: ; %atomicrmw.start 6175; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 6176; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6177; GCN1-NEXT: v_min_i32_e32 v3, v4, v2 6178; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 6179; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6180; GCN1-NEXT: buffer_wbinvl1_vol 6181; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 6182; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6183; GCN1-NEXT: v_mov_b32_e32 v4, v3 6184; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] 6185; GCN1-NEXT: s_cbranch_execnz .LBB117_1 6186; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end 6187; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] 6188; GCN1-NEXT: s_setpc_b64 s[30:31] 6189; 6190; GCN2-LABEL: flat_atomic_min_i32_noret: 6191; GCN2: ; %bb.0: 6192; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6193; GCN2-NEXT: flat_load_dword v4, v[0:1] 6194; GCN2-NEXT: s_mov_b64 s[4:5], 0 6195; GCN2-NEXT: .LBB117_1: ; %atomicrmw.start 6196; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 6197; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6198; GCN2-NEXT: v_min_i32_e32 v3, v4, v2 6199; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 6200; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6201; GCN2-NEXT: buffer_wbinvl1_vol 6202; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 6203; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6204; GCN2-NEXT: v_mov_b32_e32 v4, v3 6205; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] 6206; GCN2-NEXT: s_cbranch_execnz .LBB117_1 6207; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end 6208; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] 6209; GCN2-NEXT: s_setpc_b64 s[30:31] 6210; 6211; GCN3-LABEL: flat_atomic_min_i32_noret: 6212; GCN3: ; %bb.0: 6213; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6214; GCN3-NEXT: flat_load_dword v4, v[0:1] 6215; GCN3-NEXT: s_mov_b64 s[4:5], 0 6216; GCN3-NEXT: .LBB117_1: ; 
%atomicrmw.start 6217; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 6218; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6219; GCN3-NEXT: v_min_i32_e32 v3, v4, v2 6220; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 6221; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6222; GCN3-NEXT: buffer_wbinvl1_vol 6223; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 6224; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6225; GCN3-NEXT: v_mov_b32_e32 v4, v3 6226; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] 6227; GCN3-NEXT: s_cbranch_execnz .LBB117_1 6228; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end 6229; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] 6230; GCN3-NEXT: s_setpc_b64 s[30:31] 6231 %tmp0 = atomicrmw min ptr %ptr, i32 %in seq_cst 6232 ret void 6233} 6234 6235define void @flat_atomic_min_i32_noret_offset(ptr %out, i32 %in) { 6236; GCN1-LABEL: flat_atomic_min_i32_noret_offset: 6237; GCN1: ; %bb.0: 6238; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6239; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 6240; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 6241; GCN1-NEXT: flat_load_dword v4, v[0:1] 6242; GCN1-NEXT: s_mov_b64 s[4:5], 0 6243; GCN1-NEXT: .LBB118_1: ; %atomicrmw.start 6244; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 6245; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6246; GCN1-NEXT: v_min_i32_e32 v3, v4, v2 6247; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 6248; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6249; GCN1-NEXT: buffer_wbinvl1_vol 6250; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 6251; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6252; GCN1-NEXT: v_mov_b32_e32 v4, v3 6253; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] 6254; GCN1-NEXT: s_cbranch_execnz .LBB118_1 6255; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end 6256; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] 6257; GCN1-NEXT: s_setpc_b64 s[30:31] 6258; 6259; GCN2-LABEL: flat_atomic_min_i32_noret_offset: 6260; GCN2: ; %bb.0: 6261; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6262; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 6263; GCN2-NEXT: 
v_addc_u32_e32 v1, vcc, 0, v1, vcc 6264; GCN2-NEXT: flat_load_dword v4, v[0:1] 6265; GCN2-NEXT: s_mov_b64 s[4:5], 0 6266; GCN2-NEXT: .LBB118_1: ; %atomicrmw.start 6267; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 6268; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6269; GCN2-NEXT: v_min_i32_e32 v3, v4, v2 6270; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 6271; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6272; GCN2-NEXT: buffer_wbinvl1_vol 6273; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 6274; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6275; GCN2-NEXT: v_mov_b32_e32 v4, v3 6276; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] 6277; GCN2-NEXT: s_cbranch_execnz .LBB118_1 6278; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end 6279; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] 6280; GCN2-NEXT: s_setpc_b64 s[30:31] 6281; 6282; GCN3-LABEL: flat_atomic_min_i32_noret_offset: 6283; GCN3: ; %bb.0: 6284; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6285; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 6286; GCN3-NEXT: s_mov_b64 s[4:5], 0 6287; GCN3-NEXT: .LBB118_1: ; %atomicrmw.start 6288; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 6289; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6290; GCN3-NEXT: v_min_i32_e32 v3, v4, v2 6291; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc 6292; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6293; GCN3-NEXT: buffer_wbinvl1_vol 6294; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 6295; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6296; GCN3-NEXT: v_mov_b32_e32 v4, v3 6297; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] 6298; GCN3-NEXT: s_cbranch_execnz .LBB118_1 6299; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end 6300; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] 6301; GCN3-NEXT: s_setpc_b64 s[30:31] 6302 %gep = getelementptr i32, ptr %out, i32 4 6303 %tmp0 = atomicrmw min ptr %gep, i32 %in seq_cst 6304 ret void 6305} 6306 6307define i32 @flat_atomic_min_i32_ret(ptr %ptr, i32 %in) { 6308; GCN1-LABEL: flat_atomic_min_i32_ret: 6309; GCN1: ; %bb.0: 6310; GCN1-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) 6311; GCN1-NEXT: flat_load_dword v3, v[0:1] 6312; GCN1-NEXT: s_mov_b64 s[4:5], 0 6313; GCN1-NEXT: .LBB119_1: ; %atomicrmw.start 6314; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 6315; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6316; GCN1-NEXT: v_mov_b32_e32 v4, v3 6317; GCN1-NEXT: v_min_i32_e32 v3, v4, v2 6318; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 6319; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6320; GCN1-NEXT: buffer_wbinvl1_vol 6321; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 6322; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6323; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] 6324; GCN1-NEXT: s_cbranch_execnz .LBB119_1 6325; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end 6326; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] 6327; GCN1-NEXT: v_mov_b32_e32 v0, v3 6328; GCN1-NEXT: s_setpc_b64 s[30:31] 6329; 6330; GCN2-LABEL: flat_atomic_min_i32_ret: 6331; GCN2: ; %bb.0: 6332; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6333; GCN2-NEXT: flat_load_dword v3, v[0:1] 6334; GCN2-NEXT: s_mov_b64 s[4:5], 0 6335; GCN2-NEXT: .LBB119_1: ; %atomicrmw.start 6336; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 6337; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6338; GCN2-NEXT: v_mov_b32_e32 v4, v3 6339; GCN2-NEXT: v_min_i32_e32 v3, v4, v2 6340; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 6341; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6342; GCN2-NEXT: buffer_wbinvl1_vol 6343; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 6344; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6345; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] 6346; GCN2-NEXT: s_cbranch_execnz .LBB119_1 6347; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end 6348; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] 6349; GCN2-NEXT: v_mov_b32_e32 v0, v3 6350; GCN2-NEXT: s_setpc_b64 s[30:31] 6351; 6352; GCN3-LABEL: flat_atomic_min_i32_ret: 6353; GCN3: ; %bb.0: 6354; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6355; GCN3-NEXT: flat_load_dword v3, v[0:1] 6356; GCN3-NEXT: s_mov_b64 s[4:5], 0 6357; GCN3-NEXT: .LBB119_1: ; 
%atomicrmw.start 6358; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 6359; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6360; GCN3-NEXT: v_mov_b32_e32 v4, v3 6361; GCN3-NEXT: v_min_i32_e32 v3, v4, v2 6362; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 6363; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6364; GCN3-NEXT: buffer_wbinvl1_vol 6365; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 6366; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6367; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] 6368; GCN3-NEXT: s_cbranch_execnz .LBB119_1 6369; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end 6370; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] 6371; GCN3-NEXT: v_mov_b32_e32 v0, v3 6372; GCN3-NEXT: s_setpc_b64 s[30:31] 6373 %result = atomicrmw min ptr %ptr, i32 %in seq_cst 6374 ret i32 %result 6375} 6376 6377define i32 @flat_atomic_min_i32_ret_offset(ptr %out, i32 %in) { 6378; GCN1-LABEL: flat_atomic_min_i32_ret_offset: 6379; GCN1: ; %bb.0: 6380; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6381; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0 6382; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 6383; GCN1-NEXT: flat_load_dword v0, v[3:4] 6384; GCN1-NEXT: s_mov_b64 s[4:5], 0 6385; GCN1-NEXT: .LBB120_1: ; %atomicrmw.start 6386; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 6387; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6388; GCN1-NEXT: v_mov_b32_e32 v1, v0 6389; GCN1-NEXT: v_min_i32_e32 v0, v1, v2 6390; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 6391; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6392; GCN1-NEXT: buffer_wbinvl1_vol 6393; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 6394; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6395; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] 6396; GCN1-NEXT: s_cbranch_execnz .LBB120_1 6397; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end 6398; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] 6399; GCN1-NEXT: s_setpc_b64 s[30:31] 6400; 6401; GCN2-LABEL: flat_atomic_min_i32_ret_offset: 6402; GCN2: ; %bb.0: 6403; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6404; GCN2-NEXT: v_add_u32_e32 v3, 
vcc, 16, v0 6405; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 6406; GCN2-NEXT: flat_load_dword v0, v[3:4] 6407; GCN2-NEXT: s_mov_b64 s[4:5], 0 6408; GCN2-NEXT: .LBB120_1: ; %atomicrmw.start 6409; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 6410; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6411; GCN2-NEXT: v_mov_b32_e32 v1, v0 6412; GCN2-NEXT: v_min_i32_e32 v0, v1, v2 6413; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 6414; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6415; GCN2-NEXT: buffer_wbinvl1_vol 6416; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 6417; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6418; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] 6419; GCN2-NEXT: s_cbranch_execnz .LBB120_1 6420; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end 6421; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] 6422; GCN2-NEXT: s_setpc_b64 s[30:31] 6423; 6424; GCN3-LABEL: flat_atomic_min_i32_ret_offset: 6425; GCN3: ; %bb.0: 6426; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6427; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 6428; GCN3-NEXT: s_mov_b64 s[4:5], 0 6429; GCN3-NEXT: .LBB120_1: ; %atomicrmw.start 6430; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 6431; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6432; GCN3-NEXT: v_mov_b32_e32 v4, v3 6433; GCN3-NEXT: v_min_i32_e32 v3, v4, v2 6434; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc 6435; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6436; GCN3-NEXT: buffer_wbinvl1_vol 6437; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 6438; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6439; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] 6440; GCN3-NEXT: s_cbranch_execnz .LBB120_1 6441; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end 6442; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] 6443; GCN3-NEXT: v_mov_b32_e32 v0, v3 6444; GCN3-NEXT: s_setpc_b64 s[30:31] 6445 %gep = getelementptr i32, ptr %out, i32 4 6446 %result = atomicrmw min ptr %gep, i32 %in seq_cst 6447 ret i32 %result 6448} 6449 6450define amdgpu_gfx void @flat_atomic_min_i32_noret_scalar(ptr inreg %ptr, i32 inreg 
%in) { 6451; GCN1-LABEL: flat_atomic_min_i32_noret_scalar: 6452; GCN1: ; %bb.0: 6453; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6454; GCN1-NEXT: v_mov_b32_e32 v0, s4 6455; GCN1-NEXT: v_mov_b32_e32 v1, s5 6456; GCN1-NEXT: flat_load_dword v3, v[0:1] 6457; GCN1-NEXT: s_mov_b64 s[34:35], 0 6458; GCN1-NEXT: .LBB121_1: ; %atomicrmw.start 6459; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 6460; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6461; GCN1-NEXT: v_min_i32_e32 v2, s6, v3 6462; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 6463; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6464; GCN1-NEXT: buffer_wbinvl1_vol 6465; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 6466; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 6467; GCN1-NEXT: v_mov_b32_e32 v3, v2 6468; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] 6469; GCN1-NEXT: s_cbranch_execnz .LBB121_1 6470; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end 6471; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] 6472; GCN1-NEXT: s_setpc_b64 s[30:31] 6473; 6474; GCN2-LABEL: flat_atomic_min_i32_noret_scalar: 6475; GCN2: ; %bb.0: 6476; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6477; GCN2-NEXT: v_mov_b32_e32 v0, s4 6478; GCN2-NEXT: v_mov_b32_e32 v1, s5 6479; GCN2-NEXT: flat_load_dword v3, v[0:1] 6480; GCN2-NEXT: s_mov_b64 s[34:35], 0 6481; GCN2-NEXT: .LBB121_1: ; %atomicrmw.start 6482; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 6483; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6484; GCN2-NEXT: v_min_i32_e32 v2, s6, v3 6485; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 6486; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6487; GCN2-NEXT: buffer_wbinvl1_vol 6488; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 6489; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 6490; GCN2-NEXT: v_mov_b32_e32 v3, v2 6491; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] 6492; GCN2-NEXT: s_cbranch_execnz .LBB121_1 6493; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end 6494; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] 6495; GCN2-NEXT: s_setpc_b64 s[30:31] 6496; 6497; GCN3-LABEL: 
flat_atomic_min_i32_noret_scalar: 6498; GCN3: ; %bb.0: 6499; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6500; GCN3-NEXT: v_mov_b32_e32 v0, s4 6501; GCN3-NEXT: v_mov_b32_e32 v1, s5 6502; GCN3-NEXT: flat_load_dword v3, v[0:1] 6503; GCN3-NEXT: s_mov_b64 s[34:35], 0 6504; GCN3-NEXT: .LBB121_1: ; %atomicrmw.start 6505; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 6506; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6507; GCN3-NEXT: v_min_i32_e32 v2, s6, v3 6508; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 6509; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6510; GCN3-NEXT: buffer_wbinvl1_vol 6511; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 6512; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 6513; GCN3-NEXT: v_mov_b32_e32 v3, v2 6514; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] 6515; GCN3-NEXT: s_cbranch_execnz .LBB121_1 6516; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end 6517; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] 6518; GCN3-NEXT: s_setpc_b64 s[30:31] 6519 %tmp0 = atomicrmw min ptr %ptr, i32 %in seq_cst 6520 ret void 6521} 6522 6523define amdgpu_gfx void @flat_atomic_min_i32_noret_offset_scalar(ptr inreg %out, i32 inreg %in) { 6524; GCN1-LABEL: flat_atomic_min_i32_noret_offset_scalar: 6525; GCN1: ; %bb.0: 6526; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6527; GCN1-NEXT: s_add_u32 s34, s4, 16 6528; GCN1-NEXT: s_addc_u32 s35, s5, 0 6529; GCN1-NEXT: v_mov_b32_e32 v0, s34 6530; GCN1-NEXT: v_mov_b32_e32 v1, s35 6531; GCN1-NEXT: flat_load_dword v3, v[0:1] 6532; GCN1-NEXT: s_mov_b64 s[34:35], 0 6533; GCN1-NEXT: .LBB122_1: ; %atomicrmw.start 6534; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 6535; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6536; GCN1-NEXT: v_min_i32_e32 v2, s6, v3 6537; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 6538; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6539; GCN1-NEXT: buffer_wbinvl1_vol 6540; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 6541; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 6542; GCN1-NEXT: v_mov_b32_e32 v3, v2 6543; GCN1-NEXT: 
s_andn2_b64 exec, exec, s[34:35] 6544; GCN1-NEXT: s_cbranch_execnz .LBB122_1 6545; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end 6546; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] 6547; GCN1-NEXT: s_setpc_b64 s[30:31] 6548; 6549; GCN2-LABEL: flat_atomic_min_i32_noret_offset_scalar: 6550; GCN2: ; %bb.0: 6551; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6552; GCN2-NEXT: s_add_u32 s34, s4, 16 6553; GCN2-NEXT: s_addc_u32 s35, s5, 0 6554; GCN2-NEXT: v_mov_b32_e32 v0, s34 6555; GCN2-NEXT: v_mov_b32_e32 v1, s35 6556; GCN2-NEXT: flat_load_dword v3, v[0:1] 6557; GCN2-NEXT: s_mov_b64 s[34:35], 0 6558; GCN2-NEXT: .LBB122_1: ; %atomicrmw.start 6559; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 6560; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6561; GCN2-NEXT: v_min_i32_e32 v2, s6, v3 6562; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 6563; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6564; GCN2-NEXT: buffer_wbinvl1_vol 6565; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 6566; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 6567; GCN2-NEXT: v_mov_b32_e32 v3, v2 6568; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] 6569; GCN2-NEXT: s_cbranch_execnz .LBB122_1 6570; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end 6571; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] 6572; GCN2-NEXT: s_setpc_b64 s[30:31] 6573; 6574; GCN3-LABEL: flat_atomic_min_i32_noret_offset_scalar: 6575; GCN3: ; %bb.0: 6576; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6577; GCN3-NEXT: v_mov_b32_e32 v0, s4 6578; GCN3-NEXT: v_mov_b32_e32 v1, s5 6579; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 6580; GCN3-NEXT: s_mov_b64 s[34:35], 0 6581; GCN3-NEXT: .LBB122_1: ; %atomicrmw.start 6582; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 6583; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6584; GCN3-NEXT: v_min_i32_e32 v2, s6, v3 6585; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 6586; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6587; GCN3-NEXT: buffer_wbinvl1_vol 6588; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 6589; GCN3-NEXT: 
s_or_b64 s[34:35], vcc, s[34:35] 6590; GCN3-NEXT: v_mov_b32_e32 v3, v2 6591; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] 6592; GCN3-NEXT: s_cbranch_execnz .LBB122_1 6593; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end 6594; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] 6595; GCN3-NEXT: s_setpc_b64 s[30:31] 6596 %gep = getelementptr i32, ptr %out, i32 4 6597 %tmp0 = atomicrmw min ptr %gep, i32 %in seq_cst 6598 ret void 6599} 6600 6601define amdgpu_gfx i32 @flat_atomic_min_i32_ret_scalar(ptr inreg %ptr, i32 inreg %in) { 6602; GCN1-LABEL: flat_atomic_min_i32_ret_scalar: 6603; GCN1: ; %bb.0: 6604; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6605; GCN1-NEXT: v_mov_b32_e32 v0, s4 6606; GCN1-NEXT: v_mov_b32_e32 v1, s5 6607; GCN1-NEXT: flat_load_dword v0, v[0:1] 6608; GCN1-NEXT: v_mov_b32_e32 v1, s4 6609; GCN1-NEXT: s_mov_b64 s[34:35], 0 6610; GCN1-NEXT: v_mov_b32_e32 v2, s5 6611; GCN1-NEXT: .LBB123_1: ; %atomicrmw.start 6612; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 6613; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6614; GCN1-NEXT: v_mov_b32_e32 v4, v0 6615; GCN1-NEXT: v_min_i32_e32 v3, s6, v4 6616; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc 6617; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6618; GCN1-NEXT: buffer_wbinvl1_vol 6619; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 6620; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 6621; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] 6622; GCN1-NEXT: s_cbranch_execnz .LBB123_1 6623; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end 6624; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] 6625; GCN1-NEXT: s_setpc_b64 s[30:31] 6626; 6627; GCN2-LABEL: flat_atomic_min_i32_ret_scalar: 6628; GCN2: ; %bb.0: 6629; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6630; GCN2-NEXT: v_mov_b32_e32 v0, s4 6631; GCN2-NEXT: v_mov_b32_e32 v1, s5 6632; GCN2-NEXT: flat_load_dword v0, v[0:1] 6633; GCN2-NEXT: v_mov_b32_e32 v1, s4 6634; GCN2-NEXT: s_mov_b64 s[34:35], 0 6635; GCN2-NEXT: v_mov_b32_e32 v2, s5 6636; GCN2-NEXT: .LBB123_1: ; %atomicrmw.start 6637; GCN2-NEXT: ; 
=>This Inner Loop Header: Depth=1 6638; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6639; GCN2-NEXT: v_mov_b32_e32 v4, v0 6640; GCN2-NEXT: v_min_i32_e32 v3, s6, v4 6641; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc 6642; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6643; GCN2-NEXT: buffer_wbinvl1_vol 6644; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 6645; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 6646; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] 6647; GCN2-NEXT: s_cbranch_execnz .LBB123_1 6648; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end 6649; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] 6650; GCN2-NEXT: s_setpc_b64 s[30:31] 6651; 6652; GCN3-LABEL: flat_atomic_min_i32_ret_scalar: 6653; GCN3: ; %bb.0: 6654; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6655; GCN3-NEXT: v_mov_b32_e32 v0, s4 6656; GCN3-NEXT: v_mov_b32_e32 v1, s5 6657; GCN3-NEXT: flat_load_dword v0, v[0:1] 6658; GCN3-NEXT: v_mov_b32_e32 v1, s4 6659; GCN3-NEXT: s_mov_b64 s[34:35], 0 6660; GCN3-NEXT: v_mov_b32_e32 v2, s5 6661; GCN3-NEXT: .LBB123_1: ; %atomicrmw.start 6662; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 6663; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6664; GCN3-NEXT: v_mov_b32_e32 v4, v0 6665; GCN3-NEXT: v_min_i32_e32 v3, s6, v4 6666; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc 6667; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6668; GCN3-NEXT: buffer_wbinvl1_vol 6669; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 6670; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 6671; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] 6672; GCN3-NEXT: s_cbranch_execnz .LBB123_1 6673; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end 6674; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] 6675; GCN3-NEXT: s_setpc_b64 s[30:31] 6676 %result = atomicrmw min ptr %ptr, i32 %in seq_cst 6677 ret i32 %result 6678} 6679 6680define amdgpu_gfx i32 @flat_atomic_min_i32_ret_offset_scalar(ptr inreg %out, i32 inreg %in) { 6681; GCN1-LABEL: flat_atomic_min_i32_ret_offset_scalar: 6682; GCN1: ; %bb.0: 6683; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
6684; GCN1-NEXT: s_add_u32 s34, s4, 16 6685; GCN1-NEXT: s_addc_u32 s35, s5, 0 6686; GCN1-NEXT: v_mov_b32_e32 v1, s34 6687; GCN1-NEXT: v_mov_b32_e32 v2, s35 6688; GCN1-NEXT: flat_load_dword v0, v[1:2] 6689; GCN1-NEXT: s_mov_b64 s[34:35], 0 6690; GCN1-NEXT: .LBB124_1: ; %atomicrmw.start 6691; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 6692; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6693; GCN1-NEXT: v_mov_b32_e32 v4, v0 6694; GCN1-NEXT: v_min_i32_e32 v3, s6, v4 6695; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc 6696; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6697; GCN1-NEXT: buffer_wbinvl1_vol 6698; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 6699; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 6700; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] 6701; GCN1-NEXT: s_cbranch_execnz .LBB124_1 6702; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end 6703; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] 6704; GCN1-NEXT: s_setpc_b64 s[30:31] 6705; 6706; GCN2-LABEL: flat_atomic_min_i32_ret_offset_scalar: 6707; GCN2: ; %bb.0: 6708; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6709; GCN2-NEXT: s_add_u32 s34, s4, 16 6710; GCN2-NEXT: s_addc_u32 s35, s5, 0 6711; GCN2-NEXT: v_mov_b32_e32 v1, s34 6712; GCN2-NEXT: v_mov_b32_e32 v2, s35 6713; GCN2-NEXT: flat_load_dword v0, v[1:2] 6714; GCN2-NEXT: s_mov_b64 s[34:35], 0 6715; GCN2-NEXT: .LBB124_1: ; %atomicrmw.start 6716; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 6717; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6718; GCN2-NEXT: v_mov_b32_e32 v4, v0 6719; GCN2-NEXT: v_min_i32_e32 v3, s6, v4 6720; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc 6721; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6722; GCN2-NEXT: buffer_wbinvl1_vol 6723; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 6724; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 6725; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] 6726; GCN2-NEXT: s_cbranch_execnz .LBB124_1 6727; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end 6728; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] 6729; GCN2-NEXT: s_setpc_b64 s[30:31] 
6730; 6731; GCN3-LABEL: flat_atomic_min_i32_ret_offset_scalar: 6732; GCN3: ; %bb.0: 6733; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6734; GCN3-NEXT: v_mov_b32_e32 v0, s4 6735; GCN3-NEXT: v_mov_b32_e32 v1, s5 6736; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 6737; GCN3-NEXT: v_mov_b32_e32 v1, s4 6738; GCN3-NEXT: s_mov_b64 s[34:35], 0 6739; GCN3-NEXT: v_mov_b32_e32 v2, s5 6740; GCN3-NEXT: .LBB124_1: ; %atomicrmw.start 6741; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 6742; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6743; GCN3-NEXT: v_mov_b32_e32 v4, v0 6744; GCN3-NEXT: v_min_i32_e32 v3, s6, v4 6745; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:16 glc 6746; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6747; GCN3-NEXT: buffer_wbinvl1_vol 6748; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 6749; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] 6750; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] 6751; GCN3-NEXT: s_cbranch_execnz .LBB124_1 6752; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end 6753; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] 6754; GCN3-NEXT: s_setpc_b64 s[30:31] 6755 %gep = getelementptr i32, ptr %out, i32 4 6756 %result = atomicrmw min ptr %gep, i32 %in seq_cst 6757 ret i32 %result 6758} 6759 6760define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 %index) { 6761; GCN1-LABEL: atomic_min_i32_addr64_offset: 6762; GCN1: ; %bb.0: ; %entry 6763; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 6764; GCN1-NEXT: s_waitcnt lgkmcnt(0) 6765; GCN1-NEXT: s_ashr_i32 s5, s3, 31 6766; GCN1-NEXT: s_mov_b32 s4, s3 6767; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 6768; GCN1-NEXT: s_add_u32 s0, s0, s4 6769; GCN1-NEXT: s_addc_u32 s1, s1, s5 6770; GCN1-NEXT: s_add_u32 s0, s0, 16 6771; GCN1-NEXT: s_addc_u32 s1, s1, 0 6772; GCN1-NEXT: v_mov_b32_e32 v0, s0 6773; GCN1-NEXT: v_mov_b32_e32 v1, s1 6774; GCN1-NEXT: flat_load_dword v3, v[0:1] 6775; GCN1-NEXT: s_mov_b64 s[0:1], 0 6776; GCN1-NEXT: .LBB125_1: ; %atomicrmw.start 6777; GCN1-NEXT: ; =>This Inner Loop Header: 
Depth=1 6778; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6779; GCN1-NEXT: v_min_i32_e32 v2, s2, v3 6780; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 6781; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6782; GCN1-NEXT: buffer_wbinvl1_vol 6783; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 6784; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 6785; GCN1-NEXT: v_mov_b32_e32 v3, v2 6786; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] 6787; GCN1-NEXT: s_cbranch_execnz .LBB125_1 6788; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end 6789; GCN1-NEXT: s_endpgm 6790; 6791; GCN2-LABEL: atomic_min_i32_addr64_offset: 6792; GCN2: ; %bb.0: ; %entry 6793; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 6794; GCN2-NEXT: s_waitcnt lgkmcnt(0) 6795; GCN2-NEXT: s_ashr_i32 s5, s3, 31 6796; GCN2-NEXT: s_mov_b32 s4, s3 6797; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 6798; GCN2-NEXT: s_add_u32 s0, s0, s4 6799; GCN2-NEXT: s_addc_u32 s1, s1, s5 6800; GCN2-NEXT: s_add_u32 s0, s0, 16 6801; GCN2-NEXT: s_addc_u32 s1, s1, 0 6802; GCN2-NEXT: v_mov_b32_e32 v0, s0 6803; GCN2-NEXT: v_mov_b32_e32 v1, s1 6804; GCN2-NEXT: flat_load_dword v3, v[0:1] 6805; GCN2-NEXT: s_mov_b64 s[0:1], 0 6806; GCN2-NEXT: .LBB125_1: ; %atomicrmw.start 6807; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 6808; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6809; GCN2-NEXT: v_min_i32_e32 v2, s2, v3 6810; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 6811; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6812; GCN2-NEXT: buffer_wbinvl1_vol 6813; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 6814; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 6815; GCN2-NEXT: v_mov_b32_e32 v3, v2 6816; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] 6817; GCN2-NEXT: s_cbranch_execnz .LBB125_1 6818; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end 6819; GCN2-NEXT: s_endpgm 6820; 6821; GCN3-LABEL: atomic_min_i32_addr64_offset: 6822; GCN3: ; %bb.0: ; %entry 6823; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 6824; GCN3-NEXT: s_waitcnt lgkmcnt(0) 6825; GCN3-NEXT: s_ashr_i32 s5, s3, 31 6826; GCN3-NEXT: s_mov_b32 
s4, s3 6827; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 6828; GCN3-NEXT: s_add_u32 s0, s0, s4 6829; GCN3-NEXT: s_addc_u32 s1, s1, s5 6830; GCN3-NEXT: v_mov_b32_e32 v0, s0 6831; GCN3-NEXT: v_mov_b32_e32 v1, s1 6832; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 6833; GCN3-NEXT: s_mov_b64 s[0:1], 0 6834; GCN3-NEXT: .LBB125_1: ; %atomicrmw.start 6835; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 6836; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6837; GCN3-NEXT: v_min_i32_e32 v2, s2, v3 6838; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 6839; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6840; GCN3-NEXT: buffer_wbinvl1_vol 6841; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 6842; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 6843; GCN3-NEXT: v_mov_b32_e32 v3, v2 6844; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] 6845; GCN3-NEXT: s_cbranch_execnz .LBB125_1 6846; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end 6847; GCN3-NEXT: s_endpgm 6848entry: 6849 %ptr = getelementptr i32, ptr %out, i32 %index 6850 %gep = getelementptr i32, ptr %ptr, i32 4 6851 %tmp0 = atomicrmw min ptr %gep, i32 %in seq_cst 6852 ret void 6853} 6854 6855define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i32 %index) { 6856; GCN1-LABEL: atomic_min_i32_ret_addr64_offset: 6857; GCN1: ; %bb.0: ; %entry 6858; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd 6859; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 6860; GCN1-NEXT: s_waitcnt lgkmcnt(0) 6861; GCN1-NEXT: s_ashr_i32 s5, s7, 31 6862; GCN1-NEXT: s_mov_b32 s4, s7 6863; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 6864; GCN1-NEXT: s_add_u32 s0, s0, s4 6865; GCN1-NEXT: s_addc_u32 s1, s1, s5 6866; GCN1-NEXT: s_add_u32 s0, s0, 16 6867; GCN1-NEXT: s_addc_u32 s1, s1, 0 6868; GCN1-NEXT: v_mov_b32_e32 v0, s0 6869; GCN1-NEXT: v_mov_b32_e32 v1, s1 6870; GCN1-NEXT: flat_load_dword v2, v[0:1] 6871; GCN1-NEXT: s_mov_b64 s[0:1], 0 6872; GCN1-NEXT: .LBB126_1: ; %atomicrmw.start 6873; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 6874; GCN1-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) 6875; GCN1-NEXT: v_mov_b32_e32 v3, v2 6876; GCN1-NEXT: v_min_i32_e32 v2, s6, v3 6877; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 6878; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6879; GCN1-NEXT: buffer_wbinvl1_vol 6880; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 6881; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 6882; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] 6883; GCN1-NEXT: s_cbranch_execnz .LBB126_1 6884; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end 6885; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] 6886; GCN1-NEXT: v_mov_b32_e32 v0, s2 6887; GCN1-NEXT: v_mov_b32_e32 v1, s3 6888; GCN1-NEXT: flat_store_dword v[0:1], v2 6889; GCN1-NEXT: s_endpgm 6890; 6891; GCN2-LABEL: atomic_min_i32_ret_addr64_offset: 6892; GCN2: ; %bb.0: ; %entry 6893; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 6894; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 6895; GCN2-NEXT: s_waitcnt lgkmcnt(0) 6896; GCN2-NEXT: s_ashr_i32 s5, s7, 31 6897; GCN2-NEXT: s_mov_b32 s4, s7 6898; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 6899; GCN2-NEXT: s_add_u32 s0, s0, s4 6900; GCN2-NEXT: s_addc_u32 s1, s1, s5 6901; GCN2-NEXT: s_add_u32 s0, s0, 16 6902; GCN2-NEXT: s_addc_u32 s1, s1, 0 6903; GCN2-NEXT: v_mov_b32_e32 v0, s0 6904; GCN2-NEXT: v_mov_b32_e32 v1, s1 6905; GCN2-NEXT: flat_load_dword v2, v[0:1] 6906; GCN2-NEXT: s_mov_b64 s[0:1], 0 6907; GCN2-NEXT: .LBB126_1: ; %atomicrmw.start 6908; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 6909; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6910; GCN2-NEXT: v_mov_b32_e32 v3, v2 6911; GCN2-NEXT: v_min_i32_e32 v2, s6, v3 6912; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 6913; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6914; GCN2-NEXT: buffer_wbinvl1_vol 6915; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 6916; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 6917; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] 6918; GCN2-NEXT: s_cbranch_execnz .LBB126_1 6919; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end 6920; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] 6921; GCN2-NEXT: 
v_mov_b32_e32 v0, s2 6922; GCN2-NEXT: v_mov_b32_e32 v1, s3 6923; GCN2-NEXT: flat_store_dword v[0:1], v2 6924; GCN2-NEXT: s_endpgm 6925; 6926; GCN3-LABEL: atomic_min_i32_ret_addr64_offset: 6927; GCN3: ; %bb.0: ; %entry 6928; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 6929; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 6930; GCN3-NEXT: s_waitcnt lgkmcnt(0) 6931; GCN3-NEXT: s_ashr_i32 s5, s7, 31 6932; GCN3-NEXT: s_mov_b32 s4, s7 6933; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 6934; GCN3-NEXT: s_add_u32 s0, s0, s4 6935; GCN3-NEXT: s_addc_u32 s1, s1, s5 6936; GCN3-NEXT: v_mov_b32_e32 v0, s0 6937; GCN3-NEXT: v_mov_b32_e32 v1, s1 6938; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 6939; GCN3-NEXT: s_mov_b64 s[0:1], 0 6940; GCN3-NEXT: .LBB126_1: ; %atomicrmw.start 6941; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 6942; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6943; GCN3-NEXT: v_mov_b32_e32 v3, v2 6944; GCN3-NEXT: v_min_i32_e32 v2, s6, v3 6945; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 6946; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6947; GCN3-NEXT: buffer_wbinvl1_vol 6948; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 6949; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 6950; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] 6951; GCN3-NEXT: s_cbranch_execnz .LBB126_1 6952; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end 6953; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] 6954; GCN3-NEXT: v_mov_b32_e32 v0, s2 6955; GCN3-NEXT: v_mov_b32_e32 v1, s3 6956; GCN3-NEXT: flat_store_dword v[0:1], v2 6957; GCN3-NEXT: s_endpgm 6958entry: 6959 %ptr = getelementptr i32, ptr %out, i32 %index 6960 %gep = getelementptr i32, ptr %ptr, i32 4 6961 %tmp0 = atomicrmw min ptr %gep, i32 %in seq_cst 6962 store i32 %tmp0, ptr %out2 6963 ret void 6964} 6965 6966define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) { 6967; GCN1-LABEL: atomic_min_i32: 6968; GCN1: ; %bb.0: ; %entry 6969; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x9 6970; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb 6971; GCN1-NEXT: s_mov_b64 
s[0:1], 0 6972; GCN1-NEXT: s_waitcnt lgkmcnt(0) 6973; GCN1-NEXT: v_mov_b32_e32 v0, s6 6974; GCN1-NEXT: v_mov_b32_e32 v1, s7 6975; GCN1-NEXT: flat_load_dword v3, v[0:1] 6976; GCN1-NEXT: .LBB127_1: ; %atomicrmw.start 6977; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 6978; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6979; GCN1-NEXT: v_min_i32_e32 v2, s2, v3 6980; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 6981; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6982; GCN1-NEXT: buffer_wbinvl1_vol 6983; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 6984; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 6985; GCN1-NEXT: v_mov_b32_e32 v3, v2 6986; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] 6987; GCN1-NEXT: s_cbranch_execnz .LBB127_1 6988; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end 6989; GCN1-NEXT: s_endpgm 6990; 6991; GCN2-LABEL: atomic_min_i32: 6992; GCN2: ; %bb.0: ; %entry 6993; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 6994; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c 6995; GCN2-NEXT: s_mov_b64 s[0:1], 0 6996; GCN2-NEXT: s_waitcnt lgkmcnt(0) 6997; GCN2-NEXT: v_mov_b32_e32 v0, s6 6998; GCN2-NEXT: v_mov_b32_e32 v1, s7 6999; GCN2-NEXT: flat_load_dword v3, v[0:1] 7000; GCN2-NEXT: .LBB127_1: ; %atomicrmw.start 7001; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 7002; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7003; GCN2-NEXT: v_min_i32_e32 v2, s2, v3 7004; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7005; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7006; GCN2-NEXT: buffer_wbinvl1_vol 7007; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 7008; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 7009; GCN2-NEXT: v_mov_b32_e32 v3, v2 7010; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] 7011; GCN2-NEXT: s_cbranch_execnz .LBB127_1 7012; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end 7013; GCN2-NEXT: s_endpgm 7014; 7015; GCN3-LABEL: atomic_min_i32: 7016; GCN3: ; %bb.0: ; %entry 7017; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 7018; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c 7019; GCN3-NEXT: s_mov_b64 s[0:1], 0 7020; 
GCN3-NEXT: s_waitcnt lgkmcnt(0) 7021; GCN3-NEXT: v_mov_b32_e32 v0, s6 7022; GCN3-NEXT: v_mov_b32_e32 v1, s7 7023; GCN3-NEXT: flat_load_dword v3, v[0:1] 7024; GCN3-NEXT: .LBB127_1: ; %atomicrmw.start 7025; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 7026; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7027; GCN3-NEXT: v_min_i32_e32 v2, s2, v3 7028; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7029; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7030; GCN3-NEXT: buffer_wbinvl1_vol 7031; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 7032; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 7033; GCN3-NEXT: v_mov_b32_e32 v3, v2 7034; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] 7035; GCN3-NEXT: s_cbranch_execnz .LBB127_1 7036; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end 7037; GCN3-NEXT: s_endpgm 7038entry: 7039 %tmp0 = atomicrmw min ptr %out, i32 %in seq_cst 7040 ret void 7041} 7042 7043define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i32 %index) { 7044; GCN1-LABEL: atomic_min_i32_ret_addr64: 7045; GCN1: ; %bb.0: ; %entry 7046; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd 7047; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 7048; GCN1-NEXT: s_waitcnt lgkmcnt(0) 7049; GCN1-NEXT: s_ashr_i32 s5, s7, 31 7050; GCN1-NEXT: s_mov_b32 s4, s7 7051; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 7052; GCN1-NEXT: s_add_u32 s0, s0, s4 7053; GCN1-NEXT: s_addc_u32 s1, s1, s5 7054; GCN1-NEXT: v_mov_b32_e32 v0, s0 7055; GCN1-NEXT: v_mov_b32_e32 v1, s1 7056; GCN1-NEXT: flat_load_dword v2, v[0:1] 7057; GCN1-NEXT: s_mov_b64 s[0:1], 0 7058; GCN1-NEXT: .LBB128_1: ; %atomicrmw.start 7059; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 7060; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7061; GCN1-NEXT: v_mov_b32_e32 v3, v2 7062; GCN1-NEXT: v_min_i32_e32 v2, s6, v3 7063; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7064; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7065; GCN1-NEXT: buffer_wbinvl1_vol 7066; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 7067; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 
7068; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] 7069; GCN1-NEXT: s_cbranch_execnz .LBB128_1 7070; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end 7071; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] 7072; GCN1-NEXT: v_mov_b32_e32 v0, s2 7073; GCN1-NEXT: v_mov_b32_e32 v1, s3 7074; GCN1-NEXT: flat_store_dword v[0:1], v2 7075; GCN1-NEXT: s_endpgm 7076; 7077; GCN2-LABEL: atomic_min_i32_ret_addr64: 7078; GCN2: ; %bb.0: ; %entry 7079; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 7080; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 7081; GCN2-NEXT: s_waitcnt lgkmcnt(0) 7082; GCN2-NEXT: s_ashr_i32 s5, s7, 31 7083; GCN2-NEXT: s_mov_b32 s4, s7 7084; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 7085; GCN2-NEXT: s_add_u32 s0, s0, s4 7086; GCN2-NEXT: s_addc_u32 s1, s1, s5 7087; GCN2-NEXT: v_mov_b32_e32 v0, s0 7088; GCN2-NEXT: v_mov_b32_e32 v1, s1 7089; GCN2-NEXT: flat_load_dword v2, v[0:1] 7090; GCN2-NEXT: s_mov_b64 s[0:1], 0 7091; GCN2-NEXT: .LBB128_1: ; %atomicrmw.start 7092; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 7093; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7094; GCN2-NEXT: v_mov_b32_e32 v3, v2 7095; GCN2-NEXT: v_min_i32_e32 v2, s6, v3 7096; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7097; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7098; GCN2-NEXT: buffer_wbinvl1_vol 7099; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 7100; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 7101; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] 7102; GCN2-NEXT: s_cbranch_execnz .LBB128_1 7103; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end 7104; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] 7105; GCN2-NEXT: v_mov_b32_e32 v0, s2 7106; GCN2-NEXT: v_mov_b32_e32 v1, s3 7107; GCN2-NEXT: flat_store_dword v[0:1], v2 7108; GCN2-NEXT: s_endpgm 7109; 7110; GCN3-LABEL: atomic_min_i32_ret_addr64: 7111; GCN3: ; %bb.0: ; %entry 7112; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 7113; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 7114; GCN3-NEXT: s_waitcnt lgkmcnt(0) 7115; GCN3-NEXT: s_ashr_i32 s5, s7, 31 7116; GCN3-NEXT: s_mov_b32 s4, s7 7117; 
GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 7118; GCN3-NEXT: s_add_u32 s0, s0, s4 7119; GCN3-NEXT: s_addc_u32 s1, s1, s5 7120; GCN3-NEXT: v_mov_b32_e32 v0, s0 7121; GCN3-NEXT: v_mov_b32_e32 v1, s1 7122; GCN3-NEXT: flat_load_dword v2, v[0:1] 7123; GCN3-NEXT: s_mov_b64 s[0:1], 0 7124; GCN3-NEXT: .LBB128_1: ; %atomicrmw.start 7125; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 7126; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7127; GCN3-NEXT: v_mov_b32_e32 v3, v2 7128; GCN3-NEXT: v_min_i32_e32 v2, s6, v3 7129; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7130; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7131; GCN3-NEXT: buffer_wbinvl1_vol 7132; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 7133; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 7134; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] 7135; GCN3-NEXT: s_cbranch_execnz .LBB128_1 7136; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end 7137; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] 7138; GCN3-NEXT: v_mov_b32_e32 v0, s2 7139; GCN3-NEXT: v_mov_b32_e32 v1, s3 7140; GCN3-NEXT: flat_store_dword v[0:1], v2 7141; GCN3-NEXT: s_endpgm 7142entry: 7143 %ptr = getelementptr i32, ptr %out, i32 %index 7144 %tmp0 = atomicrmw min ptr %ptr, i32 %in seq_cst 7145 store i32 %tmp0, ptr %out2 7146 ret void 7147} 7148 7149define void @flat_min_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) { 7150; GCN1-LABEL: flat_min_i32_noret_offset__amdgpu_no_remote_memory: 7151; GCN1: ; %bb.0: 7152; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7153; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 7154; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 7155; GCN1-NEXT: flat_load_dword v4, v[0:1] 7156; GCN1-NEXT: s_mov_b64 s[4:5], 0 7157; GCN1-NEXT: .LBB129_1: ; %atomicrmw.start 7158; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 7159; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7160; GCN1-NEXT: v_min_i32_e32 v3, v4, v2 7161; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 7162; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7163; GCN1-NEXT: buffer_wbinvl1_vol 7164; 
GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 7165; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7166; GCN1-NEXT: v_mov_b32_e32 v4, v3 7167; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] 7168; GCN1-NEXT: s_cbranch_execnz .LBB129_1 7169; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end 7170; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] 7171; GCN1-NEXT: s_setpc_b64 s[30:31] 7172; 7173; GCN2-LABEL: flat_min_i32_noret_offset__amdgpu_no_remote_memory: 7174; GCN2: ; %bb.0: 7175; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7176; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 7177; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 7178; GCN2-NEXT: flat_load_dword v4, v[0:1] 7179; GCN2-NEXT: s_mov_b64 s[4:5], 0 7180; GCN2-NEXT: .LBB129_1: ; %atomicrmw.start 7181; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 7182; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7183; GCN2-NEXT: v_min_i32_e32 v3, v4, v2 7184; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 7185; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7186; GCN2-NEXT: buffer_wbinvl1_vol 7187; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 7188; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7189; GCN2-NEXT: v_mov_b32_e32 v4, v3 7190; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] 7191; GCN2-NEXT: s_cbranch_execnz .LBB129_1 7192; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end 7193; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] 7194; GCN2-NEXT: s_setpc_b64 s[30:31] 7195; 7196; GCN3-LABEL: flat_min_i32_noret_offset__amdgpu_no_remote_memory: 7197; GCN3: ; %bb.0: 7198; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7199; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 7200; GCN3-NEXT: s_mov_b64 s[4:5], 0 7201; GCN3-NEXT: .LBB129_1: ; %atomicrmw.start 7202; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 7203; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7204; GCN3-NEXT: v_min_i32_e32 v3, v4, v2 7205; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc 7206; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7207; GCN3-NEXT: buffer_wbinvl1_vol 7208; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 7209; 
GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN3-NEXT: v_mov_b32_e32 v4, v3
; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_cbranch_execnz .LBB129_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i32, ptr %out, i64 4
  %tmp0 = atomicrmw min ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
  ret void
}

; Returning form of atomicrmw min at a 4 x i32 (16-byte) offset, tagged with
; !amdgpu.no.remote.memory. The expected code in all three runs is a
; flat_load_dword followed by a v_min_i32 + flat_atomic_cmpswap retry loop.
; Only the gfx900 run folds the offset into the memory instructions
; (offset:16); the bonaire and tonga runs add 16 to the pointer first.
define i32 @flat_atomic_min_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
; GCN1-LABEL: flat_atomic_min_i32_ret_offset__amdgpu_no_remote_memory:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0
; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
; GCN1-NEXT: flat_load_dword v0, v[3:4]
; GCN1-NEXT: s_mov_b64 s[4:5], 0
; GCN1-NEXT: .LBB130_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v1, v0
; GCN1-NEXT: v_min_i32_e32 v0, v1, v2
; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_cbranch_execnz .LBB130_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_min_i32_ret_offset__amdgpu_no_remote_memory:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0
; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
; GCN2-NEXT: flat_load_dword v0, v[3:4]
; GCN2-NEXT: s_mov_b64 s[4:5], 0
; GCN2-NEXT: .LBB130_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v1, v0
; GCN2-NEXT: v_min_i32_e32 v0, v1, v2
; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_cbranch_execnz .LBB130_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_min_i32_ret_offset__amdgpu_no_remote_memory:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16
; GCN3-NEXT: s_mov_b64 s[4:5], 0
; GCN3-NEXT: .LBB130_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v4, v3
; GCN3-NEXT: v_min_i32_e32 v3, v4, v2
; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_cbranch_execnz .LBB130_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: v_mov_b32_e32 v0, v3
; GCN3-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i32, ptr %out, i64 4
  %result = atomicrmw min ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
  ret i32 %result
}

; ---------------------------------------------------------------------
; atomicrmw uinc_wrap
; ---------------------------------------------------------------------

; uinc_wrap on a flat pointer with the result unused. Every run expects a
; single flat_atomic_inc without glc, followed by a wait and
; buffer_wbinvl1_vol.
define void @flat_atomic_uinc_wrap_i32_noret(ptr %ptr, i32 %in) {
; GCN1-LABEL: flat_atomic_uinc_wrap_i32_noret:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: flat_atomic_inc v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_uinc_wrap_i32_noret:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: flat_atomic_inc v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_uinc_wrap_i32_noret:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_inc v[0:1], v2
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
  %tmp0 = atomicrmw uinc_wrap ptr %ptr, i32 %in seq_cst
  ret void
}

; No-return uinc_wrap through a gep of 4 x i32 (16 bytes). The bonaire and
; tonga runs add 16 to the pointer with a v_add/v_addc pair; the gfx900 run
; encodes the offset on the atomic itself (offset:16).
define void @flat_atomic_uinc_wrap_i32_noret_offset(ptr %out, i32 %in) {
; GCN1-LABEL: flat_atomic_uinc_wrap_i32_noret_offset:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN1-NEXT: flat_atomic_inc v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_uinc_wrap_i32_noret_offset:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN2-NEXT: flat_atomic_inc v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_uinc_wrap_i32_noret_offset:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_inc v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i32, ptr %out, i32 4
  %tmp0 = atomicrmw uinc_wrap ptr %gep, i32 %in seq_cst
  ret void
}

; uinc_wrap with the old value returned. Every run expects flat_atomic_inc
; with glc, writing its result to v0.
define i32 @flat_atomic_uinc_wrap_i32_ret(ptr %ptr, i32 %in) {
; GCN1-LABEL: flat_atomic_uinc_wrap_i32_ret:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_uinc_wrap_i32_ret:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_uinc_wrap_i32_ret:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
  %result = atomicrmw uinc_wrap ptr %ptr, i32 %in seq_cst
  ret i32 %result
}

; Returning uinc_wrap through a gep of 4 x i32 (16 bytes). Offset handling
; matches the no-return offset test: explicit 64-bit add in the bonaire and
; tonga runs, offset:16 in the gfx900 run.
define i32 @flat_atomic_uinc_wrap_i32_ret_offset(ptr %out, i32 %in) {
; GCN1-LABEL: flat_atomic_uinc_wrap_i32_ret_offset:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN1-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_uinc_wrap_i32_ret_offset:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN2-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_uinc_wrap_i32_ret_offset:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_inc v0, v[0:1], v2 offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i32, ptr %out, i32 4
  %result = atomicrmw uinc_wrap ptr %gep, i32 %in seq_cst
  ret i32 %result
}

; amdgpu_gfx variant with inreg arguments. The checks show the pointer in
; s4/s5 and the operand in s6, each copied to a VGPR before the no-return
; flat_atomic_inc.
define amdgpu_gfx void @flat_atomic_uinc_wrap_i32_noret_scalar(ptr inreg %ptr, i32 inreg %in) {
; GCN1-LABEL: flat_atomic_uinc_wrap_i32_noret_scalar:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: flat_atomic_inc v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_uinc_wrap_i32_noret_scalar:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: flat_atomic_inc v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_uinc_wrap_i32_noret_scalar:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_atomic_inc v[0:1], v2
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
  %tmp0 = atomicrmw uinc_wrap ptr %ptr, i32 %in seq_cst
  ret void
}

; inreg no-return variant with a 16-byte offset. The bonaire and tonga runs
; compute the address with s_add_u32/s_addc_u32 into s34/s35 before copying it
; to VGPRs; the gfx900 run copies s4/s5 unchanged and uses offset:16.
define amdgpu_gfx void @flat_atomic_uinc_wrap_i32_noret_offset_scalar(ptr inreg %out, i32 inreg %in) {
; GCN1-LABEL: flat_atomic_uinc_wrap_i32_noret_offset_scalar:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s34, s4, 16
; GCN1-NEXT: s_addc_u32 s35, s5, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s34
; GCN1-NEXT: v_mov_b32_e32 v1, s35
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: flat_atomic_inc v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_uinc_wrap_i32_noret_offset_scalar:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s34, s4, 16
; GCN2-NEXT: s_addc_u32 s35, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s34
; GCN2-NEXT: v_mov_b32_e32 v1, s35
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: flat_atomic_inc v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_uinc_wrap_i32_noret_offset_scalar:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_atomic_inc v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i32, ptr %out, i32 4
  %tmp0 = atomicrmw uinc_wrap ptr %gep, i32 %in seq_cst
  ret void
}

; inreg returning variant: same SGPR-to-VGPR copies as the no-return scalar
; test, with the atomic carrying glc and writing v0.
define amdgpu_gfx i32 @flat_atomic_uinc_wrap_i32_ret_scalar(ptr inreg %ptr, i32 inreg %in) {
; GCN1-LABEL: flat_atomic_uinc_wrap_i32_ret_scalar:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_uinc_wrap_i32_ret_scalar:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_uinc_wrap_i32_ret_scalar:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
  %result = atomicrmw uinc_wrap ptr %ptr, i32 %in seq_cst
  ret i32 %result
}

; inreg returning variant with a 16-byte offset: scalar add into s34/s35 in
; the bonaire and tonga runs, offset:16 in the gfx900 run; glc on the atomic in
; all three.
define amdgpu_gfx i32 @flat_atomic_uinc_wrap_i32_ret_offset_scalar(ptr inreg %out, i32 inreg %in) {
; GCN1-LABEL: flat_atomic_uinc_wrap_i32_ret_offset_scalar:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s34, s4, 16
; GCN1-NEXT: s_addc_u32 s35, s5, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s34
; GCN1-NEXT: v_mov_b32_e32 v1, s35
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_uinc_wrap_i32_ret_offset_scalar:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s34, s4, 16
; GCN2-NEXT: s_addc_u32 s35, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s34
; GCN2-NEXT: v_mov_b32_e32 v1, s35
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_uinc_wrap_i32_ret_offset_scalar:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_atomic_inc v0, v[0:1], v2 offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i32, ptr %out, i32 4
  %result = atomicrmw uinc_wrap ptr %gep, i32 %in seq_cst
  ret i32 %result
}

; No-return uinc_wrap at a 16-byte offset, tagged with
; !amdgpu.no.remote.memory. The expected instructions are the same as in
; flat_atomic_uinc_wrap_i32_noret_offset (a single flat_atomic_inc, no
; compare-and-swap loop).
; NOTE(review): this name lacks the `atomic_` infix used by the sibling tests
; (flat_atomic_uinc_wrap_...) - confirm whether that is intentional before
; renaming, since the LABEL checks must change with it.
define void @flat_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
; GCN1-LABEL: flat_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN1-NEXT: flat_atomic_inc v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN2-NEXT: flat_atomic_inc v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_inc v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i32, ptr %out, i64 4
  %tmp0 = atomicrmw uinc_wrap ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
  ret void
}

; Returning uinc_wrap at a 16-byte offset, tagged with
; !amdgpu.no.remote.memory. The expected instructions are the same as in
; flat_atomic_uinc_wrap_i32_ret_offset.
define i32 @flat_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
; GCN1-LABEL: flat_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN1-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN2-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_inc v0, v[0:1], v2 offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i32, ptr %out, i64 4
  %result = atomicrmw uinc_wrap ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
  ret i32 %result
}

; ---------------------------------------------------------------------
; atomicrmw udec_wrap
; ---------------------------------------------------------------------

; udec_wrap on a flat pointer with the result unused. Every run expects a
; single flat_atomic_dec without glc, followed by a wait and
; buffer_wbinvl1_vol.
define void @flat_atomic_udec_wrap_i32_noret(ptr %ptr, i32 %in) {
; GCN1-LABEL: flat_atomic_udec_wrap_i32_noret:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: flat_atomic_dec v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_udec_wrap_i32_noret:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: flat_atomic_dec v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_udec_wrap_i32_noret:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_dec v[0:1], v2
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
  %tmp0 = atomicrmw udec_wrap ptr %ptr, i32 %in seq_cst
  ret void
}

; No-return udec_wrap through a gep of 4 x i32 (16 bytes). The bonaire and
; tonga runs add 16 to the pointer with a v_add/v_addc pair; the gfx900 run
; encodes the offset on the atomic itself (offset:16).
define void @flat_atomic_udec_wrap_i32_noret_offset(ptr %out, i32 %in) {
; GCN1-LABEL: flat_atomic_udec_wrap_i32_noret_offset:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN1-NEXT: flat_atomic_dec v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_udec_wrap_i32_noret_offset:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN2-NEXT: flat_atomic_dec v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_udec_wrap_i32_noret_offset:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_dec v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i32, ptr %out, i32 4
  %tmp0 = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst
  ret void
}

; udec_wrap with the old value returned. Every run expects flat_atomic_dec
; with glc, writing its result to v0.
define i32 @flat_atomic_udec_wrap_i32_ret(ptr %ptr, i32 %in) {
; GCN1-LABEL: flat_atomic_udec_wrap_i32_ret:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_udec_wrap_i32_ret:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_udec_wrap_i32_ret:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
  %result = atomicrmw udec_wrap ptr %ptr, i32 %in seq_cst
  ret i32 %result
}

; Returning udec_wrap through a gep of 4 x i32 (16 bytes). Offset handling
; matches the no-return offset test: explicit 64-bit add in the bonaire and
; tonga runs, offset:16 in the gfx900 run.
define i32 @flat_atomic_udec_wrap_i32_ret_offset(ptr %out, i32 %in) {
; GCN1-LABEL: flat_atomic_udec_wrap_i32_ret_offset:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN1-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_udec_wrap_i32_ret_offset:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN2-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_udec_wrap_i32_ret_offset:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_dec v0, v[0:1], v2 offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i32, ptr %out, i32 4
  %result = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst
  ret i32 %result
}

; amdgpu_gfx variant with inreg arguments. The checks show the pointer in
; s4/s5 and the operand in s6, each copied to a VGPR before the no-return
; flat_atomic_dec.
define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_scalar(ptr inreg %ptr, i32 inreg %in) {
; GCN1-LABEL: flat_atomic_udec_wrap_i32_noret_scalar:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: flat_atomic_dec v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_udec_wrap_i32_noret_scalar:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: flat_atomic_dec v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_udec_wrap_i32_noret_scalar:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_atomic_dec v[0:1], v2
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
  %tmp0 = atomicrmw udec_wrap ptr %ptr, i32 %in seq_cst
  ret void
}

; inreg no-return variant with a 16-byte offset. The bonaire and tonga runs
; compute the address with s_add_u32/s_addc_u32 into s34/s35 before copying it
; to VGPRs; the gfx900 run copies s4/s5 unchanged and uses offset:16.
define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_offset_scalar(ptr inreg %out, i32 inreg %in) {
; GCN1-LABEL: flat_atomic_udec_wrap_i32_noret_offset_scalar:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s34, s4, 16
; GCN1-NEXT: s_addc_u32 s35, s5, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s34
; GCN1-NEXT: v_mov_b32_e32 v1, s35
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: flat_atomic_dec v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_udec_wrap_i32_noret_offset_scalar:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s34, s4, 16
; GCN2-NEXT: s_addc_u32 s35, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s34
; GCN2-NEXT: v_mov_b32_e32 v1, s35
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: flat_atomic_dec v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_udec_wrap_i32_noret_offset_scalar:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_atomic_dec v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i32, ptr %out, i32 4
  %tmp0 = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst
  ret void
}

; inreg returning variant: same SGPR-to-VGPR copies as the no-return scalar
; test, with the atomic carrying glc and writing v0.
define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_scalar(ptr inreg %ptr, i32 inreg %in) {
; GCN1-LABEL: flat_atomic_udec_wrap_i32_ret_scalar:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_udec_wrap_i32_ret_scalar:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_udec_wrap_i32_ret_scalar:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
  %result = atomicrmw udec_wrap ptr %ptr, i32 %in seq_cst
  ret i32 %result
}

; inreg returning variant with a 16-byte offset: scalar add into s34/s35 in
; the bonaire and tonga runs, offset:16 in the gfx900 run; glc on the atomic in
; all three.
define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_offset_scalar(ptr inreg %out, i32 inreg %in) {
; GCN1-LABEL: flat_atomic_udec_wrap_i32_ret_offset_scalar:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s34, s4, 16
; GCN1-NEXT: s_addc_u32 s35, s5, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s34
; GCN1-NEXT: v_mov_b32_e32 v1, s35
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_udec_wrap_i32_ret_offset_scalar:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s34, s4, 16
; GCN2-NEXT: s_addc_u32 s35, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s34
; GCN2-NEXT: v_mov_b32_e32 v1, s35
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_udec_wrap_i32_ret_offset_scalar:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_atomic_dec v0, v[0:1], v2 offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i32, ptr %out, i32 4
  %result = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst
  ret i32 %result
}

; No-return udec_wrap at a 16-byte offset, tagged with
; !amdgpu.no.remote.memory. The expected instructions are the same as in
; flat_atomic_udec_wrap_i32_noret_offset (a single flat_atomic_dec, no
; compare-and-swap loop).
; NOTE(review): this name lacks the `atomic_` infix used by the sibling tests
; (flat_atomic_udec_wrap_...) - confirm whether that is intentional before
; renaming, since the LABEL checks must change with it.
define void @flat_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
; GCN1-LABEL: flat_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN1-NEXT: flat_atomic_dec v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN2-NEXT: flat_atomic_dec v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_dec v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i32, ptr %out, i64 4
  %tmp0 = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
  ret void
}

; Returning udec_wrap at a 16-byte offset, tagged with
; !amdgpu.no.remote.memory. The expected instructions are the same as in
; flat_atomic_udec_wrap_i32_ret_offset.
define i32 @flat_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
; GCN1-LABEL: flat_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN1-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN2-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_atomic_dec v0, v[0:1], v2 offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i32, ptr %out, i64 4
  %result = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
  ret i32 %result
}

; Empty metadata node referenced by the !amdgpu.no.remote.memory attachments.
!0 = !{}