1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefixes=GCN1 %s 3; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN2 %s 4; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN3 %s 5; atomic_add_i32_offset: agent-scope seq_cst volatile atomicrmw add on %out plus four i32 elements (16 bytes); the returned value is unused, matching the no-return flat_atomic_add form checked below. 6define amdgpu_kernel void @atomic_add_i32_offset(ptr %out, i32 %in) { 7; GCN1-LABEL: atomic_add_i32_offset: 8; GCN1: ; %bb.0: ; %entry 9; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 10; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb 11; GCN1-NEXT: s_waitcnt lgkmcnt(0) 12; GCN1-NEXT: s_add_u32 s0, s0, 16 13; GCN1-NEXT: s_addc_u32 s1, s1, 0 14; GCN1-NEXT: v_mov_b32_e32 v0, s0 15; GCN1-NEXT: v_mov_b32_e32 v1, s1 16; GCN1-NEXT: v_mov_b32_e32 v2, s2 17; GCN1-NEXT: flat_atomic_add v[0:1], v2 18; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 19; GCN1-NEXT: buffer_wbinvl1_vol 20; GCN1-NEXT: s_endpgm 21; 22; GCN2-LABEL: atomic_add_i32_offset: 23; GCN2: ; %bb.0: ; %entry 24; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 25; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c 26; GCN2-NEXT: s_waitcnt lgkmcnt(0) 27; GCN2-NEXT: s_add_u32 s0, s0, 16 28; GCN2-NEXT: s_addc_u32 s1, s1, 0 29; GCN2-NEXT: v_mov_b32_e32 v0, s0 30; GCN2-NEXT: v_mov_b32_e32 v1, s1 31; GCN2-NEXT: v_mov_b32_e32 v2, s2 32; GCN2-NEXT: flat_atomic_add v[0:1], v2 33; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 34; GCN2-NEXT: buffer_wbinvl1_vol 35; GCN2-NEXT: s_endpgm 36; 37; GCN3-LABEL: atomic_add_i32_offset: 38; GCN3: ; %bb.0: ; %entry 39; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 40; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c 41; GCN3-NEXT: s_waitcnt lgkmcnt(0) 42; GCN3-NEXT: v_mov_b32_e32 v0, s0 43; GCN3-NEXT: v_mov_b32_e32 v1, s1 44; GCN3-NEXT: v_mov_b32_e32 v2, s2 45; GCN3-NEXT: flat_atomic_add v[0:1], v2 offset:16 46; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 47; GCN3-NEXT: buffer_wbinvl1_vol 48; GCN3-NEXT: s_endpgm 49entry: 50 %gep = getelementptr i32, ptr %out, i32 4 51 %val = atomicrmw volatile add ptr %gep, 
i32 %in syncscope("agent") seq_cst 52 ret void 53} 54 55define amdgpu_kernel void @atomic_add_i32_max_offset(ptr %out, i32 %in) { 56; GCN1-LABEL: atomic_add_i32_max_offset: 57; GCN1: ; %bb.0: ; %entry 58; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 59; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb 60; GCN1-NEXT: s_waitcnt lgkmcnt(0) 61; GCN1-NEXT: s_add_u32 s0, s0, 0xffc 62; GCN1-NEXT: s_addc_u32 s1, s1, 0 63; GCN1-NEXT: v_mov_b32_e32 v0, s0 64; GCN1-NEXT: v_mov_b32_e32 v1, s1 65; GCN1-NEXT: v_mov_b32_e32 v2, s2 66; GCN1-NEXT: flat_atomic_add v[0:1], v2 67; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 68; GCN1-NEXT: buffer_wbinvl1_vol 69; GCN1-NEXT: s_endpgm 70; 71; GCN2-LABEL: atomic_add_i32_max_offset: 72; GCN2: ; %bb.0: ; %entry 73; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 74; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c 75; GCN2-NEXT: s_waitcnt lgkmcnt(0) 76; GCN2-NEXT: s_add_u32 s0, s0, 0xffc 77; GCN2-NEXT: s_addc_u32 s1, s1, 0 78; GCN2-NEXT: v_mov_b32_e32 v0, s0 79; GCN2-NEXT: v_mov_b32_e32 v1, s1 80; GCN2-NEXT: v_mov_b32_e32 v2, s2 81; GCN2-NEXT: flat_atomic_add v[0:1], v2 82; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 83; GCN2-NEXT: buffer_wbinvl1_vol 84; GCN2-NEXT: s_endpgm 85; 86; GCN3-LABEL: atomic_add_i32_max_offset: 87; GCN3: ; %bb.0: ; %entry 88; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 89; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c 90; GCN3-NEXT: s_waitcnt lgkmcnt(0) 91; GCN3-NEXT: v_mov_b32_e32 v0, s0 92; GCN3-NEXT: v_mov_b32_e32 v1, s1 93; GCN3-NEXT: v_mov_b32_e32 v2, s2 94; GCN3-NEXT: flat_atomic_add v[0:1], v2 offset:4092 95; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 96; GCN3-NEXT: buffer_wbinvl1_vol 97; GCN3-NEXT: s_endpgm 98entry: 99 %gep = getelementptr i32, ptr %out, i32 1023 100 %val = atomicrmw volatile add ptr %gep, i32 %in syncscope("agent") seq_cst 101 ret void 102} 103 104define amdgpu_kernel void @atomic_add_i32_max_offset_p1(ptr %out, i32 %in) { 105; GCN1-LABEL: atomic_add_i32_max_offset_p1: 106; GCN1: ; %bb.0: ; %entry 107; GCN1-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x9 108; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb 109; GCN1-NEXT: s_waitcnt lgkmcnt(0) 110; GCN1-NEXT: s_add_u32 s0, s0, 0x1000 111; GCN1-NEXT: s_addc_u32 s1, s1, 0 112; GCN1-NEXT: v_mov_b32_e32 v0, s0 113; GCN1-NEXT: v_mov_b32_e32 v1, s1 114; GCN1-NEXT: v_mov_b32_e32 v2, s2 115; GCN1-NEXT: flat_atomic_add v[0:1], v2 116; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 117; GCN1-NEXT: buffer_wbinvl1_vol 118; GCN1-NEXT: s_endpgm 119; 120; GCN2-LABEL: atomic_add_i32_max_offset_p1: 121; GCN2: ; %bb.0: ; %entry 122; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 123; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c 124; GCN2-NEXT: s_waitcnt lgkmcnt(0) 125; GCN2-NEXT: s_add_u32 s0, s0, 0x1000 126; GCN2-NEXT: s_addc_u32 s1, s1, 0 127; GCN2-NEXT: v_mov_b32_e32 v0, s0 128; GCN2-NEXT: v_mov_b32_e32 v1, s1 129; GCN2-NEXT: v_mov_b32_e32 v2, s2 130; GCN2-NEXT: flat_atomic_add v[0:1], v2 131; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 132; GCN2-NEXT: buffer_wbinvl1_vol 133; GCN2-NEXT: s_endpgm 134; 135; GCN3-LABEL: atomic_add_i32_max_offset_p1: 136; GCN3: ; %bb.0: ; %entry 137; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 138; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c 139; GCN3-NEXT: s_waitcnt lgkmcnt(0) 140; GCN3-NEXT: v_mov_b32_e32 v0, s0 141; GCN3-NEXT: v_mov_b32_e32 v1, s1 142; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 143; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 144; GCN3-NEXT: v_mov_b32_e32 v2, s2 145; GCN3-NEXT: flat_atomic_add v[0:1], v2 146; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 147; GCN3-NEXT: buffer_wbinvl1_vol 148; GCN3-NEXT: s_endpgm 149entry: 150 %gep = getelementptr i32, ptr %out, i32 1024 151 %val = atomicrmw volatile add ptr %gep, i32 %in syncscope("agent") seq_cst 152 ret void 153} 154 155define amdgpu_kernel void @atomic_add_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { 156; GCN1-LABEL: atomic_add_i32_ret_offset: 157; GCN1: ; %bb.0: ; %entry 158; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 159; GCN1-NEXT: s_load_dword s4, 
s[4:5], 0xd 160; GCN1-NEXT: s_waitcnt lgkmcnt(0) 161; GCN1-NEXT: s_add_u32 s0, s0, 16 162; GCN1-NEXT: s_addc_u32 s1, s1, 0 163; GCN1-NEXT: v_mov_b32_e32 v0, s0 164; GCN1-NEXT: v_mov_b32_e32 v1, s1 165; GCN1-NEXT: v_mov_b32_e32 v2, s4 166; GCN1-NEXT: flat_atomic_add v2, v[0:1], v2 glc 167; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 168; GCN1-NEXT: buffer_wbinvl1_vol 169; GCN1-NEXT: v_mov_b32_e32 v0, s2 170; GCN1-NEXT: v_mov_b32_e32 v1, s3 171; GCN1-NEXT: flat_store_dword v[0:1], v2 172; GCN1-NEXT: s_endpgm 173; 174; GCN2-LABEL: atomic_add_i32_ret_offset: 175; GCN2: ; %bb.0: ; %entry 176; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 177; GCN2-NEXT: s_load_dword s4, s[4:5], 0x34 178; GCN2-NEXT: s_waitcnt lgkmcnt(0) 179; GCN2-NEXT: s_add_u32 s0, s0, 16 180; GCN2-NEXT: s_addc_u32 s1, s1, 0 181; GCN2-NEXT: v_mov_b32_e32 v0, s0 182; GCN2-NEXT: v_mov_b32_e32 v1, s1 183; GCN2-NEXT: v_mov_b32_e32 v2, s4 184; GCN2-NEXT: flat_atomic_add v2, v[0:1], v2 glc 185; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 186; GCN2-NEXT: buffer_wbinvl1_vol 187; GCN2-NEXT: v_mov_b32_e32 v0, s2 188; GCN2-NEXT: v_mov_b32_e32 v1, s3 189; GCN2-NEXT: flat_store_dword v[0:1], v2 190; GCN2-NEXT: s_endpgm 191; 192; GCN3-LABEL: atomic_add_i32_ret_offset: 193; GCN3: ; %bb.0: ; %entry 194; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 195; GCN3-NEXT: s_load_dword s6, s[4:5], 0x34 196; GCN3-NEXT: s_waitcnt lgkmcnt(0) 197; GCN3-NEXT: v_mov_b32_e32 v0, s0 198; GCN3-NEXT: v_mov_b32_e32 v1, s1 199; GCN3-NEXT: v_mov_b32_e32 v2, s6 200; GCN3-NEXT: flat_atomic_add v2, v[0:1], v2 offset:16 glc 201; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 202; GCN3-NEXT: buffer_wbinvl1_vol 203; GCN3-NEXT: v_mov_b32_e32 v0, s2 204; GCN3-NEXT: v_mov_b32_e32 v1, s3 205; GCN3-NEXT: flat_store_dword v[0:1], v2 206; GCN3-NEXT: s_endpgm 207entry: 208 %gep = getelementptr i32, ptr %out, i32 4 209 %val = atomicrmw volatile add ptr %gep, i32 %in syncscope("agent") seq_cst 210 store i32 %val, ptr %out2 211 ret void 212} 213 214define 
amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { 215; GCN1-LABEL: atomic_add_i32_addr64_offset: 216; GCN1: ; %bb.0: ; %entry 217; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 218; GCN1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9 219; GCN1-NEXT: s_load_dword s4, s[4:5], 0xb 220; GCN1-NEXT: s_waitcnt lgkmcnt(0) 221; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 222; GCN1-NEXT: s_add_u32 s0, s2, s0 223; GCN1-NEXT: s_addc_u32 s1, s3, s1 224; GCN1-NEXT: s_add_u32 s0, s0, 16 225; GCN1-NEXT: s_addc_u32 s1, s1, 0 226; GCN1-NEXT: v_mov_b32_e32 v0, s0 227; GCN1-NEXT: v_mov_b32_e32 v1, s1 228; GCN1-NEXT: v_mov_b32_e32 v2, s4 229; GCN1-NEXT: flat_atomic_add v[0:1], v2 230; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 231; GCN1-NEXT: buffer_wbinvl1_vol 232; GCN1-NEXT: s_endpgm 233; 234; GCN2-LABEL: atomic_add_i32_addr64_offset: 235; GCN2: ; %bb.0: ; %entry 236; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 237; GCN2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 238; GCN2-NEXT: s_load_dword s4, s[4:5], 0x2c 239; GCN2-NEXT: s_waitcnt lgkmcnt(0) 240; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 241; GCN2-NEXT: s_add_u32 s0, s2, s0 242; GCN2-NEXT: s_addc_u32 s1, s3, s1 243; GCN2-NEXT: s_add_u32 s0, s0, 16 244; GCN2-NEXT: s_addc_u32 s1, s1, 0 245; GCN2-NEXT: v_mov_b32_e32 v0, s0 246; GCN2-NEXT: v_mov_b32_e32 v1, s1 247; GCN2-NEXT: v_mov_b32_e32 v2, s4 248; GCN2-NEXT: flat_atomic_add v[0:1], v2 249; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 250; GCN2-NEXT: buffer_wbinvl1_vol 251; GCN2-NEXT: s_endpgm 252; 253; GCN3-LABEL: atomic_add_i32_addr64_offset: 254; GCN3: ; %bb.0: ; %entry 255; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 256; GCN3-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 257; GCN3-NEXT: s_load_dword s6, s[4:5], 0x2c 258; GCN3-NEXT: s_waitcnt lgkmcnt(0) 259; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 260; GCN3-NEXT: s_add_u32 s0, s2, s0 261; GCN3-NEXT: s_addc_u32 s1, s3, s1 262; GCN3-NEXT: v_mov_b32_e32 v0, s0 263; GCN3-NEXT: v_mov_b32_e32 v1, s1 264; GCN3-NEXT: 
v_mov_b32_e32 v2, s6 265; GCN3-NEXT: flat_atomic_add v[0:1], v2 offset:16 266; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 267; GCN3-NEXT: buffer_wbinvl1_vol 268; GCN3-NEXT: s_endpgm 269entry: 270 %ptr = getelementptr i32, ptr %out, i64 %index 271 %gep = getelementptr i32, ptr %ptr, i32 4 272 %val = atomicrmw volatile add ptr %gep, i32 %in syncscope("agent") seq_cst 273 ret void 274} 275 276define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { 277; GCN1-LABEL: atomic_add_i32_ret_addr64_offset: 278; GCN1: ; %bb.0: ; %entry 279; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xf 280; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 281; GCN1-NEXT: s_load_dword s8, s[4:5], 0xd 282; GCN1-NEXT: s_waitcnt lgkmcnt(0) 283; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 284; GCN1-NEXT: s_add_u32 s0, s0, s4 285; GCN1-NEXT: s_addc_u32 s1, s1, s5 286; GCN1-NEXT: s_add_u32 s0, s0, 16 287; GCN1-NEXT: s_addc_u32 s1, s1, 0 288; GCN1-NEXT: v_mov_b32_e32 v0, s0 289; GCN1-NEXT: v_mov_b32_e32 v1, s1 290; GCN1-NEXT: v_mov_b32_e32 v2, s8 291; GCN1-NEXT: flat_atomic_add v2, v[0:1], v2 glc 292; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 293; GCN1-NEXT: buffer_wbinvl1_vol 294; GCN1-NEXT: v_mov_b32_e32 v0, s2 295; GCN1-NEXT: v_mov_b32_e32 v1, s3 296; GCN1-NEXT: flat_store_dword v[0:1], v2 297; GCN1-NEXT: s_endpgm 298; 299; GCN2-LABEL: atomic_add_i32_ret_addr64_offset: 300; GCN2: ; %bb.0: ; %entry 301; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c 302; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 303; GCN2-NEXT: s_load_dword s8, s[4:5], 0x34 304; GCN2-NEXT: s_waitcnt lgkmcnt(0) 305; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 306; GCN2-NEXT: s_add_u32 s0, s0, s4 307; GCN2-NEXT: s_addc_u32 s1, s1, s5 308; GCN2-NEXT: s_add_u32 s0, s0, 16 309; GCN2-NEXT: s_addc_u32 s1, s1, 0 310; GCN2-NEXT: v_mov_b32_e32 v0, s0 311; GCN2-NEXT: v_mov_b32_e32 v1, s1 312; GCN2-NEXT: v_mov_b32_e32 v2, s8 313; GCN2-NEXT: flat_atomic_add v2, v[0:1], v2 glc 314; GCN2-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) 315; GCN2-NEXT: buffer_wbinvl1_vol 316; GCN2-NEXT: v_mov_b32_e32 v0, s2 317; GCN2-NEXT: v_mov_b32_e32 v1, s3 318; GCN2-NEXT: flat_store_dword v[0:1], v2 319; GCN2-NEXT: s_endpgm 320; 321; GCN3-LABEL: atomic_add_i32_ret_addr64_offset: 322; GCN3: ; %bb.0: ; %entry 323; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c 324; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 325; GCN3-NEXT: s_load_dword s8, s[4:5], 0x34 326; GCN3-NEXT: s_waitcnt lgkmcnt(0) 327; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 328; GCN3-NEXT: s_add_u32 s0, s0, s4 329; GCN3-NEXT: s_addc_u32 s1, s1, s5 330; GCN3-NEXT: v_mov_b32_e32 v0, s0 331; GCN3-NEXT: v_mov_b32_e32 v1, s1 332; GCN3-NEXT: v_mov_b32_e32 v2, s8 333; GCN3-NEXT: flat_atomic_add v2, v[0:1], v2 offset:16 glc 334; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 335; GCN3-NEXT: buffer_wbinvl1_vol 336; GCN3-NEXT: v_mov_b32_e32 v0, s2 337; GCN3-NEXT: v_mov_b32_e32 v1, s3 338; GCN3-NEXT: flat_store_dword v[0:1], v2 339; GCN3-NEXT: s_endpgm 340entry: 341 %ptr = getelementptr i32, ptr %out, i64 %index 342 %gep = getelementptr i32, ptr %ptr, i32 4 343 %val = atomicrmw volatile add ptr %gep, i32 %in syncscope("agent") seq_cst 344 store i32 %val, ptr %out2 345 ret void 346} 347 348define amdgpu_kernel void @atomic_add_i32(ptr %out, i32 %in) { 349; GCN1-LABEL: atomic_add_i32: 350; GCN1: ; %bb.0: ; %entry 351; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 352; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb 353; GCN1-NEXT: s_waitcnt lgkmcnt(0) 354; GCN1-NEXT: v_mov_b32_e32 v0, s0 355; GCN1-NEXT: v_mov_b32_e32 v1, s1 356; GCN1-NEXT: v_mov_b32_e32 v2, s2 357; GCN1-NEXT: flat_atomic_add v[0:1], v2 358; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 359; GCN1-NEXT: buffer_wbinvl1_vol 360; GCN1-NEXT: s_endpgm 361; 362; GCN2-LABEL: atomic_add_i32: 363; GCN2: ; %bb.0: ; %entry 364; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 365; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c 366; GCN2-NEXT: s_waitcnt lgkmcnt(0) 367; GCN2-NEXT: v_mov_b32_e32 v0, s0 368; GCN2-NEXT: 
v_mov_b32_e32 v1, s1 369; GCN2-NEXT: v_mov_b32_e32 v2, s2 370; GCN2-NEXT: flat_atomic_add v[0:1], v2 371; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 372; GCN2-NEXT: buffer_wbinvl1_vol 373; GCN2-NEXT: s_endpgm 374; 375; GCN3-LABEL: atomic_add_i32: 376; GCN3: ; %bb.0: ; %entry 377; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 378; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c 379; GCN3-NEXT: s_waitcnt lgkmcnt(0) 380; GCN3-NEXT: v_mov_b32_e32 v0, s0 381; GCN3-NEXT: v_mov_b32_e32 v1, s1 382; GCN3-NEXT: v_mov_b32_e32 v2, s2 383; GCN3-NEXT: flat_atomic_add v[0:1], v2 384; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 385; GCN3-NEXT: buffer_wbinvl1_vol 386; GCN3-NEXT: s_endpgm 387entry: 388 %val = atomicrmw volatile add ptr %out, i32 %in syncscope("agent") seq_cst 389 ret void 390} 391 392define amdgpu_kernel void @atomic_add_i32_ret(ptr %out, ptr %out2, i32 %in) { 393; GCN1-LABEL: atomic_add_i32_ret: 394; GCN1: ; %bb.0: ; %entry 395; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 396; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd 397; GCN1-NEXT: s_waitcnt lgkmcnt(0) 398; GCN1-NEXT: v_mov_b32_e32 v0, s0 399; GCN1-NEXT: v_mov_b32_e32 v1, s1 400; GCN1-NEXT: v_mov_b32_e32 v2, s4 401; GCN1-NEXT: flat_atomic_add v2, v[0:1], v2 glc 402; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 403; GCN1-NEXT: buffer_wbinvl1_vol 404; GCN1-NEXT: v_mov_b32_e32 v0, s2 405; GCN1-NEXT: v_mov_b32_e32 v1, s3 406; GCN1-NEXT: flat_store_dword v[0:1], v2 407; GCN1-NEXT: s_endpgm 408; 409; GCN2-LABEL: atomic_add_i32_ret: 410; GCN2: ; %bb.0: ; %entry 411; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 412; GCN2-NEXT: s_load_dword s4, s[4:5], 0x34 413; GCN2-NEXT: s_waitcnt lgkmcnt(0) 414; GCN2-NEXT: v_mov_b32_e32 v0, s0 415; GCN2-NEXT: v_mov_b32_e32 v1, s1 416; GCN2-NEXT: v_mov_b32_e32 v2, s4 417; GCN2-NEXT: flat_atomic_add v2, v[0:1], v2 glc 418; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 419; GCN2-NEXT: buffer_wbinvl1_vol 420; GCN2-NEXT: v_mov_b32_e32 v0, s2 421; GCN2-NEXT: v_mov_b32_e32 v1, s3 422; GCN2-NEXT: 
flat_store_dword v[0:1], v2 423; GCN2-NEXT: s_endpgm 424; 425; GCN3-LABEL: atomic_add_i32_ret: 426; GCN3: ; %bb.0: ; %entry 427; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 428; GCN3-NEXT: s_load_dword s6, s[4:5], 0x34 429; GCN3-NEXT: s_waitcnt lgkmcnt(0) 430; GCN3-NEXT: v_mov_b32_e32 v0, s0 431; GCN3-NEXT: v_mov_b32_e32 v1, s1 432; GCN3-NEXT: v_mov_b32_e32 v2, s6 433; GCN3-NEXT: flat_atomic_add v2, v[0:1], v2 glc 434; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 435; GCN3-NEXT: buffer_wbinvl1_vol 436; GCN3-NEXT: v_mov_b32_e32 v0, s2 437; GCN3-NEXT: v_mov_b32_e32 v1, s3 438; GCN3-NEXT: flat_store_dword v[0:1], v2 439; GCN3-NEXT: s_endpgm 440entry: 441 %val = atomicrmw volatile add ptr %out, i32 %in syncscope("agent") seq_cst 442 store i32 %val, ptr %out2 443 ret void 444} 445 446define amdgpu_kernel void @atomic_add_i32_addr64(ptr %out, i32 %in, i64 %index) { 447; GCN1-LABEL: atomic_add_i32_addr64: 448; GCN1: ; %bb.0: ; %entry 449; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 450; GCN1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9 451; GCN1-NEXT: s_load_dword s4, s[4:5], 0xb 452; GCN1-NEXT: s_waitcnt lgkmcnt(0) 453; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 454; GCN1-NEXT: s_add_u32 s0, s2, s0 455; GCN1-NEXT: s_addc_u32 s1, s3, s1 456; GCN1-NEXT: v_mov_b32_e32 v0, s0 457; GCN1-NEXT: v_mov_b32_e32 v1, s1 458; GCN1-NEXT: v_mov_b32_e32 v2, s4 459; GCN1-NEXT: flat_atomic_add v[0:1], v2 460; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 461; GCN1-NEXT: buffer_wbinvl1_vol 462; GCN1-NEXT: s_endpgm 463; 464; GCN2-LABEL: atomic_add_i32_addr64: 465; GCN2: ; %bb.0: ; %entry 466; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 467; GCN2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 468; GCN2-NEXT: s_load_dword s4, s[4:5], 0x2c 469; GCN2-NEXT: s_waitcnt lgkmcnt(0) 470; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 471; GCN2-NEXT: s_add_u32 s0, s2, s0 472; GCN2-NEXT: s_addc_u32 s1, s3, s1 473; GCN2-NEXT: v_mov_b32_e32 v0, s0 474; GCN2-NEXT: v_mov_b32_e32 v1, s1 475; GCN2-NEXT: v_mov_b32_e32 v2, s4 
476; GCN2-NEXT: flat_atomic_add v[0:1], v2 477; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 478; GCN2-NEXT: buffer_wbinvl1_vol 479; GCN2-NEXT: s_endpgm 480; 481; GCN3-LABEL: atomic_add_i32_addr64: 482; GCN3: ; %bb.0: ; %entry 483; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 484; GCN3-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 485; GCN3-NEXT: s_load_dword s6, s[4:5], 0x2c 486; GCN3-NEXT: s_waitcnt lgkmcnt(0) 487; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 488; GCN3-NEXT: s_add_u32 s0, s2, s0 489; GCN3-NEXT: s_addc_u32 s1, s3, s1 490; GCN3-NEXT: v_mov_b32_e32 v0, s0 491; GCN3-NEXT: v_mov_b32_e32 v1, s1 492; GCN3-NEXT: v_mov_b32_e32 v2, s6 493; GCN3-NEXT: flat_atomic_add v[0:1], v2 494; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 495; GCN3-NEXT: buffer_wbinvl1_vol 496; GCN3-NEXT: s_endpgm 497entry: 498 %ptr = getelementptr i32, ptr %out, i64 %index 499 %val = atomicrmw volatile add ptr %ptr, i32 %in syncscope("agent") seq_cst 500 ret void 501} 502 503define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { 504; GCN1-LABEL: atomic_add_i32_ret_addr64: 505; GCN1: ; %bb.0: ; %entry 506; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xf 507; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 508; GCN1-NEXT: s_load_dword s8, s[4:5], 0xd 509; GCN1-NEXT: s_waitcnt lgkmcnt(0) 510; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 511; GCN1-NEXT: s_add_u32 s0, s0, s4 512; GCN1-NEXT: s_addc_u32 s1, s1, s5 513; GCN1-NEXT: v_mov_b32_e32 v0, s0 514; GCN1-NEXT: v_mov_b32_e32 v1, s1 515; GCN1-NEXT: v_mov_b32_e32 v2, s8 516; GCN1-NEXT: flat_atomic_add v2, v[0:1], v2 glc 517; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 518; GCN1-NEXT: buffer_wbinvl1_vol 519; GCN1-NEXT: v_mov_b32_e32 v0, s2 520; GCN1-NEXT: v_mov_b32_e32 v1, s3 521; GCN1-NEXT: flat_store_dword v[0:1], v2 522; GCN1-NEXT: s_endpgm 523; 524; GCN2-LABEL: atomic_add_i32_ret_addr64: 525; GCN2: ; %bb.0: ; %entry 526; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c 527; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 
528; GCN2-NEXT: s_load_dword s8, s[4:5], 0x34 529; GCN2-NEXT: s_waitcnt lgkmcnt(0) 530; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 531; GCN2-NEXT: s_add_u32 s0, s0, s4 532; GCN2-NEXT: s_addc_u32 s1, s1, s5 533; GCN2-NEXT: v_mov_b32_e32 v0, s0 534; GCN2-NEXT: v_mov_b32_e32 v1, s1 535; GCN2-NEXT: v_mov_b32_e32 v2, s8 536; GCN2-NEXT: flat_atomic_add v2, v[0:1], v2 glc 537; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 538; GCN2-NEXT: buffer_wbinvl1_vol 539; GCN2-NEXT: v_mov_b32_e32 v0, s2 540; GCN2-NEXT: v_mov_b32_e32 v1, s3 541; GCN2-NEXT: flat_store_dword v[0:1], v2 542; GCN2-NEXT: s_endpgm 543; 544; GCN3-LABEL: atomic_add_i32_ret_addr64: 545; GCN3: ; %bb.0: ; %entry 546; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c 547; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 548; GCN3-NEXT: s_load_dword s8, s[4:5], 0x34 549; GCN3-NEXT: s_waitcnt lgkmcnt(0) 550; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 551; GCN3-NEXT: s_add_u32 s0, s0, s4 552; GCN3-NEXT: s_addc_u32 s1, s1, s5 553; GCN3-NEXT: v_mov_b32_e32 v0, s0 554; GCN3-NEXT: v_mov_b32_e32 v1, s1 555; GCN3-NEXT: v_mov_b32_e32 v2, s8 556; GCN3-NEXT: flat_atomic_add v2, v[0:1], v2 glc 557; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 558; GCN3-NEXT: buffer_wbinvl1_vol 559; GCN3-NEXT: v_mov_b32_e32 v0, s2 560; GCN3-NEXT: v_mov_b32_e32 v1, s3 561; GCN3-NEXT: flat_store_dword v[0:1], v2 562; GCN3-NEXT: s_endpgm 563entry: 564 %ptr = getelementptr i32, ptr %out, i64 %index 565 %val = atomicrmw volatile add ptr %ptr, i32 %in syncscope("agent") seq_cst 566 store i32 %val, ptr %out2 567 ret void 568} 569 570define amdgpu_kernel void @atomic_and_i32_offset(ptr %out, i32 %in) { 571; GCN1-LABEL: atomic_and_i32_offset: 572; GCN1: ; %bb.0: ; %entry 573; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 574; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb 575; GCN1-NEXT: s_waitcnt lgkmcnt(0) 576; GCN1-NEXT: s_add_u32 s0, s0, 16 577; GCN1-NEXT: s_addc_u32 s1, s1, 0 578; GCN1-NEXT: v_mov_b32_e32 v0, s0 579; GCN1-NEXT: v_mov_b32_e32 v1, s1 580; GCN1-NEXT: 
v_mov_b32_e32 v2, s2 581; GCN1-NEXT: flat_atomic_and v[0:1], v2 582; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 583; GCN1-NEXT: buffer_wbinvl1_vol 584; GCN1-NEXT: s_endpgm 585; 586; GCN2-LABEL: atomic_and_i32_offset: 587; GCN2: ; %bb.0: ; %entry 588; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 589; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c 590; GCN2-NEXT: s_waitcnt lgkmcnt(0) 591; GCN2-NEXT: s_add_u32 s0, s0, 16 592; GCN2-NEXT: s_addc_u32 s1, s1, 0 593; GCN2-NEXT: v_mov_b32_e32 v0, s0 594; GCN2-NEXT: v_mov_b32_e32 v1, s1 595; GCN2-NEXT: v_mov_b32_e32 v2, s2 596; GCN2-NEXT: flat_atomic_and v[0:1], v2 597; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 598; GCN2-NEXT: buffer_wbinvl1_vol 599; GCN2-NEXT: s_endpgm 600; 601; GCN3-LABEL: atomic_and_i32_offset: 602; GCN3: ; %bb.0: ; %entry 603; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 604; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c 605; GCN3-NEXT: s_waitcnt lgkmcnt(0) 606; GCN3-NEXT: v_mov_b32_e32 v0, s0 607; GCN3-NEXT: v_mov_b32_e32 v1, s1 608; GCN3-NEXT: v_mov_b32_e32 v2, s2 609; GCN3-NEXT: flat_atomic_and v[0:1], v2 offset:16 610; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 611; GCN3-NEXT: buffer_wbinvl1_vol 612; GCN3-NEXT: s_endpgm 613entry: 614 %gep = getelementptr i32, ptr %out, i32 4 615 %val = atomicrmw volatile and ptr %gep, i32 %in syncscope("agent") seq_cst 616 ret void 617} 618 619define amdgpu_kernel void @atomic_and_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { 620; GCN1-LABEL: atomic_and_i32_ret_offset: 621; GCN1: ; %bb.0: ; %entry 622; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 623; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd 624; GCN1-NEXT: s_waitcnt lgkmcnt(0) 625; GCN1-NEXT: s_add_u32 s0, s0, 16 626; GCN1-NEXT: s_addc_u32 s1, s1, 0 627; GCN1-NEXT: v_mov_b32_e32 v0, s0 628; GCN1-NEXT: v_mov_b32_e32 v1, s1 629; GCN1-NEXT: v_mov_b32_e32 v2, s4 630; GCN1-NEXT: flat_atomic_and v2, v[0:1], v2 glc 631; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 632; GCN1-NEXT: buffer_wbinvl1_vol 633; GCN1-NEXT: v_mov_b32_e32 v0, s2 
634; GCN1-NEXT: v_mov_b32_e32 v1, s3 635; GCN1-NEXT: flat_store_dword v[0:1], v2 636; GCN1-NEXT: s_endpgm 637; 638; GCN2-LABEL: atomic_and_i32_ret_offset: 639; GCN2: ; %bb.0: ; %entry 640; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 641; GCN2-NEXT: s_load_dword s4, s[4:5], 0x34 642; GCN2-NEXT: s_waitcnt lgkmcnt(0) 643; GCN2-NEXT: s_add_u32 s0, s0, 16 644; GCN2-NEXT: s_addc_u32 s1, s1, 0 645; GCN2-NEXT: v_mov_b32_e32 v0, s0 646; GCN2-NEXT: v_mov_b32_e32 v1, s1 647; GCN2-NEXT: v_mov_b32_e32 v2, s4 648; GCN2-NEXT: flat_atomic_and v2, v[0:1], v2 glc 649; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 650; GCN2-NEXT: buffer_wbinvl1_vol 651; GCN2-NEXT: v_mov_b32_e32 v0, s2 652; GCN2-NEXT: v_mov_b32_e32 v1, s3 653; GCN2-NEXT: flat_store_dword v[0:1], v2 654; GCN2-NEXT: s_endpgm 655; 656; GCN3-LABEL: atomic_and_i32_ret_offset: 657; GCN3: ; %bb.0: ; %entry 658; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 659; GCN3-NEXT: s_load_dword s6, s[4:5], 0x34 660; GCN3-NEXT: s_waitcnt lgkmcnt(0) 661; GCN3-NEXT: v_mov_b32_e32 v0, s0 662; GCN3-NEXT: v_mov_b32_e32 v1, s1 663; GCN3-NEXT: v_mov_b32_e32 v2, s6 664; GCN3-NEXT: flat_atomic_and v2, v[0:1], v2 offset:16 glc 665; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 666; GCN3-NEXT: buffer_wbinvl1_vol 667; GCN3-NEXT: v_mov_b32_e32 v0, s2 668; GCN3-NEXT: v_mov_b32_e32 v1, s3 669; GCN3-NEXT: flat_store_dword v[0:1], v2 670; GCN3-NEXT: s_endpgm 671entry: 672 %gep = getelementptr i32, ptr %out, i32 4 673 %val = atomicrmw volatile and ptr %gep, i32 %in syncscope("agent") seq_cst 674 store i32 %val, ptr %out2 675 ret void 676} 677 678define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { 679; GCN1-LABEL: atomic_and_i32_addr64_offset: 680; GCN1: ; %bb.0: ; %entry 681; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 682; GCN1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9 683; GCN1-NEXT: s_load_dword s4, s[4:5], 0xb 684; GCN1-NEXT: s_waitcnt lgkmcnt(0) 685; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 686; GCN1-NEXT: s_add_u32 
s0, s2, s0 687; GCN1-NEXT: s_addc_u32 s1, s3, s1 688; GCN1-NEXT: s_add_u32 s0, s0, 16 689; GCN1-NEXT: s_addc_u32 s1, s1, 0 690; GCN1-NEXT: v_mov_b32_e32 v0, s0 691; GCN1-NEXT: v_mov_b32_e32 v1, s1 692; GCN1-NEXT: v_mov_b32_e32 v2, s4 693; GCN1-NEXT: flat_atomic_and v[0:1], v2 694; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 695; GCN1-NEXT: buffer_wbinvl1_vol 696; GCN1-NEXT: s_endpgm 697; 698; GCN2-LABEL: atomic_and_i32_addr64_offset: 699; GCN2: ; %bb.0: ; %entry 700; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 701; GCN2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 702; GCN2-NEXT: s_load_dword s4, s[4:5], 0x2c 703; GCN2-NEXT: s_waitcnt lgkmcnt(0) 704; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 705; GCN2-NEXT: s_add_u32 s0, s2, s0 706; GCN2-NEXT: s_addc_u32 s1, s3, s1 707; GCN2-NEXT: s_add_u32 s0, s0, 16 708; GCN2-NEXT: s_addc_u32 s1, s1, 0 709; GCN2-NEXT: v_mov_b32_e32 v0, s0 710; GCN2-NEXT: v_mov_b32_e32 v1, s1 711; GCN2-NEXT: v_mov_b32_e32 v2, s4 712; GCN2-NEXT: flat_atomic_and v[0:1], v2 713; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 714; GCN2-NEXT: buffer_wbinvl1_vol 715; GCN2-NEXT: s_endpgm 716; 717; GCN3-LABEL: atomic_and_i32_addr64_offset: 718; GCN3: ; %bb.0: ; %entry 719; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 720; GCN3-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 721; GCN3-NEXT: s_load_dword s6, s[4:5], 0x2c 722; GCN3-NEXT: s_waitcnt lgkmcnt(0) 723; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 724; GCN3-NEXT: s_add_u32 s0, s2, s0 725; GCN3-NEXT: s_addc_u32 s1, s3, s1 726; GCN3-NEXT: v_mov_b32_e32 v0, s0 727; GCN3-NEXT: v_mov_b32_e32 v1, s1 728; GCN3-NEXT: v_mov_b32_e32 v2, s6 729; GCN3-NEXT: flat_atomic_and v[0:1], v2 offset:16 730; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 731; GCN3-NEXT: buffer_wbinvl1_vol 732; GCN3-NEXT: s_endpgm 733entry: 734 %ptr = getelementptr i32, ptr %out, i64 %index 735 %gep = getelementptr i32, ptr %ptr, i32 4 736 %val = atomicrmw volatile and ptr %gep, i32 %in syncscope("agent") seq_cst 737 ret void 738} 739 740define amdgpu_kernel void 
; @atomic_and_i32_ret_addr64_offset: volatile seq_cst `atomicrmw and` at agent scope
; on element %index + 4 of %out (i64-indexed GEP, then a constant GEP of 4 x i32);
; the returned value is stored to %out2. GCN1/GCN2 add the 16-byte displacement with
; s_add_u32/s_addc_u32, GCN3 encodes it as `offset:16`; all three set `glc`.
; The end of this span opens the next kernel, @atomic_and_i32.
@atomic_and_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { 741; GCN1-LABEL: atomic_and_i32_ret_addr64_offset: 742; GCN1: ; %bb.0: ; %entry 743; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xf 744; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 745; GCN1-NEXT: s_load_dword s8, s[4:5], 0xd 746; GCN1-NEXT: s_waitcnt lgkmcnt(0) 747; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 748; GCN1-NEXT: s_add_u32 s0, s0, s4 749; GCN1-NEXT: s_addc_u32 s1, s1, s5 750; GCN1-NEXT: s_add_u32 s0, s0, 16 751; GCN1-NEXT: s_addc_u32 s1, s1, 0 752; GCN1-NEXT: v_mov_b32_e32 v0, s0 753; GCN1-NEXT: v_mov_b32_e32 v1, s1 754; GCN1-NEXT: v_mov_b32_e32 v2, s8 755; GCN1-NEXT: flat_atomic_and v2, v[0:1], v2 glc 756; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 757; GCN1-NEXT: buffer_wbinvl1_vol 758; GCN1-NEXT: v_mov_b32_e32 v0, s2 759; GCN1-NEXT: v_mov_b32_e32 v1, s3 760; GCN1-NEXT: flat_store_dword v[0:1], v2 761; GCN1-NEXT: s_endpgm 762; 763; GCN2-LABEL: atomic_and_i32_ret_addr64_offset: 764; GCN2: ; %bb.0: ; %entry 765; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c 766; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 767; GCN2-NEXT: s_load_dword s8, s[4:5], 0x34 768; GCN2-NEXT: s_waitcnt lgkmcnt(0) 769; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 770; GCN2-NEXT: s_add_u32 s0, s0, s4 771; GCN2-NEXT: s_addc_u32 s1, s1, s5 772; GCN2-NEXT: s_add_u32 s0, s0, 16 773; GCN2-NEXT: s_addc_u32 s1, s1, 0 774; GCN2-NEXT: v_mov_b32_e32 v0, s0 775; GCN2-NEXT: v_mov_b32_e32 v1, s1 776; GCN2-NEXT: v_mov_b32_e32 v2, s8 777; GCN2-NEXT: flat_atomic_and v2, v[0:1], v2 glc 778; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 779; GCN2-NEXT: buffer_wbinvl1_vol 780; GCN2-NEXT: v_mov_b32_e32 v0, s2 781; GCN2-NEXT: v_mov_b32_e32 v1, s3 782; GCN2-NEXT: flat_store_dword v[0:1], v2 783; GCN2-NEXT: s_endpgm 784; 785; GCN3-LABEL: atomic_and_i32_ret_addr64_offset: 786; GCN3: ; %bb.0: ; %entry 787; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c 788; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 789; GCN3-NEXT: s_load_dword s8, 
s[4:5], 0x34 790; GCN3-NEXT: s_waitcnt lgkmcnt(0) 791; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 792; GCN3-NEXT: s_add_u32 s0, s0, s4 793; GCN3-NEXT: s_addc_u32 s1, s1, s5 794; GCN3-NEXT: v_mov_b32_e32 v0, s0 795; GCN3-NEXT: v_mov_b32_e32 v1, s1 796; GCN3-NEXT: v_mov_b32_e32 v2, s8 797; GCN3-NEXT: flat_atomic_and v2, v[0:1], v2 offset:16 glc 798; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 799; GCN3-NEXT: buffer_wbinvl1_vol 800; GCN3-NEXT: v_mov_b32_e32 v0, s2 801; GCN3-NEXT: v_mov_b32_e32 v1, s3 802; GCN3-NEXT: flat_store_dword v[0:1], v2 803; GCN3-NEXT: s_endpgm 804entry: 805 %ptr = getelementptr i32, ptr %out, i64 %index 806 %gep = getelementptr i32, ptr %ptr, i32 4 807 %val = atomicrmw volatile and ptr %gep, i32 %in syncscope("agent") seq_cst 808 store i32 %val, ptr %out2 809 ret void 810} 811 812define amdgpu_kernel void @atomic_and_i32(ptr %out, i32 %in) { 813; GCN1-LABEL: atomic_and_i32: 814; GCN1: ; %bb.0: ; %entry 815; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 816; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb 817; GCN1-NEXT: s_waitcnt lgkmcnt(0) 818; GCN1-NEXT: v_mov_b32_e32 v0, s0 819; GCN1-NEXT: v_mov_b32_e32 v1, s1 820; GCN1-NEXT: v_mov_b32_e32 v2, s2 821; GCN1-NEXT: flat_atomic_and v[0:1], v2 822; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 823; GCN1-NEXT: buffer_wbinvl1_vol 824; GCN1-NEXT: s_endpgm 825; 826; GCN2-LABEL: atomic_and_i32: 827; GCN2: ; %bb.0: ; %entry 828; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 829; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c 830; GCN2-NEXT: s_waitcnt lgkmcnt(0) 831; GCN2-NEXT: v_mov_b32_e32 v0, s0 832; GCN2-NEXT: v_mov_b32_e32 v1, s1 833; GCN2-NEXT: v_mov_b32_e32 v2, s2 834; GCN2-NEXT: flat_atomic_and v[0:1], v2 835; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 836; GCN2-NEXT: buffer_wbinvl1_vol 837; GCN2-NEXT: s_endpgm 838; 839; GCN3-LABEL: atomic_and_i32: 840; GCN3: ; %bb.0: ; %entry 841; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 842; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c 843; GCN3-NEXT: s_waitcnt lgkmcnt(0) 844; 
; Remaining GCN3 checks and IR of @atomic_and_i32: volatile seq_cst `atomicrmw and`
; at agent scope directly on %out, result unused (flat_atomic_and without `glc`).
; The rest of the line opens @atomic_and_i32_ret.
GCN3-NEXT: v_mov_b32_e32 v0, s0 845; GCN3-NEXT: v_mov_b32_e32 v1, s1 846; GCN3-NEXT: v_mov_b32_e32 v2, s2 847; GCN3-NEXT: flat_atomic_and v[0:1], v2 848; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 849; GCN3-NEXT: buffer_wbinvl1_vol 850; GCN3-NEXT: s_endpgm 851entry: 852 %val = atomicrmw volatile and ptr %out, i32 %in syncscope("agent") seq_cst 853 ret void 854} 855 856define amdgpu_kernel void @atomic_and_i32_ret(ptr %out, ptr %out2, i32 %in) { 857; GCN1-LABEL: atomic_and_i32_ret: 858; GCN1: ; %bb.0: ; %entry 859; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 860; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd 861; GCN1-NEXT: s_waitcnt lgkmcnt(0) 862; GCN1-NEXT: v_mov_b32_e32 v0, s0 863; GCN1-NEXT: v_mov_b32_e32 v1, s1 864; GCN1-NEXT: v_mov_b32_e32 v2, s4 865; GCN1-NEXT: flat_atomic_and v2, v[0:1], v2 glc 866; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 867; GCN1-NEXT: buffer_wbinvl1_vol 868; GCN1-NEXT: v_mov_b32_e32 v0, s2 869; GCN1-NEXT: v_mov_b32_e32 v1, s3 870; GCN1-NEXT: flat_store_dword v[0:1], v2 871; GCN1-NEXT: s_endpgm 872; 873; GCN2-LABEL: atomic_and_i32_ret: 874; GCN2: ; %bb.0: ; %entry 875; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 876; GCN2-NEXT: s_load_dword s4, s[4:5], 0x34 877; GCN2-NEXT: s_waitcnt lgkmcnt(0) 878; GCN2-NEXT: v_mov_b32_e32 v0, s0 879; GCN2-NEXT: v_mov_b32_e32 v1, s1 880; GCN2-NEXT: v_mov_b32_e32 v2, s4 881; GCN2-NEXT: flat_atomic_and v2, v[0:1], v2 glc 882; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 883; GCN2-NEXT: buffer_wbinvl1_vol 884; GCN2-NEXT: v_mov_b32_e32 v0, s2 885; GCN2-NEXT: v_mov_b32_e32 v1, s3 886; GCN2-NEXT: flat_store_dword v[0:1], v2 887; GCN2-NEXT: s_endpgm 888; 889; GCN3-LABEL: atomic_and_i32_ret: 890; GCN3: ; %bb.0: ; %entry 891; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 892; GCN3-NEXT: s_load_dword s6, s[4:5], 0x34 893; GCN3-NEXT: s_waitcnt lgkmcnt(0) 894; GCN3-NEXT: v_mov_b32_e32 v0, s0 895; GCN3-NEXT: v_mov_b32_e32 v1, s1 896; GCN3-NEXT: v_mov_b32_e32 v2, s6 897; GCN3-NEXT: flat_atomic_and v2, v[0:1], v2 glc 898; 
; Remaining GCN3 checks and IR of @atomic_and_i32_ret: volatile agent-scope seq_cst
; `atomicrmw and` on %out whose returned value is stored to %out2
; (flat_store_dword through the address moved from s2/s3).
; The rest of the line opens @atomic_and_i32_addr64.
GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 899; GCN3-NEXT: buffer_wbinvl1_vol 900; GCN3-NEXT: v_mov_b32_e32 v0, s2 901; GCN3-NEXT: v_mov_b32_e32 v1, s3 902; GCN3-NEXT: flat_store_dword v[0:1], v2 903; GCN3-NEXT: s_endpgm 904entry: 905 %val = atomicrmw volatile and ptr %out, i32 %in syncscope("agent") seq_cst 906 store i32 %val, ptr %out2 907 ret void 908} 909 910define amdgpu_kernel void @atomic_and_i32_addr64(ptr %out, i32 %in, i64 %index) { 911; GCN1-LABEL: atomic_and_i32_addr64: 912; GCN1: ; %bb.0: ; %entry 913; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 914; GCN1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9 915; GCN1-NEXT: s_load_dword s4, s[4:5], 0xb 916; GCN1-NEXT: s_waitcnt lgkmcnt(0) 917; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 918; GCN1-NEXT: s_add_u32 s0, s2, s0 919; GCN1-NEXT: s_addc_u32 s1, s3, s1 920; GCN1-NEXT: v_mov_b32_e32 v0, s0 921; GCN1-NEXT: v_mov_b32_e32 v1, s1 922; GCN1-NEXT: v_mov_b32_e32 v2, s4 923; GCN1-NEXT: flat_atomic_and v[0:1], v2 924; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 925; GCN1-NEXT: buffer_wbinvl1_vol 926; GCN1-NEXT: s_endpgm 927; 928; GCN2-LABEL: atomic_and_i32_addr64: 929; GCN2: ; %bb.0: ; %entry 930; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 931; GCN2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 932; GCN2-NEXT: s_load_dword s4, s[4:5], 0x2c 933; GCN2-NEXT: s_waitcnt lgkmcnt(0) 934; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 935; GCN2-NEXT: s_add_u32 s0, s2, s0 936; GCN2-NEXT: s_addc_u32 s1, s3, s1 937; GCN2-NEXT: v_mov_b32_e32 v0, s0 938; GCN2-NEXT: v_mov_b32_e32 v1, s1 939; GCN2-NEXT: v_mov_b32_e32 v2, s4 940; GCN2-NEXT: flat_atomic_and v[0:1], v2 941; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 942; GCN2-NEXT: buffer_wbinvl1_vol 943; GCN2-NEXT: s_endpgm 944; 945; GCN3-LABEL: atomic_and_i32_addr64: 946; GCN3: ; %bb.0: ; %entry 947; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 948; GCN3-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 949; GCN3-NEXT: s_load_dword s6, s[4:5], 0x2c 950; GCN3-NEXT: s_waitcnt lgkmcnt(0) 951; GCN3-NEXT: 
; Remaining GCN3 checks and IR of @atomic_and_i32_addr64: volatile agent-scope
; seq_cst `atomicrmw and` on %out[%index], result unused. The address is formed by
; shifting the index left by 2 (s_lshl_b64) and adding it to the base with
; s_add_u32/s_addc_u32. The rest of the line opens @atomic_and_i32_ret_addr64.
s_lshl_b64 s[0:1], s[0:1], 2 952; GCN3-NEXT: s_add_u32 s0, s2, s0 953; GCN3-NEXT: s_addc_u32 s1, s3, s1 954; GCN3-NEXT: v_mov_b32_e32 v0, s0 955; GCN3-NEXT: v_mov_b32_e32 v1, s1 956; GCN3-NEXT: v_mov_b32_e32 v2, s6 957; GCN3-NEXT: flat_atomic_and v[0:1], v2 958; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 959; GCN3-NEXT: buffer_wbinvl1_vol 960; GCN3-NEXT: s_endpgm 961entry: 962 %ptr = getelementptr i32, ptr %out, i64 %index 963 %val = atomicrmw volatile and ptr %ptr, i32 %in syncscope("agent") seq_cst 964 ret void 965} 966 967define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { 968; GCN1-LABEL: atomic_and_i32_ret_addr64: 969; GCN1: ; %bb.0: ; %entry 970; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xf 971; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 972; GCN1-NEXT: s_load_dword s8, s[4:5], 0xd 973; GCN1-NEXT: s_waitcnt lgkmcnt(0) 974; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 975; GCN1-NEXT: s_add_u32 s0, s0, s4 976; GCN1-NEXT: s_addc_u32 s1, s1, s5 977; GCN1-NEXT: v_mov_b32_e32 v0, s0 978; GCN1-NEXT: v_mov_b32_e32 v1, s1 979; GCN1-NEXT: v_mov_b32_e32 v2, s8 980; GCN1-NEXT: flat_atomic_and v2, v[0:1], v2 glc 981; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 982; GCN1-NEXT: buffer_wbinvl1_vol 983; GCN1-NEXT: v_mov_b32_e32 v0, s2 984; GCN1-NEXT: v_mov_b32_e32 v1, s3 985; GCN1-NEXT: flat_store_dword v[0:1], v2 986; GCN1-NEXT: s_endpgm 987; 988; GCN2-LABEL: atomic_and_i32_ret_addr64: 989; GCN2: ; %bb.0: ; %entry 990; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c 991; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 992; GCN2-NEXT: s_load_dword s8, s[4:5], 0x34 993; GCN2-NEXT: s_waitcnt lgkmcnt(0) 994; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 995; GCN2-NEXT: s_add_u32 s0, s0, s4 996; GCN2-NEXT: s_addc_u32 s1, s1, s5 997; GCN2-NEXT: v_mov_b32_e32 v0, s0 998; GCN2-NEXT: v_mov_b32_e32 v1, s1 999; GCN2-NEXT: v_mov_b32_e32 v2, s8 1000; GCN2-NEXT: flat_atomic_and v2, v[0:1], v2 glc 1001; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1002; 
; Tail of the GCN2 checks, the GCN3 checks and the IR of @atomic_and_i32_ret_addr64:
; volatile agent-scope seq_cst `atomicrmw and` on %out[%index]; the returned value
; is stored to %out2. The rest of the line opens @atomic_sub_i32_offset.
GCN2-NEXT: buffer_wbinvl1_vol 1003; GCN2-NEXT: v_mov_b32_e32 v0, s2 1004; GCN2-NEXT: v_mov_b32_e32 v1, s3 1005; GCN2-NEXT: flat_store_dword v[0:1], v2 1006; GCN2-NEXT: s_endpgm 1007; 1008; GCN3-LABEL: atomic_and_i32_ret_addr64: 1009; GCN3: ; %bb.0: ; %entry 1010; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c 1011; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1012; GCN3-NEXT: s_load_dword s8, s[4:5], 0x34 1013; GCN3-NEXT: s_waitcnt lgkmcnt(0) 1014; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 1015; GCN3-NEXT: s_add_u32 s0, s0, s4 1016; GCN3-NEXT: s_addc_u32 s1, s1, s5 1017; GCN3-NEXT: v_mov_b32_e32 v0, s0 1018; GCN3-NEXT: v_mov_b32_e32 v1, s1 1019; GCN3-NEXT: v_mov_b32_e32 v2, s8 1020; GCN3-NEXT: flat_atomic_and v2, v[0:1], v2 glc 1021; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1022; GCN3-NEXT: buffer_wbinvl1_vol 1023; GCN3-NEXT: v_mov_b32_e32 v0, s2 1024; GCN3-NEXT: v_mov_b32_e32 v1, s3 1025; GCN3-NEXT: flat_store_dword v[0:1], v2 1026; GCN3-NEXT: s_endpgm 1027entry: 1028 %ptr = getelementptr i32, ptr %out, i64 %index 1029 %val = atomicrmw volatile and ptr %ptr, i32 %in syncscope("agent") seq_cst 1030 store i32 %val, ptr %out2 1031 ret void 1032} 1033 1034define amdgpu_kernel void @atomic_sub_i32_offset(ptr %out, i32 %in) { 1035; GCN1-LABEL: atomic_sub_i32_offset: 1036; GCN1: ; %bb.0: ; %entry 1037; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1038; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb 1039; GCN1-NEXT: s_waitcnt lgkmcnt(0) 1040; GCN1-NEXT: s_add_u32 s0, s0, 16 1041; GCN1-NEXT: s_addc_u32 s1, s1, 0 1042; GCN1-NEXT: v_mov_b32_e32 v0, s0 1043; GCN1-NEXT: v_mov_b32_e32 v1, s1 1044; GCN1-NEXT: v_mov_b32_e32 v2, s2 1045; GCN1-NEXT: flat_atomic_sub v[0:1], v2 1046; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1047; GCN1-NEXT: buffer_wbinvl1_vol 1048; GCN1-NEXT: s_endpgm 1049; 1050; GCN2-LABEL: atomic_sub_i32_offset: 1051; GCN2: ; %bb.0: ; %entry 1052; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1053; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c 1054; GCN2-NEXT: s_waitcnt 
; Tail of the GCN2 checks, the GCN3 checks and the IR of @atomic_sub_i32_offset:
; volatile agent-scope seq_cst `atomicrmw sub` on %out advanced by 4 x i32, result
; unused. GCN2 adds the 16 bytes with s_add_u32/s_addc_u32; GCN3 uses `offset:16`.
; The rest of the line opens @atomic_sub_i32_ret_offset.
lgkmcnt(0) 1055; GCN2-NEXT: s_add_u32 s0, s0, 16 1056; GCN2-NEXT: s_addc_u32 s1, s1, 0 1057; GCN2-NEXT: v_mov_b32_e32 v0, s0 1058; GCN2-NEXT: v_mov_b32_e32 v1, s1 1059; GCN2-NEXT: v_mov_b32_e32 v2, s2 1060; GCN2-NEXT: flat_atomic_sub v[0:1], v2 1061; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1062; GCN2-NEXT: buffer_wbinvl1_vol 1063; GCN2-NEXT: s_endpgm 1064; 1065; GCN3-LABEL: atomic_sub_i32_offset: 1066; GCN3: ; %bb.0: ; %entry 1067; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1068; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c 1069; GCN3-NEXT: s_waitcnt lgkmcnt(0) 1070; GCN3-NEXT: v_mov_b32_e32 v0, s0 1071; GCN3-NEXT: v_mov_b32_e32 v1, s1 1072; GCN3-NEXT: v_mov_b32_e32 v2, s2 1073; GCN3-NEXT: flat_atomic_sub v[0:1], v2 offset:16 1074; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1075; GCN3-NEXT: buffer_wbinvl1_vol 1076; GCN3-NEXT: s_endpgm 1077entry: 1078 %gep = getelementptr i32, ptr %out, i32 4 1079 %val = atomicrmw volatile sub ptr %gep, i32 %in syncscope("agent") seq_cst 1080 ret void 1081} 1082 1083define amdgpu_kernel void @atomic_sub_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { 1084; GCN1-LABEL: atomic_sub_i32_ret_offset: 1085; GCN1: ; %bb.0: ; %entry 1086; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1087; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd 1088; GCN1-NEXT: s_waitcnt lgkmcnt(0) 1089; GCN1-NEXT: s_add_u32 s0, s0, 16 1090; GCN1-NEXT: s_addc_u32 s1, s1, 0 1091; GCN1-NEXT: v_mov_b32_e32 v0, s0 1092; GCN1-NEXT: v_mov_b32_e32 v1, s1 1093; GCN1-NEXT: v_mov_b32_e32 v2, s4 1094; GCN1-NEXT: flat_atomic_sub v2, v[0:1], v2 glc 1095; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1096; GCN1-NEXT: buffer_wbinvl1_vol 1097; GCN1-NEXT: v_mov_b32_e32 v0, s2 1098; GCN1-NEXT: v_mov_b32_e32 v1, s3 1099; GCN1-NEXT: flat_store_dword v[0:1], v2 1100; GCN1-NEXT: s_endpgm 1101; 1102; GCN2-LABEL: atomic_sub_i32_ret_offset: 1103; GCN2: ; %bb.0: ; %entry 1104; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1105; GCN2-NEXT: s_load_dword s4, s[4:5], 0x34 1106; GCN2-NEXT: s_waitcnt lgkmcnt(0) 
; Tail of the GCN2 checks, the GCN3 checks and the IR of @atomic_sub_i32_ret_offset:
; volatile agent-scope seq_cst `atomicrmw sub` on %out advanced by 4 x i32; the
; returned value is stored to %out2 (flat_atomic_sub carries `glc`).
; The rest of the line opens @atomic_sub_i32_addr64_offset.
1107; GCN2-NEXT: s_add_u32 s0, s0, 16 1108; GCN2-NEXT: s_addc_u32 s1, s1, 0 1109; GCN2-NEXT: v_mov_b32_e32 v0, s0 1110; GCN2-NEXT: v_mov_b32_e32 v1, s1 1111; GCN2-NEXT: v_mov_b32_e32 v2, s4 1112; GCN2-NEXT: flat_atomic_sub v2, v[0:1], v2 glc 1113; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1114; GCN2-NEXT: buffer_wbinvl1_vol 1115; GCN2-NEXT: v_mov_b32_e32 v0, s2 1116; GCN2-NEXT: v_mov_b32_e32 v1, s3 1117; GCN2-NEXT: flat_store_dword v[0:1], v2 1118; GCN2-NEXT: s_endpgm 1119; 1120; GCN3-LABEL: atomic_sub_i32_ret_offset: 1121; GCN3: ; %bb.0: ; %entry 1122; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1123; GCN3-NEXT: s_load_dword s6, s[4:5], 0x34 1124; GCN3-NEXT: s_waitcnt lgkmcnt(0) 1125; GCN3-NEXT: v_mov_b32_e32 v0, s0 1126; GCN3-NEXT: v_mov_b32_e32 v1, s1 1127; GCN3-NEXT: v_mov_b32_e32 v2, s6 1128; GCN3-NEXT: flat_atomic_sub v2, v[0:1], v2 offset:16 glc 1129; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1130; GCN3-NEXT: buffer_wbinvl1_vol 1131; GCN3-NEXT: v_mov_b32_e32 v0, s2 1132; GCN3-NEXT: v_mov_b32_e32 v1, s3 1133; GCN3-NEXT: flat_store_dword v[0:1], v2 1134; GCN3-NEXT: s_endpgm 1135entry: 1136 %gep = getelementptr i32, ptr %out, i32 4 1137 %val = atomicrmw volatile sub ptr %gep, i32 %in syncscope("agent") seq_cst 1138 store i32 %val, ptr %out2 1139 ret void 1140} 1141 1142define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { 1143; GCN1-LABEL: atomic_sub_i32_addr64_offset: 1144; GCN1: ; %bb.0: ; %entry 1145; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 1146; GCN1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9 1147; GCN1-NEXT: s_load_dword s4, s[4:5], 0xb 1148; GCN1-NEXT: s_waitcnt lgkmcnt(0) 1149; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 1150; GCN1-NEXT: s_add_u32 s0, s2, s0 1151; GCN1-NEXT: s_addc_u32 s1, s3, s1 1152; GCN1-NEXT: s_add_u32 s0, s0, 16 1153; GCN1-NEXT: s_addc_u32 s1, s1, 0 1154; GCN1-NEXT: v_mov_b32_e32 v0, s0 1155; GCN1-NEXT: v_mov_b32_e32 v1, s1 1156; GCN1-NEXT: v_mov_b32_e32 v2, s4 1157; GCN1-NEXT: flat_atomic_sub 
; End of the GCN1 checks, the GCN2 and GCN3 checks and the IR of
; @atomic_sub_i32_addr64_offset: volatile agent-scope seq_cst `atomicrmw sub` on
; element %index + 4 of %out, result unused. GCN2 adds the 16-byte displacement with
; s_add_u32/s_addc_u32; GCN3 uses `offset:16`.
; The rest of the line opens @atomic_sub_i32_ret_addr64_offset.
v[0:1], v2 1158; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1159; GCN1-NEXT: buffer_wbinvl1_vol 1160; GCN1-NEXT: s_endpgm 1161; 1162; GCN2-LABEL: atomic_sub_i32_addr64_offset: 1163; GCN2: ; %bb.0: ; %entry 1164; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 1165; GCN2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 1166; GCN2-NEXT: s_load_dword s4, s[4:5], 0x2c 1167; GCN2-NEXT: s_waitcnt lgkmcnt(0) 1168; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 1169; GCN2-NEXT: s_add_u32 s0, s2, s0 1170; GCN2-NEXT: s_addc_u32 s1, s3, s1 1171; GCN2-NEXT: s_add_u32 s0, s0, 16 1172; GCN2-NEXT: s_addc_u32 s1, s1, 0 1173; GCN2-NEXT: v_mov_b32_e32 v0, s0 1174; GCN2-NEXT: v_mov_b32_e32 v1, s1 1175; GCN2-NEXT: v_mov_b32_e32 v2, s4 1176; GCN2-NEXT: flat_atomic_sub v[0:1], v2 1177; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1178; GCN2-NEXT: buffer_wbinvl1_vol 1179; GCN2-NEXT: s_endpgm 1180; 1181; GCN3-LABEL: atomic_sub_i32_addr64_offset: 1182; GCN3: ; %bb.0: ; %entry 1183; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 1184; GCN3-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 1185; GCN3-NEXT: s_load_dword s6, s[4:5], 0x2c 1186; GCN3-NEXT: s_waitcnt lgkmcnt(0) 1187; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 1188; GCN3-NEXT: s_add_u32 s0, s2, s0 1189; GCN3-NEXT: s_addc_u32 s1, s3, s1 1190; GCN3-NEXT: v_mov_b32_e32 v0, s0 1191; GCN3-NEXT: v_mov_b32_e32 v1, s1 1192; GCN3-NEXT: v_mov_b32_e32 v2, s6 1193; GCN3-NEXT: flat_atomic_sub v[0:1], v2 offset:16 1194; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1195; GCN3-NEXT: buffer_wbinvl1_vol 1196; GCN3-NEXT: s_endpgm 1197entry: 1198 %ptr = getelementptr i32, ptr %out, i64 %index 1199 %gep = getelementptr i32, ptr %ptr, i32 4 1200 %val = atomicrmw volatile sub ptr %gep, i32 %in syncscope("agent") seq_cst 1201 ret void 1202} 1203 1204define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { 1205; GCN1-LABEL: atomic_sub_i32_ret_addr64_offset: 1206; GCN1: ; %bb.0: ; %entry 1207; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xf 1208; 
; Checks and IR of @atomic_sub_i32_ret_addr64_offset (its `define` precedes this
; span): volatile agent-scope seq_cst `atomicrmw sub` on element %index + 4 of %out;
; the returned value is stored to %out2. GCN1/GCN2 add the 16-byte displacement with
; s_add_u32/s_addc_u32, GCN3 uses `offset:16`; all three set `glc`.
; The end of this span opens @atomic_sub_i32.
GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1209; GCN1-NEXT: s_load_dword s8, s[4:5], 0xd 1210; GCN1-NEXT: s_waitcnt lgkmcnt(0) 1211; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 1212; GCN1-NEXT: s_add_u32 s0, s0, s4 1213; GCN1-NEXT: s_addc_u32 s1, s1, s5 1214; GCN1-NEXT: s_add_u32 s0, s0, 16 1215; GCN1-NEXT: s_addc_u32 s1, s1, 0 1216; GCN1-NEXT: v_mov_b32_e32 v0, s0 1217; GCN1-NEXT: v_mov_b32_e32 v1, s1 1218; GCN1-NEXT: v_mov_b32_e32 v2, s8 1219; GCN1-NEXT: flat_atomic_sub v2, v[0:1], v2 glc 1220; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1221; GCN1-NEXT: buffer_wbinvl1_vol 1222; GCN1-NEXT: v_mov_b32_e32 v0, s2 1223; GCN1-NEXT: v_mov_b32_e32 v1, s3 1224; GCN1-NEXT: flat_store_dword v[0:1], v2 1225; GCN1-NEXT: s_endpgm 1226; 1227; GCN2-LABEL: atomic_sub_i32_ret_addr64_offset: 1228; GCN2: ; %bb.0: ; %entry 1229; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c 1230; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1231; GCN2-NEXT: s_load_dword s8, s[4:5], 0x34 1232; GCN2-NEXT: s_waitcnt lgkmcnt(0) 1233; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 1234; GCN2-NEXT: s_add_u32 s0, s0, s4 1235; GCN2-NEXT: s_addc_u32 s1, s1, s5 1236; GCN2-NEXT: s_add_u32 s0, s0, 16 1237; GCN2-NEXT: s_addc_u32 s1, s1, 0 1238; GCN2-NEXT: v_mov_b32_e32 v0, s0 1239; GCN2-NEXT: v_mov_b32_e32 v1, s1 1240; GCN2-NEXT: v_mov_b32_e32 v2, s8 1241; GCN2-NEXT: flat_atomic_sub v2, v[0:1], v2 glc 1242; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1243; GCN2-NEXT: buffer_wbinvl1_vol 1244; GCN2-NEXT: v_mov_b32_e32 v0, s2 1245; GCN2-NEXT: v_mov_b32_e32 v1, s3 1246; GCN2-NEXT: flat_store_dword v[0:1], v2 1247; GCN2-NEXT: s_endpgm 1248; 1249; GCN3-LABEL: atomic_sub_i32_ret_addr64_offset: 1250; GCN3: ; %bb.0: ; %entry 1251; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c 1252; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1253; GCN3-NEXT: s_load_dword s8, s[4:5], 0x34 1254; GCN3-NEXT: s_waitcnt lgkmcnt(0) 1255; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 1256; GCN3-NEXT: s_add_u32 s0, s0, s4 1257; GCN3-NEXT: s_addc_u32 s1, s1, 
s5 1258; GCN3-NEXT: v_mov_b32_e32 v0, s0 1259; GCN3-NEXT: v_mov_b32_e32 v1, s1 1260; GCN3-NEXT: v_mov_b32_e32 v2, s8 1261; GCN3-NEXT: flat_atomic_sub v2, v[0:1], v2 offset:16 glc 1262; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1263; GCN3-NEXT: buffer_wbinvl1_vol 1264; GCN3-NEXT: v_mov_b32_e32 v0, s2 1265; GCN3-NEXT: v_mov_b32_e32 v1, s3 1266; GCN3-NEXT: flat_store_dword v[0:1], v2 1267; GCN3-NEXT: s_endpgm 1268entry: 1269 %ptr = getelementptr i32, ptr %out, i64 %index 1270 %gep = getelementptr i32, ptr %ptr, i32 4 1271 %val = atomicrmw volatile sub ptr %gep, i32 %in syncscope("agent") seq_cst 1272 store i32 %val, ptr %out2 1273 ret void 1274} 1275 1276define amdgpu_kernel void @atomic_sub_i32(ptr %out, i32 %in) { 1277; GCN1-LABEL: atomic_sub_i32: 1278; GCN1: ; %bb.0: ; %entry 1279; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1280; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb 1281; GCN1-NEXT: s_waitcnt lgkmcnt(0) 1282; GCN1-NEXT: v_mov_b32_e32 v0, s0 1283; GCN1-NEXT: v_mov_b32_e32 v1, s1 1284; GCN1-NEXT: v_mov_b32_e32 v2, s2 1285; GCN1-NEXT: flat_atomic_sub v[0:1], v2 1286; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1287; GCN1-NEXT: buffer_wbinvl1_vol 1288; GCN1-NEXT: s_endpgm 1289; 1290; GCN2-LABEL: atomic_sub_i32: 1291; GCN2: ; %bb.0: ; %entry 1292; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1293; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c 1294; GCN2-NEXT: s_waitcnt lgkmcnt(0) 1295; GCN2-NEXT: v_mov_b32_e32 v0, s0 1296; GCN2-NEXT: v_mov_b32_e32 v1, s1 1297; GCN2-NEXT: v_mov_b32_e32 v2, s2 1298; GCN2-NEXT: flat_atomic_sub v[0:1], v2 1299; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1300; GCN2-NEXT: buffer_wbinvl1_vol 1301; GCN2-NEXT: s_endpgm 1302; 1303; GCN3-LABEL: atomic_sub_i32: 1304; GCN3: ; %bb.0: ; %entry 1305; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1306; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c 1307; GCN3-NEXT: s_waitcnt lgkmcnt(0) 1308; GCN3-NEXT: v_mov_b32_e32 v0, s0 1309; GCN3-NEXT: v_mov_b32_e32 v1, s1 1310; GCN3-NEXT: v_mov_b32_e32 v2, s2 1311; 
; Remaining GCN3 checks and IR of @atomic_sub_i32: volatile agent-scope seq_cst
; `atomicrmw sub` directly on %out, result unused (flat_atomic_sub without `glc`).
; The rest of the line opens @atomic_sub_i32_ret.
GCN3-NEXT: flat_atomic_sub v[0:1], v2 1312; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1313; GCN3-NEXT: buffer_wbinvl1_vol 1314; GCN3-NEXT: s_endpgm 1315entry: 1316 %val = atomicrmw volatile sub ptr %out, i32 %in syncscope("agent") seq_cst 1317 ret void 1318} 1319 1320define amdgpu_kernel void @atomic_sub_i32_ret(ptr %out, ptr %out2, i32 %in) { 1321; GCN1-LABEL: atomic_sub_i32_ret: 1322; GCN1: ; %bb.0: ; %entry 1323; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1324; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd 1325; GCN1-NEXT: s_waitcnt lgkmcnt(0) 1326; GCN1-NEXT: v_mov_b32_e32 v0, s0 1327; GCN1-NEXT: v_mov_b32_e32 v1, s1 1328; GCN1-NEXT: v_mov_b32_e32 v2, s4 1329; GCN1-NEXT: flat_atomic_sub v2, v[0:1], v2 glc 1330; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1331; GCN1-NEXT: buffer_wbinvl1_vol 1332; GCN1-NEXT: v_mov_b32_e32 v0, s2 1333; GCN1-NEXT: v_mov_b32_e32 v1, s3 1334; GCN1-NEXT: flat_store_dword v[0:1], v2 1335; GCN1-NEXT: s_endpgm 1336; 1337; GCN2-LABEL: atomic_sub_i32_ret: 1338; GCN2: ; %bb.0: ; %entry 1339; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1340; GCN2-NEXT: s_load_dword s4, s[4:5], 0x34 1341; GCN2-NEXT: s_waitcnt lgkmcnt(0) 1342; GCN2-NEXT: v_mov_b32_e32 v0, s0 1343; GCN2-NEXT: v_mov_b32_e32 v1, s1 1344; GCN2-NEXT: v_mov_b32_e32 v2, s4 1345; GCN2-NEXT: flat_atomic_sub v2, v[0:1], v2 glc 1346; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1347; GCN2-NEXT: buffer_wbinvl1_vol 1348; GCN2-NEXT: v_mov_b32_e32 v0, s2 1349; GCN2-NEXT: v_mov_b32_e32 v1, s3 1350; GCN2-NEXT: flat_store_dword v[0:1], v2 1351; GCN2-NEXT: s_endpgm 1352; 1353; GCN3-LABEL: atomic_sub_i32_ret: 1354; GCN3: ; %bb.0: ; %entry 1355; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1356; GCN3-NEXT: s_load_dword s6, s[4:5], 0x34 1357; GCN3-NEXT: s_waitcnt lgkmcnt(0) 1358; GCN3-NEXT: v_mov_b32_e32 v0, s0 1359; GCN3-NEXT: v_mov_b32_e32 v1, s1 1360; GCN3-NEXT: v_mov_b32_e32 v2, s6 1361; GCN3-NEXT: flat_atomic_sub v2, v[0:1], v2 glc 1362; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1363; GCN3-NEXT: 
; Remaining GCN3 checks and IR of @atomic_sub_i32_ret: volatile agent-scope seq_cst
; `atomicrmw sub` on %out whose returned value is stored to %out2
; (flat_store_dword through the address moved from s2/s3).
; The rest of the line opens @atomic_sub_i32_addr64.
buffer_wbinvl1_vol 1364; GCN3-NEXT: v_mov_b32_e32 v0, s2 1365; GCN3-NEXT: v_mov_b32_e32 v1, s3 1366; GCN3-NEXT: flat_store_dword v[0:1], v2 1367; GCN3-NEXT: s_endpgm 1368entry: 1369 %val = atomicrmw volatile sub ptr %out, i32 %in syncscope("agent") seq_cst 1370 store i32 %val, ptr %out2 1371 ret void 1372} 1373 1374define amdgpu_kernel void @atomic_sub_i32_addr64(ptr %out, i32 %in, i64 %index) { 1375; GCN1-LABEL: atomic_sub_i32_addr64: 1376; GCN1: ; %bb.0: ; %entry 1377; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 1378; GCN1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9 1379; GCN1-NEXT: s_load_dword s4, s[4:5], 0xb 1380; GCN1-NEXT: s_waitcnt lgkmcnt(0) 1381; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 1382; GCN1-NEXT: s_add_u32 s0, s2, s0 1383; GCN1-NEXT: s_addc_u32 s1, s3, s1 1384; GCN1-NEXT: v_mov_b32_e32 v0, s0 1385; GCN1-NEXT: v_mov_b32_e32 v1, s1 1386; GCN1-NEXT: v_mov_b32_e32 v2, s4 1387; GCN1-NEXT: flat_atomic_sub v[0:1], v2 1388; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1389; GCN1-NEXT: buffer_wbinvl1_vol 1390; GCN1-NEXT: s_endpgm 1391; 1392; GCN2-LABEL: atomic_sub_i32_addr64: 1393; GCN2: ; %bb.0: ; %entry 1394; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 1395; GCN2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 1396; GCN2-NEXT: s_load_dword s4, s[4:5], 0x2c 1397; GCN2-NEXT: s_waitcnt lgkmcnt(0) 1398; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 1399; GCN2-NEXT: s_add_u32 s0, s2, s0 1400; GCN2-NEXT: s_addc_u32 s1, s3, s1 1401; GCN2-NEXT: v_mov_b32_e32 v0, s0 1402; GCN2-NEXT: v_mov_b32_e32 v1, s1 1403; GCN2-NEXT: v_mov_b32_e32 v2, s4 1404; GCN2-NEXT: flat_atomic_sub v[0:1], v2 1405; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1406; GCN2-NEXT: buffer_wbinvl1_vol 1407; GCN2-NEXT: s_endpgm 1408; 1409; GCN3-LABEL: atomic_sub_i32_addr64: 1410; GCN3: ; %bb.0: ; %entry 1411; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 1412; GCN3-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 1413; GCN3-NEXT: s_load_dword s6, s[4:5], 0x2c 1414; GCN3-NEXT: s_waitcnt lgkmcnt(0) 1415; GCN3-NEXT: 
; Remaining GCN3 checks and IR of @atomic_sub_i32_addr64: volatile agent-scope
; seq_cst `atomicrmw sub` on %out[%index], result unused. The address is formed by
; shifting the index left by 2 (s_lshl_b64) and adding it to the base with
; s_add_u32/s_addc_u32. The rest of the line opens @atomic_sub_i32_ret_addr64.
s_lshl_b64 s[0:1], s[0:1], 2 1416; GCN3-NEXT: s_add_u32 s0, s2, s0 1417; GCN3-NEXT: s_addc_u32 s1, s3, s1 1418; GCN3-NEXT: v_mov_b32_e32 v0, s0 1419; GCN3-NEXT: v_mov_b32_e32 v1, s1 1420; GCN3-NEXT: v_mov_b32_e32 v2, s6 1421; GCN3-NEXT: flat_atomic_sub v[0:1], v2 1422; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1423; GCN3-NEXT: buffer_wbinvl1_vol 1424; GCN3-NEXT: s_endpgm 1425entry: 1426 %ptr = getelementptr i32, ptr %out, i64 %index 1427 %val = atomicrmw volatile sub ptr %ptr, i32 %in syncscope("agent") seq_cst 1428 ret void 1429} 1430 1431define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { 1432; GCN1-LABEL: atomic_sub_i32_ret_addr64: 1433; GCN1: ; %bb.0: ; %entry 1434; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xf 1435; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1436; GCN1-NEXT: s_load_dword s8, s[4:5], 0xd 1437; GCN1-NEXT: s_waitcnt lgkmcnt(0) 1438; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 1439; GCN1-NEXT: s_add_u32 s0, s0, s4 1440; GCN1-NEXT: s_addc_u32 s1, s1, s5 1441; GCN1-NEXT: v_mov_b32_e32 v0, s0 1442; GCN1-NEXT: v_mov_b32_e32 v1, s1 1443; GCN1-NEXT: v_mov_b32_e32 v2, s8 1444; GCN1-NEXT: flat_atomic_sub v2, v[0:1], v2 glc 1445; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1446; GCN1-NEXT: buffer_wbinvl1_vol 1447; GCN1-NEXT: v_mov_b32_e32 v0, s2 1448; GCN1-NEXT: v_mov_b32_e32 v1, s3 1449; GCN1-NEXT: flat_store_dword v[0:1], v2 1450; GCN1-NEXT: s_endpgm 1451; 1452; GCN2-LABEL: atomic_sub_i32_ret_addr64: 1453; GCN2: ; %bb.0: ; %entry 1454; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c 1455; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1456; GCN2-NEXT: s_load_dword s8, s[4:5], 0x34 1457; GCN2-NEXT: s_waitcnt lgkmcnt(0) 1458; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 1459; GCN2-NEXT: s_add_u32 s0, s0, s4 1460; GCN2-NEXT: s_addc_u32 s1, s1, s5 1461; GCN2-NEXT: v_mov_b32_e32 v0, s0 1462; GCN2-NEXT: v_mov_b32_e32 v1, s1 1463; GCN2-NEXT: v_mov_b32_e32 v2, s8 1464; GCN2-NEXT: flat_atomic_sub v2, v[0:1], v2 glc 1465; 
; Tail of the GCN2 checks, the GCN3 checks and the IR of @atomic_sub_i32_ret_addr64:
; volatile agent-scope seq_cst `atomicrmw sub` on %out[%index]; the returned value
; is stored to %out2. The rest of the line opens @atomic_max_i32_offset.
GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1466; GCN2-NEXT: buffer_wbinvl1_vol 1467; GCN2-NEXT: v_mov_b32_e32 v0, s2 1468; GCN2-NEXT: v_mov_b32_e32 v1, s3 1469; GCN2-NEXT: flat_store_dword v[0:1], v2 1470; GCN2-NEXT: s_endpgm 1471; 1472; GCN3-LABEL: atomic_sub_i32_ret_addr64: 1473; GCN3: ; %bb.0: ; %entry 1474; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c 1475; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1476; GCN3-NEXT: s_load_dword s8, s[4:5], 0x34 1477; GCN3-NEXT: s_waitcnt lgkmcnt(0) 1478; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 1479; GCN3-NEXT: s_add_u32 s0, s0, s4 1480; GCN3-NEXT: s_addc_u32 s1, s1, s5 1481; GCN3-NEXT: v_mov_b32_e32 v0, s0 1482; GCN3-NEXT: v_mov_b32_e32 v1, s1 1483; GCN3-NEXT: v_mov_b32_e32 v2, s8 1484; GCN3-NEXT: flat_atomic_sub v2, v[0:1], v2 glc 1485; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1486; GCN3-NEXT: buffer_wbinvl1_vol 1487; GCN3-NEXT: v_mov_b32_e32 v0, s2 1488; GCN3-NEXT: v_mov_b32_e32 v1, s3 1489; GCN3-NEXT: flat_store_dword v[0:1], v2 1490; GCN3-NEXT: s_endpgm 1491entry: 1492 %ptr = getelementptr i32, ptr %out, i64 %index 1493 %val = atomicrmw volatile sub ptr %ptr, i32 %in syncscope("agent") seq_cst 1494 store i32 %val, ptr %out2 1495 ret void 1496} 1497 1498define amdgpu_kernel void @atomic_max_i32_offset(ptr %out, i32 %in) { 1499; GCN1-LABEL: atomic_max_i32_offset: 1500; GCN1: ; %bb.0: ; %entry 1501; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1502; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb 1503; GCN1-NEXT: s_waitcnt lgkmcnt(0) 1504; GCN1-NEXT: s_add_u32 s0, s0, 16 1505; GCN1-NEXT: s_addc_u32 s1, s1, 0 1506; GCN1-NEXT: v_mov_b32_e32 v0, s0 1507; GCN1-NEXT: v_mov_b32_e32 v1, s1 1508; GCN1-NEXT: v_mov_b32_e32 v2, s2 1509; GCN1-NEXT: flat_atomic_smax v[0:1], v2 1510; GCN1-NEXT: s_waitcnt lgkmcnt(0) 1511; GCN1-NEXT: s_endpgm 1512; 1513; GCN2-LABEL: atomic_max_i32_offset: 1514; GCN2: ; %bb.0: ; %entry 1515; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1516; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c 1517; GCN2-NEXT: s_waitcnt 
; Tail of the GCN2 checks, the GCN3 checks and the IR of @atomic_max_i32_offset:
; volatile seq_cst `atomicrmw max` at workgroup scope on %out advanced by 4 x i32,
; result unused. It is checked as flat_atomic_smax followed only by
; `s_waitcnt lgkmcnt(0)`; no buffer_wbinvl1_vol is expected here.
; The rest of the line opens @atomic_max_i32_ret_offset.
lgkmcnt(0) 1518; GCN2-NEXT: s_add_u32 s0, s0, 16 1519; GCN2-NEXT: s_addc_u32 s1, s1, 0 1520; GCN2-NEXT: v_mov_b32_e32 v0, s0 1521; GCN2-NEXT: v_mov_b32_e32 v1, s1 1522; GCN2-NEXT: v_mov_b32_e32 v2, s2 1523; GCN2-NEXT: flat_atomic_smax v[0:1], v2 1524; GCN2-NEXT: s_waitcnt lgkmcnt(0) 1525; GCN2-NEXT: s_endpgm 1526; 1527; GCN3-LABEL: atomic_max_i32_offset: 1528; GCN3: ; %bb.0: ; %entry 1529; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1530; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c 1531; GCN3-NEXT: s_waitcnt lgkmcnt(0) 1532; GCN3-NEXT: v_mov_b32_e32 v0, s0 1533; GCN3-NEXT: v_mov_b32_e32 v1, s1 1534; GCN3-NEXT: v_mov_b32_e32 v2, s2 1535; GCN3-NEXT: flat_atomic_smax v[0:1], v2 offset:16 1536; GCN3-NEXT: s_waitcnt lgkmcnt(0) 1537; GCN3-NEXT: s_endpgm 1538entry: 1539 %gep = getelementptr i32, ptr %out, i32 4 1540 %val = atomicrmw volatile max ptr %gep, i32 %in syncscope("workgroup") seq_cst 1541 ret void 1542} 1543 1544define amdgpu_kernel void @atomic_max_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { 1545; GCN1-LABEL: atomic_max_i32_ret_offset: 1546; GCN1: ; %bb.0: ; %entry 1547; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1548; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd 1549; GCN1-NEXT: s_waitcnt lgkmcnt(0) 1550; GCN1-NEXT: s_add_u32 s0, s0, 16 1551; GCN1-NEXT: s_addc_u32 s1, s1, 0 1552; GCN1-NEXT: v_mov_b32_e32 v0, s0 1553; GCN1-NEXT: v_mov_b32_e32 v1, s1 1554; GCN1-NEXT: v_mov_b32_e32 v2, s4 1555; GCN1-NEXT: flat_atomic_smax v2, v[0:1], v2 glc 1556; GCN1-NEXT: s_waitcnt lgkmcnt(0) 1557; GCN1-NEXT: v_mov_b32_e32 v0, s2 1558; GCN1-NEXT: v_mov_b32_e32 v1, s3 1559; GCN1-NEXT: s_waitcnt vmcnt(0) 1560; GCN1-NEXT: flat_store_dword v[0:1], v2 1561; GCN1-NEXT: s_endpgm 1562; 1563; GCN2-LABEL: atomic_max_i32_ret_offset: 1564; GCN2: ; %bb.0: ; %entry 1565; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1566; GCN2-NEXT: s_load_dword s4, s[4:5], 0x34 1567; GCN2-NEXT: s_waitcnt lgkmcnt(0) 1568; GCN2-NEXT: s_add_u32 s0, s0, 16 1569; GCN2-NEXT: s_addc_u32 s1, s1, 0 1570; 
; Tail of the GCN2 checks, the GCN3 checks and the IR of @atomic_max_i32_ret_offset:
; volatile seq_cst `atomicrmw max` at workgroup scope on %out advanced by 4 x i32;
; the returned value is stored to %out2. The checks expect `s_waitcnt lgkmcnt(0)`
; after flat_atomic_smax and `s_waitcnt vmcnt(0)` right before flat_store_dword.
; The rest of the line opens @atomic_max_i32_addr64_offset.
GCN2-NEXT: v_mov_b32_e32 v0, s0 1571; GCN2-NEXT: v_mov_b32_e32 v1, s1 1572; GCN2-NEXT: v_mov_b32_e32 v2, s4 1573; GCN2-NEXT: flat_atomic_smax v2, v[0:1], v2 glc 1574; GCN2-NEXT: s_waitcnt lgkmcnt(0) 1575; GCN2-NEXT: v_mov_b32_e32 v0, s2 1576; GCN2-NEXT: v_mov_b32_e32 v1, s3 1577; GCN2-NEXT: s_waitcnt vmcnt(0) 1578; GCN2-NEXT: flat_store_dword v[0:1], v2 1579; GCN2-NEXT: s_endpgm 1580; 1581; GCN3-LABEL: atomic_max_i32_ret_offset: 1582; GCN3: ; %bb.0: ; %entry 1583; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1584; GCN3-NEXT: s_load_dword s6, s[4:5], 0x34 1585; GCN3-NEXT: s_waitcnt lgkmcnt(0) 1586; GCN3-NEXT: v_mov_b32_e32 v0, s0 1587; GCN3-NEXT: v_mov_b32_e32 v1, s1 1588; GCN3-NEXT: v_mov_b32_e32 v2, s6 1589; GCN3-NEXT: flat_atomic_smax v2, v[0:1], v2 offset:16 glc 1590; GCN3-NEXT: s_waitcnt lgkmcnt(0) 1591; GCN3-NEXT: v_mov_b32_e32 v0, s2 1592; GCN3-NEXT: v_mov_b32_e32 v1, s3 1593; GCN3-NEXT: s_waitcnt vmcnt(0) 1594; GCN3-NEXT: flat_store_dword v[0:1], v2 1595; GCN3-NEXT: s_endpgm 1596entry: 1597 %gep = getelementptr i32, ptr %out, i32 4 1598 %val = atomicrmw volatile max ptr %gep, i32 %in syncscope("workgroup") seq_cst 1599 store i32 %val, ptr %out2 1600 ret void 1601} 1602 1603define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { 1604; GCN1-LABEL: atomic_max_i32_addr64_offset: 1605; GCN1: ; %bb.0: ; %entry 1606; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 1607; GCN1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9 1608; GCN1-NEXT: s_load_dword s4, s[4:5], 0xb 1609; GCN1-NEXT: s_waitcnt lgkmcnt(0) 1610; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 1611; GCN1-NEXT: s_add_u32 s0, s2, s0 1612; GCN1-NEXT: s_addc_u32 s1, s3, s1 1613; GCN1-NEXT: s_add_u32 s0, s0, 16 1614; GCN1-NEXT: s_addc_u32 s1, s1, 0 1615; GCN1-NEXT: v_mov_b32_e32 v0, s0 1616; GCN1-NEXT: v_mov_b32_e32 v1, s1 1617; GCN1-NEXT: v_mov_b32_e32 v2, s4 1618; GCN1-NEXT: flat_atomic_smax v[0:1], v2 1619; GCN1-NEXT: s_waitcnt lgkmcnt(0) 1620; GCN1-NEXT: s_endpgm 1621; 1622; 
GCN2-LABEL: atomic_max_i32_addr64_offset: 1623; GCN2: ; %bb.0: ; %entry 1624; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 1625; GCN2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 1626; GCN2-NEXT: s_load_dword s4, s[4:5], 0x2c 1627; GCN2-NEXT: s_waitcnt lgkmcnt(0) 1628; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 1629; GCN2-NEXT: s_add_u32 s0, s2, s0 1630; GCN2-NEXT: s_addc_u32 s1, s3, s1 1631; GCN2-NEXT: s_add_u32 s0, s0, 16 1632; GCN2-NEXT: s_addc_u32 s1, s1, 0 1633; GCN2-NEXT: v_mov_b32_e32 v0, s0 1634; GCN2-NEXT: v_mov_b32_e32 v1, s1 1635; GCN2-NEXT: v_mov_b32_e32 v2, s4 1636; GCN2-NEXT: flat_atomic_smax v[0:1], v2 1637; GCN2-NEXT: s_waitcnt lgkmcnt(0) 1638; GCN2-NEXT: s_endpgm 1639; 1640; GCN3-LABEL: atomic_max_i32_addr64_offset: 1641; GCN3: ; %bb.0: ; %entry 1642; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 1643; GCN3-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 1644; GCN3-NEXT: s_load_dword s6, s[4:5], 0x2c 1645; GCN3-NEXT: s_waitcnt lgkmcnt(0) 1646; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 1647; GCN3-NEXT: s_add_u32 s0, s2, s0 1648; GCN3-NEXT: s_addc_u32 s1, s3, s1 1649; GCN3-NEXT: v_mov_b32_e32 v0, s0 1650; GCN3-NEXT: v_mov_b32_e32 v1, s1 1651; GCN3-NEXT: v_mov_b32_e32 v2, s6 1652; GCN3-NEXT: flat_atomic_smax v[0:1], v2 offset:16 1653; GCN3-NEXT: s_waitcnt lgkmcnt(0) 1654; GCN3-NEXT: s_endpgm 1655entry: 1656 %ptr = getelementptr i32, ptr %out, i64 %index 1657 %gep = getelementptr i32, ptr %ptr, i32 4 1658 %val = atomicrmw volatile max ptr %gep, i32 %in syncscope("workgroup") seq_cst 1659 ret void 1660} 1661; atomicrmw max on %out indexed by the i64 %index plus 4 x i32, with the old value stored to %out2: the checks expect the index scaled with s_lshl_b64 by 2, the glc form of flat_atomic_smax, and the extra 16 bytes folded into offset:16 on GCN3 only. 1662define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { 1663; GCN1-LABEL: atomic_max_i32_ret_addr64_offset: 1664; GCN1: ; %bb.0: ; %entry 1665; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xf 1666; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1667; GCN1-NEXT: s_load_dword s8, s[4:5], 0xd 1668; GCN1-NEXT: s_waitcnt lgkmcnt(0) 1669; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 1670; GCN1-NEXT: s_add_u32 s0, s0, s4 
1671; GCN1-NEXT: s_addc_u32 s1, s1, s5 1672; GCN1-NEXT: s_add_u32 s0, s0, 16 1673; GCN1-NEXT: s_addc_u32 s1, s1, 0 1674; GCN1-NEXT: v_mov_b32_e32 v0, s0 1675; GCN1-NEXT: v_mov_b32_e32 v1, s1 1676; GCN1-NEXT: v_mov_b32_e32 v2, s8 1677; GCN1-NEXT: flat_atomic_smax v2, v[0:1], v2 glc 1678; GCN1-NEXT: s_waitcnt lgkmcnt(0) 1679; GCN1-NEXT: v_mov_b32_e32 v0, s2 1680; GCN1-NEXT: v_mov_b32_e32 v1, s3 1681; GCN1-NEXT: s_waitcnt vmcnt(0) 1682; GCN1-NEXT: flat_store_dword v[0:1], v2 1683; GCN1-NEXT: s_endpgm 1684; 1685; GCN2-LABEL: atomic_max_i32_ret_addr64_offset: 1686; GCN2: ; %bb.0: ; %entry 1687; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c 1688; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1689; GCN2-NEXT: s_load_dword s8, s[4:5], 0x34 1690; GCN2-NEXT: s_waitcnt lgkmcnt(0) 1691; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 1692; GCN2-NEXT: s_add_u32 s0, s0, s4 1693; GCN2-NEXT: s_addc_u32 s1, s1, s5 1694; GCN2-NEXT: s_add_u32 s0, s0, 16 1695; GCN2-NEXT: s_addc_u32 s1, s1, 0 1696; GCN2-NEXT: v_mov_b32_e32 v0, s0 1697; GCN2-NEXT: v_mov_b32_e32 v1, s1 1698; GCN2-NEXT: v_mov_b32_e32 v2, s8 1699; GCN2-NEXT: flat_atomic_smax v2, v[0:1], v2 glc 1700; GCN2-NEXT: s_waitcnt lgkmcnt(0) 1701; GCN2-NEXT: v_mov_b32_e32 v0, s2 1702; GCN2-NEXT: v_mov_b32_e32 v1, s3 1703; GCN2-NEXT: s_waitcnt vmcnt(0) 1704; GCN2-NEXT: flat_store_dword v[0:1], v2 1705; GCN2-NEXT: s_endpgm 1706; 1707; GCN3-LABEL: atomic_max_i32_ret_addr64_offset: 1708; GCN3: ; %bb.0: ; %entry 1709; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c 1710; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1711; GCN3-NEXT: s_load_dword s8, s[4:5], 0x34 1712; GCN3-NEXT: s_waitcnt lgkmcnt(0) 1713; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 1714; GCN3-NEXT: s_add_u32 s0, s0, s4 1715; GCN3-NEXT: s_addc_u32 s1, s1, s5 1716; GCN3-NEXT: v_mov_b32_e32 v0, s0 1717; GCN3-NEXT: v_mov_b32_e32 v1, s1 1718; GCN3-NEXT: v_mov_b32_e32 v2, s8 1719; GCN3-NEXT: flat_atomic_smax v2, v[0:1], v2 offset:16 glc 1720; GCN3-NEXT: s_waitcnt lgkmcnt(0) 1721; 
GCN3-NEXT: v_mov_b32_e32 v0, s2 1722; GCN3-NEXT: v_mov_b32_e32 v1, s3 1723; GCN3-NEXT: s_waitcnt vmcnt(0) 1724; GCN3-NEXT: flat_store_dword v[0:1], v2 1725; GCN3-NEXT: s_endpgm 1726entry: 1727 %ptr = getelementptr i32, ptr %out, i64 %index 1728 %gep = getelementptr i32, ptr %ptr, i32 4 1729 %val = atomicrmw volatile max ptr %gep, i32 %in syncscope("workgroup") seq_cst 1730 store i32 %val, ptr %out2 1731 ret void 1732} 1733; atomicrmw max directly on %out with the result unused: all three targets are checked for a plain flat_atomic_smax (no glc, no offset) followed only by s_waitcnt lgkmcnt(0). 1734define amdgpu_kernel void @atomic_max_i32(ptr %out, i32 %in) { 1735; GCN1-LABEL: atomic_max_i32: 1736; GCN1: ; %bb.0: ; %entry 1737; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1738; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb 1739; GCN1-NEXT: s_waitcnt lgkmcnt(0) 1740; GCN1-NEXT: v_mov_b32_e32 v0, s0 1741; GCN1-NEXT: v_mov_b32_e32 v1, s1 1742; GCN1-NEXT: v_mov_b32_e32 v2, s2 1743; GCN1-NEXT: flat_atomic_smax v[0:1], v2 1744; GCN1-NEXT: s_waitcnt lgkmcnt(0) 1745; GCN1-NEXT: s_endpgm 1746; 1747; GCN2-LABEL: atomic_max_i32: 1748; GCN2: ; %bb.0: ; %entry 1749; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1750; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c 1751; GCN2-NEXT: s_waitcnt lgkmcnt(0) 1752; GCN2-NEXT: v_mov_b32_e32 v0, s0 1753; GCN2-NEXT: v_mov_b32_e32 v1, s1 1754; GCN2-NEXT: v_mov_b32_e32 v2, s2 1755; GCN2-NEXT: flat_atomic_smax v[0:1], v2 1756; GCN2-NEXT: s_waitcnt lgkmcnt(0) 1757; GCN2-NEXT: s_endpgm 1758; 1759; GCN3-LABEL: atomic_max_i32: 1760; GCN3: ; %bb.0: ; %entry 1761; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1762; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c 1763; GCN3-NEXT: s_waitcnt lgkmcnt(0) 1764; GCN3-NEXT: v_mov_b32_e32 v0, s0 1765; GCN3-NEXT: v_mov_b32_e32 v1, s1 1766; GCN3-NEXT: v_mov_b32_e32 v2, s2 1767; GCN3-NEXT: flat_atomic_smax v[0:1], v2 1768; GCN3-NEXT: s_waitcnt lgkmcnt(0) 1769; GCN3-NEXT: s_endpgm 1770entry: 1771 %val = atomicrmw volatile max ptr %out, i32 %in syncscope("workgroup") seq_cst 1772 ret void 1773} 1774; atomicrmw max directly on %out with the old value stored to %out2: the checks expect the glc form of flat_atomic_smax and an s_waitcnt vmcnt(0) before the flat_store_dword. 1775define amdgpu_kernel void @atomic_max_i32_ret(ptr %out, ptr %out2, i32 %in) { 1776; GCN1-LABEL: 
atomic_max_i32_ret: 1777; GCN1: ; %bb.0: ; %entry 1778; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1779; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd 1780; GCN1-NEXT: s_waitcnt lgkmcnt(0) 1781; GCN1-NEXT: v_mov_b32_e32 v0, s0 1782; GCN1-NEXT: v_mov_b32_e32 v1, s1 1783; GCN1-NEXT: v_mov_b32_e32 v2, s4 1784; GCN1-NEXT: flat_atomic_smax v2, v[0:1], v2 glc 1785; GCN1-NEXT: s_waitcnt lgkmcnt(0) 1786; GCN1-NEXT: v_mov_b32_e32 v0, s2 1787; GCN1-NEXT: v_mov_b32_e32 v1, s3 1788; GCN1-NEXT: s_waitcnt vmcnt(0) 1789; GCN1-NEXT: flat_store_dword v[0:1], v2 1790; GCN1-NEXT: s_endpgm 1791; 1792; GCN2-LABEL: atomic_max_i32_ret: 1793; GCN2: ; %bb.0: ; %entry 1794; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1795; GCN2-NEXT: s_load_dword s4, s[4:5], 0x34 1796; GCN2-NEXT: s_waitcnt lgkmcnt(0) 1797; GCN2-NEXT: v_mov_b32_e32 v0, s0 1798; GCN2-NEXT: v_mov_b32_e32 v1, s1 1799; GCN2-NEXT: v_mov_b32_e32 v2, s4 1800; GCN2-NEXT: flat_atomic_smax v2, v[0:1], v2 glc 1801; GCN2-NEXT: s_waitcnt lgkmcnt(0) 1802; GCN2-NEXT: v_mov_b32_e32 v0, s2 1803; GCN2-NEXT: v_mov_b32_e32 v1, s3 1804; GCN2-NEXT: s_waitcnt vmcnt(0) 1805; GCN2-NEXT: flat_store_dword v[0:1], v2 1806; GCN2-NEXT: s_endpgm 1807; 1808; GCN3-LABEL: atomic_max_i32_ret: 1809; GCN3: ; %bb.0: ; %entry 1810; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1811; GCN3-NEXT: s_load_dword s6, s[4:5], 0x34 1812; GCN3-NEXT: s_waitcnt lgkmcnt(0) 1813; GCN3-NEXT: v_mov_b32_e32 v0, s0 1814; GCN3-NEXT: v_mov_b32_e32 v1, s1 1815; GCN3-NEXT: v_mov_b32_e32 v2, s6 1816; GCN3-NEXT: flat_atomic_smax v2, v[0:1], v2 glc 1817; GCN3-NEXT: s_waitcnt lgkmcnt(0) 1818; GCN3-NEXT: v_mov_b32_e32 v0, s2 1819; GCN3-NEXT: v_mov_b32_e32 v1, s3 1820; GCN3-NEXT: s_waitcnt vmcnt(0) 1821; GCN3-NEXT: flat_store_dword v[0:1], v2 1822; GCN3-NEXT: s_endpgm 1823entry: 1824 %val = atomicrmw volatile max ptr %out, i32 %in syncscope("workgroup") seq_cst 1825 store i32 %val, ptr %out2 1826 ret void 1827} 1828; atomicrmw max on %out indexed by the i64 %index with the result unused: the checks expect the index scaled with s_lshl_b64 by 2 and a plain flat_atomic_smax (no glc, no offset) on all three targets. 1829define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, 
i64 %index) { 1830; GCN1-LABEL: atomic_max_i32_addr64: 1831; GCN1: ; %bb.0: ; %entry 1832; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 1833; GCN1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9 1834; GCN1-NEXT: s_load_dword s4, s[4:5], 0xb 1835; GCN1-NEXT: s_waitcnt lgkmcnt(0) 1836; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 1837; GCN1-NEXT: s_add_u32 s0, s2, s0 1838; GCN1-NEXT: s_addc_u32 s1, s3, s1 1839; GCN1-NEXT: v_mov_b32_e32 v0, s0 1840; GCN1-NEXT: v_mov_b32_e32 v1, s1 1841; GCN1-NEXT: v_mov_b32_e32 v2, s4 1842; GCN1-NEXT: flat_atomic_smax v[0:1], v2 1843; GCN1-NEXT: s_waitcnt lgkmcnt(0) 1844; GCN1-NEXT: s_endpgm 1845; 1846; GCN2-LABEL: atomic_max_i32_addr64: 1847; GCN2: ; %bb.0: ; %entry 1848; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 1849; GCN2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 1850; GCN2-NEXT: s_load_dword s4, s[4:5], 0x2c 1851; GCN2-NEXT: s_waitcnt lgkmcnt(0) 1852; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 1853; GCN2-NEXT: s_add_u32 s0, s2, s0 1854; GCN2-NEXT: s_addc_u32 s1, s3, s1 1855; GCN2-NEXT: v_mov_b32_e32 v0, s0 1856; GCN2-NEXT: v_mov_b32_e32 v1, s1 1857; GCN2-NEXT: v_mov_b32_e32 v2, s4 1858; GCN2-NEXT: flat_atomic_smax v[0:1], v2 1859; GCN2-NEXT: s_waitcnt lgkmcnt(0) 1860; GCN2-NEXT: s_endpgm 1861; 1862; GCN3-LABEL: atomic_max_i32_addr64: 1863; GCN3: ; %bb.0: ; %entry 1864; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 1865; GCN3-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 1866; GCN3-NEXT: s_load_dword s6, s[4:5], 0x2c 1867; GCN3-NEXT: s_waitcnt lgkmcnt(0) 1868; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 1869; GCN3-NEXT: s_add_u32 s0, s2, s0 1870; GCN3-NEXT: s_addc_u32 s1, s3, s1 1871; GCN3-NEXT: v_mov_b32_e32 v0, s0 1872; GCN3-NEXT: v_mov_b32_e32 v1, s1 1873; GCN3-NEXT: v_mov_b32_e32 v2, s6 1874; GCN3-NEXT: flat_atomic_smax v[0:1], v2 1875; GCN3-NEXT: s_waitcnt lgkmcnt(0) 1876; GCN3-NEXT: s_endpgm 1877entry: 1878 %ptr = getelementptr i32, ptr %out, i64 %index 1879 %val = atomicrmw volatile max ptr %ptr, i32 %in syncscope("workgroup") seq_cst 1880 
ret void 1881} 1882; atomicrmw max on %out indexed by the i64 %index with the old value stored to %out2: the checks expect the index scaled with s_lshl_b64 by 2 and the glc form of flat_atomic_smax with no offset on all three targets. 1883define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { 1884; GCN1-LABEL: atomic_max_i32_ret_addr64: 1885; GCN1: ; %bb.0: ; %entry 1886; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xf 1887; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1888; GCN1-NEXT: s_load_dword s8, s[4:5], 0xd 1889; GCN1-NEXT: s_waitcnt lgkmcnt(0) 1890; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 1891; GCN1-NEXT: s_add_u32 s0, s0, s4 1892; GCN1-NEXT: s_addc_u32 s1, s1, s5 1893; GCN1-NEXT: v_mov_b32_e32 v0, s0 1894; GCN1-NEXT: v_mov_b32_e32 v1, s1 1895; GCN1-NEXT: v_mov_b32_e32 v2, s8 1896; GCN1-NEXT: flat_atomic_smax v2, v[0:1], v2 glc 1897; GCN1-NEXT: s_waitcnt lgkmcnt(0) 1898; GCN1-NEXT: v_mov_b32_e32 v0, s2 1899; GCN1-NEXT: v_mov_b32_e32 v1, s3 1900; GCN1-NEXT: s_waitcnt vmcnt(0) 1901; GCN1-NEXT: flat_store_dword v[0:1], v2 1902; GCN1-NEXT: s_endpgm 1903; 1904; GCN2-LABEL: atomic_max_i32_ret_addr64: 1905; GCN2: ; %bb.0: ; %entry 1906; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c 1907; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1908; GCN2-NEXT: s_load_dword s8, s[4:5], 0x34 1909; GCN2-NEXT: s_waitcnt lgkmcnt(0) 1910; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 1911; GCN2-NEXT: s_add_u32 s0, s0, s4 1912; GCN2-NEXT: s_addc_u32 s1, s1, s5 1913; GCN2-NEXT: v_mov_b32_e32 v0, s0 1914; GCN2-NEXT: v_mov_b32_e32 v1, s1 1915; GCN2-NEXT: v_mov_b32_e32 v2, s8 1916; GCN2-NEXT: flat_atomic_smax v2, v[0:1], v2 glc 1917; GCN2-NEXT: s_waitcnt lgkmcnt(0) 1918; GCN2-NEXT: v_mov_b32_e32 v0, s2 1919; GCN2-NEXT: v_mov_b32_e32 v1, s3 1920; GCN2-NEXT: s_waitcnt vmcnt(0) 1921; GCN2-NEXT: flat_store_dword v[0:1], v2 1922; GCN2-NEXT: s_endpgm 1923; 1924; GCN3-LABEL: atomic_max_i32_ret_addr64: 1925; GCN3: ; %bb.0: ; %entry 1926; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c 1927; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1928; GCN3-NEXT: s_load_dword s8, s[4:5], 0x34 1929; GCN3-NEXT: s_waitcnt lgkmcnt(0) 1930; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 
1931; GCN3-NEXT: s_add_u32 s0, s0, s4 1932; GCN3-NEXT: s_addc_u32 s1, s1, s5 1933; GCN3-NEXT: v_mov_b32_e32 v0, s0 1934; GCN3-NEXT: v_mov_b32_e32 v1, s1 1935; GCN3-NEXT: v_mov_b32_e32 v2, s8 1936; GCN3-NEXT: flat_atomic_smax v2, v[0:1], v2 glc 1937; GCN3-NEXT: s_waitcnt lgkmcnt(0) 1938; GCN3-NEXT: v_mov_b32_e32 v0, s2 1939; GCN3-NEXT: v_mov_b32_e32 v1, s3 1940; GCN3-NEXT: s_waitcnt vmcnt(0) 1941; GCN3-NEXT: flat_store_dword v[0:1], v2 1942; GCN3-NEXT: s_endpgm 1943entry: 1944 %ptr = getelementptr i32, ptr %out, i64 %index 1945 %val = atomicrmw volatile max ptr %ptr, i32 %in syncscope("workgroup") seq_cst 1946 store i32 %val, ptr %out2 1947 ret void 1948} 1949; atomicrmw umax on %out + 4 x i32 with the result unused: the checks expect a non-glc flat_atomic_umax, the 16-byte offset added with s_add_u32/s_addc_u32 on GCN1/GCN2 and folded into offset:16 on GCN3. 1950define amdgpu_kernel void @atomic_umax_i32_offset(ptr %out, i32 %in) { 1951; GCN1-LABEL: atomic_umax_i32_offset: 1952; GCN1: ; %bb.0: ; %entry 1953; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1954; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb 1955; GCN1-NEXT: s_waitcnt lgkmcnt(0) 1956; GCN1-NEXT: s_add_u32 s0, s0, 16 1957; GCN1-NEXT: s_addc_u32 s1, s1, 0 1958; GCN1-NEXT: v_mov_b32_e32 v0, s0 1959; GCN1-NEXT: v_mov_b32_e32 v1, s1 1960; GCN1-NEXT: v_mov_b32_e32 v2, s2 1961; GCN1-NEXT: flat_atomic_umax v[0:1], v2 1962; GCN1-NEXT: s_waitcnt lgkmcnt(0) 1963; GCN1-NEXT: s_endpgm 1964; 1965; GCN2-LABEL: atomic_umax_i32_offset: 1966; GCN2: ; %bb.0: ; %entry 1967; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1968; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c 1969; GCN2-NEXT: s_waitcnt lgkmcnt(0) 1970; GCN2-NEXT: s_add_u32 s0, s0, 16 1971; GCN2-NEXT: s_addc_u32 s1, s1, 0 1972; GCN2-NEXT: v_mov_b32_e32 v0, s0 1973; GCN2-NEXT: v_mov_b32_e32 v1, s1 1974; GCN2-NEXT: v_mov_b32_e32 v2, s2 1975; GCN2-NEXT: flat_atomic_umax v[0:1], v2 1976; GCN2-NEXT: s_waitcnt lgkmcnt(0) 1977; GCN2-NEXT: s_endpgm 1978; 1979; GCN3-LABEL: atomic_umax_i32_offset: 1980; GCN3: ; %bb.0: ; %entry 1981; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1982; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c 1983; GCN3-NEXT: s_waitcnt lgkmcnt(0) 1984; GCN3-NEXT: 
v_mov_b32_e32 v0, s0 1985; GCN3-NEXT: v_mov_b32_e32 v1, s1 1986; GCN3-NEXT: v_mov_b32_e32 v2, s2 1987; GCN3-NEXT: flat_atomic_umax v[0:1], v2 offset:16 1988; GCN3-NEXT: s_waitcnt lgkmcnt(0) 1989; GCN3-NEXT: s_endpgm 1990entry: 1991 %gep = getelementptr i32, ptr %out, i32 4 1992 %val = atomicrmw volatile umax ptr %gep, i32 %in syncscope("workgroup") seq_cst 1993 ret void 1994} 1995; atomicrmw umax on %out + 4 x i32 with the old value stored to %out2: the checks expect flat_atomic_umax in its glc form, the 16-byte offset added with s_add_u32/s_addc_u32 on GCN1/GCN2 and folded into offset:16 on GCN3. 1996define amdgpu_kernel void @atomic_umax_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { 1997; GCN1-LABEL: atomic_umax_i32_ret_offset: 1998; GCN1: ; %bb.0: ; %entry 1999; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 2000; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd 2001; GCN1-NEXT: s_waitcnt lgkmcnt(0) 2002; GCN1-NEXT: s_add_u32 s0, s0, 16 2003; GCN1-NEXT: s_addc_u32 s1, s1, 0 2004; GCN1-NEXT: v_mov_b32_e32 v0, s0 2005; GCN1-NEXT: v_mov_b32_e32 v1, s1 2006; GCN1-NEXT: v_mov_b32_e32 v2, s4 2007; GCN1-NEXT: flat_atomic_umax v2, v[0:1], v2 glc 2008; GCN1-NEXT: s_waitcnt lgkmcnt(0) 2009; GCN1-NEXT: v_mov_b32_e32 v0, s2 2010; GCN1-NEXT: v_mov_b32_e32 v1, s3 2011; GCN1-NEXT: s_waitcnt vmcnt(0) 2012; GCN1-NEXT: flat_store_dword v[0:1], v2 2013; GCN1-NEXT: s_endpgm 2014; 2015; GCN2-LABEL: atomic_umax_i32_ret_offset: 2016; GCN2: ; %bb.0: ; %entry 2017; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2018; GCN2-NEXT: s_load_dword s4, s[4:5], 0x34 2019; GCN2-NEXT: s_waitcnt lgkmcnt(0) 2020; GCN2-NEXT: s_add_u32 s0, s0, 16 2021; GCN2-NEXT: s_addc_u32 s1, s1, 0 2022; GCN2-NEXT: v_mov_b32_e32 v0, s0 2023; GCN2-NEXT: v_mov_b32_e32 v1, s1 2024; GCN2-NEXT: v_mov_b32_e32 v2, s4 2025; GCN2-NEXT: flat_atomic_umax v2, v[0:1], v2 glc 2026; GCN2-NEXT: s_waitcnt lgkmcnt(0) 2027; GCN2-NEXT: v_mov_b32_e32 v0, s2 2028; GCN2-NEXT: v_mov_b32_e32 v1, s3 2029; GCN2-NEXT: s_waitcnt vmcnt(0) 2030; GCN2-NEXT: flat_store_dword v[0:1], v2 2031; GCN2-NEXT: s_endpgm 2032; 2033; GCN3-LABEL: atomic_umax_i32_ret_offset: 2034; GCN3: ; %bb.0: ; %entry 2035; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2036; GCN3-NEXT: s_load_dword 
s6, s[4:5], 0x34 2037; GCN3-NEXT: s_waitcnt lgkmcnt(0) 2038; GCN3-NEXT: v_mov_b32_e32 v0, s0 2039; GCN3-NEXT: v_mov_b32_e32 v1, s1 2040; GCN3-NEXT: v_mov_b32_e32 v2, s6 2041; GCN3-NEXT: flat_atomic_umax v2, v[0:1], v2 offset:16 glc 2042; GCN3-NEXT: s_waitcnt lgkmcnt(0) 2043; GCN3-NEXT: v_mov_b32_e32 v0, s2 2044; GCN3-NEXT: v_mov_b32_e32 v1, s3 2045; GCN3-NEXT: s_waitcnt vmcnt(0) 2046; GCN3-NEXT: flat_store_dword v[0:1], v2 2047; GCN3-NEXT: s_endpgm 2048entry: 2049 %gep = getelementptr i32, ptr %out, i32 4 2050 %val = atomicrmw volatile umax ptr %gep, i32 %in syncscope("workgroup") seq_cst 2051 store i32 %val, ptr %out2 2052 ret void 2053} 2054; atomicrmw umax on %out indexed by the i64 %index plus 4 x i32, result unused: the checks expect the index scaled with s_lshl_b64 by 2, a non-glc flat_atomic_umax, and the extra 16 bytes folded into offset:16 on GCN3 only. 2055define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { 2056; GCN1-LABEL: atomic_umax_i32_addr64_offset: 2057; GCN1: ; %bb.0: ; %entry 2058; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 2059; GCN1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9 2060; GCN1-NEXT: s_load_dword s4, s[4:5], 0xb 2061; GCN1-NEXT: s_waitcnt lgkmcnt(0) 2062; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 2063; GCN1-NEXT: s_add_u32 s0, s2, s0 2064; GCN1-NEXT: s_addc_u32 s1, s3, s1 2065; GCN1-NEXT: s_add_u32 s0, s0, 16 2066; GCN1-NEXT: s_addc_u32 s1, s1, 0 2067; GCN1-NEXT: v_mov_b32_e32 v0, s0 2068; GCN1-NEXT: v_mov_b32_e32 v1, s1 2069; GCN1-NEXT: v_mov_b32_e32 v2, s4 2070; GCN1-NEXT: flat_atomic_umax v[0:1], v2 2071; GCN1-NEXT: s_waitcnt lgkmcnt(0) 2072; GCN1-NEXT: s_endpgm 2073; 2074; GCN2-LABEL: atomic_umax_i32_addr64_offset: 2075; GCN2: ; %bb.0: ; %entry 2076; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 2077; GCN2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 2078; GCN2-NEXT: s_load_dword s4, s[4:5], 0x2c 2079; GCN2-NEXT: s_waitcnt lgkmcnt(0) 2080; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 2081; GCN2-NEXT: s_add_u32 s0, s2, s0 2082; GCN2-NEXT: s_addc_u32 s1, s3, s1 2083; GCN2-NEXT: s_add_u32 s0, s0, 16 2084; GCN2-NEXT: s_addc_u32 s1, s1, 0 2085; GCN2-NEXT: v_mov_b32_e32 v0, s0 2086; GCN2-NEXT: v_mov_b32_e32 v1, s1 2087; 
GCN2-NEXT: v_mov_b32_e32 v2, s4 2088; GCN2-NEXT: flat_atomic_umax v[0:1], v2 2089; GCN2-NEXT: s_waitcnt lgkmcnt(0) 2090; GCN2-NEXT: s_endpgm 2091; 2092; GCN3-LABEL: atomic_umax_i32_addr64_offset: 2093; GCN3: ; %bb.0: ; %entry 2094; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 2095; GCN3-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 2096; GCN3-NEXT: s_load_dword s6, s[4:5], 0x2c 2097; GCN3-NEXT: s_waitcnt lgkmcnt(0) 2098; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 2099; GCN3-NEXT: s_add_u32 s0, s2, s0 2100; GCN3-NEXT: s_addc_u32 s1, s3, s1 2101; GCN3-NEXT: v_mov_b32_e32 v0, s0 2102; GCN3-NEXT: v_mov_b32_e32 v1, s1 2103; GCN3-NEXT: v_mov_b32_e32 v2, s6 2104; GCN3-NEXT: flat_atomic_umax v[0:1], v2 offset:16 2105; GCN3-NEXT: s_waitcnt lgkmcnt(0) 2106; GCN3-NEXT: s_endpgm 2107entry: 2108 %ptr = getelementptr i32, ptr %out, i64 %index 2109 %gep = getelementptr i32, ptr %ptr, i32 4 2110 %val = atomicrmw volatile umax ptr %gep, i32 %in syncscope("workgroup") seq_cst 2111 ret void 2112} 2113; atomicrmw umax on %out indexed by the i64 %index plus 4 x i32, with the old value stored to %out2: the checks expect the index scaled with s_lshl_b64 by 2, the glc form of flat_atomic_umax, and the extra 16 bytes folded into offset:16 on GCN3 only. 2114define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { 2115; GCN1-LABEL: atomic_umax_i32_ret_addr64_offset: 2116; GCN1: ; %bb.0: ; %entry 2117; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xf 2118; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 2119; GCN1-NEXT: s_load_dword s8, s[4:5], 0xd 2120; GCN1-NEXT: s_waitcnt lgkmcnt(0) 2121; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 2122; GCN1-NEXT: s_add_u32 s0, s0, s4 2123; GCN1-NEXT: s_addc_u32 s1, s1, s5 2124; GCN1-NEXT: s_add_u32 s0, s0, 16 2125; GCN1-NEXT: s_addc_u32 s1, s1, 0 2126; GCN1-NEXT: v_mov_b32_e32 v0, s0 2127; GCN1-NEXT: v_mov_b32_e32 v1, s1 2128; GCN1-NEXT: v_mov_b32_e32 v2, s8 2129; GCN1-NEXT: flat_atomic_umax v2, v[0:1], v2 glc 2130; GCN1-NEXT: s_waitcnt lgkmcnt(0) 2131; GCN1-NEXT: v_mov_b32_e32 v0, s2 2132; GCN1-NEXT: v_mov_b32_e32 v1, s3 2133; GCN1-NEXT: s_waitcnt vmcnt(0) 2134; GCN1-NEXT: flat_store_dword v[0:1], v2 2135; GCN1-NEXT: s_endpgm 2136; 2137; GCN2-LABEL: 
atomic_umax_i32_ret_addr64_offset: 2138; GCN2: ; %bb.0: ; %entry 2139; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c 2140; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2141; GCN2-NEXT: s_load_dword s8, s[4:5], 0x34 2142; GCN2-NEXT: s_waitcnt lgkmcnt(0) 2143; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 2144; GCN2-NEXT: s_add_u32 s0, s0, s4 2145; GCN2-NEXT: s_addc_u32 s1, s1, s5 2146; GCN2-NEXT: s_add_u32 s0, s0, 16 2147; GCN2-NEXT: s_addc_u32 s1, s1, 0 2148; GCN2-NEXT: v_mov_b32_e32 v0, s0 2149; GCN2-NEXT: v_mov_b32_e32 v1, s1 2150; GCN2-NEXT: v_mov_b32_e32 v2, s8 2151; GCN2-NEXT: flat_atomic_umax v2, v[0:1], v2 glc 2152; GCN2-NEXT: s_waitcnt lgkmcnt(0) 2153; GCN2-NEXT: v_mov_b32_e32 v0, s2 2154; GCN2-NEXT: v_mov_b32_e32 v1, s3 2155; GCN2-NEXT: s_waitcnt vmcnt(0) 2156; GCN2-NEXT: flat_store_dword v[0:1], v2 2157; GCN2-NEXT: s_endpgm 2158; 2159; GCN3-LABEL: atomic_umax_i32_ret_addr64_offset: 2160; GCN3: ; %bb.0: ; %entry 2161; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c 2162; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2163; GCN3-NEXT: s_load_dword s8, s[4:5], 0x34 2164; GCN3-NEXT: s_waitcnt lgkmcnt(0) 2165; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 2166; GCN3-NEXT: s_add_u32 s0, s0, s4 2167; GCN3-NEXT: s_addc_u32 s1, s1, s5 2168; GCN3-NEXT: v_mov_b32_e32 v0, s0 2169; GCN3-NEXT: v_mov_b32_e32 v1, s1 2170; GCN3-NEXT: v_mov_b32_e32 v2, s8 2171; GCN3-NEXT: flat_atomic_umax v2, v[0:1], v2 offset:16 glc 2172; GCN3-NEXT: s_waitcnt lgkmcnt(0) 2173; GCN3-NEXT: v_mov_b32_e32 v0, s2 2174; GCN3-NEXT: v_mov_b32_e32 v1, s3 2175; GCN3-NEXT: s_waitcnt vmcnt(0) 2176; GCN3-NEXT: flat_store_dword v[0:1], v2 2177; GCN3-NEXT: s_endpgm 2178entry: 2179 %ptr = getelementptr i32, ptr %out, i64 %index 2180 %gep = getelementptr i32, ptr %ptr, i32 4 2181 %val = atomicrmw volatile umax ptr %gep, i32 %in syncscope("workgroup") seq_cst 2182 store i32 %val, ptr %out2 2183 ret void 2184} 2185; atomicrmw umax directly on %out with the result unused: all three targets are checked for a plain flat_atomic_umax (no glc, no offset) followed only by s_waitcnt lgkmcnt(0). 2186define amdgpu_kernel void @atomic_umax_i32(ptr %out, i32 %in) { 2187; GCN1-LABEL: atomic_umax_i32: 
2188; GCN1: ; %bb.0: ; %entry 2189; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 2190; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb 2191; GCN1-NEXT: s_waitcnt lgkmcnt(0) 2192; GCN1-NEXT: v_mov_b32_e32 v0, s0 2193; GCN1-NEXT: v_mov_b32_e32 v1, s1 2194; GCN1-NEXT: v_mov_b32_e32 v2, s2 2195; GCN1-NEXT: flat_atomic_umax v[0:1], v2 2196; GCN1-NEXT: s_waitcnt lgkmcnt(0) 2197; GCN1-NEXT: s_endpgm 2198; 2199; GCN2-LABEL: atomic_umax_i32: 2200; GCN2: ; %bb.0: ; %entry 2201; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 2202; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c 2203; GCN2-NEXT: s_waitcnt lgkmcnt(0) 2204; GCN2-NEXT: v_mov_b32_e32 v0, s0 2205; GCN2-NEXT: v_mov_b32_e32 v1, s1 2206; GCN2-NEXT: v_mov_b32_e32 v2, s2 2207; GCN2-NEXT: flat_atomic_umax v[0:1], v2 2208; GCN2-NEXT: s_waitcnt lgkmcnt(0) 2209; GCN2-NEXT: s_endpgm 2210; 2211; GCN3-LABEL: atomic_umax_i32: 2212; GCN3: ; %bb.0: ; %entry 2213; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 2214; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c 2215; GCN3-NEXT: s_waitcnt lgkmcnt(0) 2216; GCN3-NEXT: v_mov_b32_e32 v0, s0 2217; GCN3-NEXT: v_mov_b32_e32 v1, s1 2218; GCN3-NEXT: v_mov_b32_e32 v2, s2 2219; GCN3-NEXT: flat_atomic_umax v[0:1], v2 2220; GCN3-NEXT: s_waitcnt lgkmcnt(0) 2221; GCN3-NEXT: s_endpgm 2222entry: 2223 %val = atomicrmw volatile umax ptr %out, i32 %in syncscope("workgroup") seq_cst 2224 ret void 2225} 2226; atomicrmw umax directly on %out with the old value stored to %out2: the checks expect the glc form of flat_atomic_umax and an s_waitcnt vmcnt(0) before the flat_store_dword. 2227define amdgpu_kernel void @atomic_umax_i32_ret(ptr %out, ptr %out2, i32 %in) { 2228; GCN1-LABEL: atomic_umax_i32_ret: 2229; GCN1: ; %bb.0: ; %entry 2230; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 2231; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd 2232; GCN1-NEXT: s_waitcnt lgkmcnt(0) 2233; GCN1-NEXT: v_mov_b32_e32 v0, s0 2234; GCN1-NEXT: v_mov_b32_e32 v1, s1 2235; GCN1-NEXT: v_mov_b32_e32 v2, s4 2236; GCN1-NEXT: flat_atomic_umax v2, v[0:1], v2 glc 2237; GCN1-NEXT: s_waitcnt lgkmcnt(0) 2238; GCN1-NEXT: v_mov_b32_e32 v0, s2 2239; GCN1-NEXT: v_mov_b32_e32 v1, s3 2240; GCN1-NEXT: s_waitcnt vmcnt(0) 2241; GCN1-NEXT: 
flat_store_dword v[0:1], v2 2242; GCN1-NEXT: s_endpgm 2243; 2244; GCN2-LABEL: atomic_umax_i32_ret: 2245; GCN2: ; %bb.0: ; %entry 2246; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2247; GCN2-NEXT: s_load_dword s4, s[4:5], 0x34 2248; GCN2-NEXT: s_waitcnt lgkmcnt(0) 2249; GCN2-NEXT: v_mov_b32_e32 v0, s0 2250; GCN2-NEXT: v_mov_b32_e32 v1, s1 2251; GCN2-NEXT: v_mov_b32_e32 v2, s4 2252; GCN2-NEXT: flat_atomic_umax v2, v[0:1], v2 glc 2253; GCN2-NEXT: s_waitcnt lgkmcnt(0) 2254; GCN2-NEXT: v_mov_b32_e32 v0, s2 2255; GCN2-NEXT: v_mov_b32_e32 v1, s3 2256; GCN2-NEXT: s_waitcnt vmcnt(0) 2257; GCN2-NEXT: flat_store_dword v[0:1], v2 2258; GCN2-NEXT: s_endpgm 2259; 2260; GCN3-LABEL: atomic_umax_i32_ret: 2261; GCN3: ; %bb.0: ; %entry 2262; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2263; GCN3-NEXT: s_load_dword s6, s[4:5], 0x34 2264; GCN3-NEXT: s_waitcnt lgkmcnt(0) 2265; GCN3-NEXT: v_mov_b32_e32 v0, s0 2266; GCN3-NEXT: v_mov_b32_e32 v1, s1 2267; GCN3-NEXT: v_mov_b32_e32 v2, s6 2268; GCN3-NEXT: flat_atomic_umax v2, v[0:1], v2 glc 2269; GCN3-NEXT: s_waitcnt lgkmcnt(0) 2270; GCN3-NEXT: v_mov_b32_e32 v0, s2 2271; GCN3-NEXT: v_mov_b32_e32 v1, s3 2272; GCN3-NEXT: s_waitcnt vmcnt(0) 2273; GCN3-NEXT: flat_store_dword v[0:1], v2 2274; GCN3-NEXT: s_endpgm 2275entry: 2276 %val = atomicrmw volatile umax ptr %out, i32 %in syncscope("workgroup") seq_cst 2277 store i32 %val, ptr %out2 2278 ret void 2279} 2280; atomicrmw umax on %out indexed by the i64 %index with the result unused: the checks expect the index scaled with s_lshl_b64 by 2 and a plain flat_atomic_umax (no glc, no offset) on all three targets. 2281define amdgpu_kernel void @atomic_umax_i32_addr64(ptr %out, i32 %in, i64 %index) { 2282; GCN1-LABEL: atomic_umax_i32_addr64: 2283; GCN1: ; %bb.0: ; %entry 2284; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 2285; GCN1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9 2286; GCN1-NEXT: s_load_dword s4, s[4:5], 0xb 2287; GCN1-NEXT: s_waitcnt lgkmcnt(0) 2288; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 2289; GCN1-NEXT: s_add_u32 s0, s2, s0 2290; GCN1-NEXT: s_addc_u32 s1, s3, s1 2291; GCN1-NEXT: v_mov_b32_e32 v0, s0 2292; GCN1-NEXT: v_mov_b32_e32 v1, s1 2293; GCN1-NEXT: v_mov_b32_e32 v2, s4 2294; 
GCN1-NEXT: flat_atomic_umax v[0:1], v2 2295; GCN1-NEXT: s_waitcnt lgkmcnt(0) 2296; GCN1-NEXT: s_endpgm 2297; 2298; GCN2-LABEL: atomic_umax_i32_addr64: 2299; GCN2: ; %bb.0: ; %entry 2300; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 2301; GCN2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 2302; GCN2-NEXT: s_load_dword s4, s[4:5], 0x2c 2303; GCN2-NEXT: s_waitcnt lgkmcnt(0) 2304; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 2305; GCN2-NEXT: s_add_u32 s0, s2, s0 2306; GCN2-NEXT: s_addc_u32 s1, s3, s1 2307; GCN2-NEXT: v_mov_b32_e32 v0, s0 2308; GCN2-NEXT: v_mov_b32_e32 v1, s1 2309; GCN2-NEXT: v_mov_b32_e32 v2, s4 2310; GCN2-NEXT: flat_atomic_umax v[0:1], v2 2311; GCN2-NEXT: s_waitcnt lgkmcnt(0) 2312; GCN2-NEXT: s_endpgm 2313; 2314; GCN3-LABEL: atomic_umax_i32_addr64: 2315; GCN3: ; %bb.0: ; %entry 2316; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 2317; GCN3-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 2318; GCN3-NEXT: s_load_dword s6, s[4:5], 0x2c 2319; GCN3-NEXT: s_waitcnt lgkmcnt(0) 2320; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 2321; GCN3-NEXT: s_add_u32 s0, s2, s0 2322; GCN3-NEXT: s_addc_u32 s1, s3, s1 2323; GCN3-NEXT: v_mov_b32_e32 v0, s0 2324; GCN3-NEXT: v_mov_b32_e32 v1, s1 2325; GCN3-NEXT: v_mov_b32_e32 v2, s6 2326; GCN3-NEXT: flat_atomic_umax v[0:1], v2 2327; GCN3-NEXT: s_waitcnt lgkmcnt(0) 2328; GCN3-NEXT: s_endpgm 2329entry: 2330 %ptr = getelementptr i32, ptr %out, i64 %index 2331 %val = atomicrmw volatile umax ptr %ptr, i32 %in syncscope("workgroup") seq_cst 2332 ret void 2333} 2334; atomicrmw umax on %out indexed by the i64 %index with the old value stored to %out2: the checks expect the index scaled with s_lshl_b64 by 2 and the glc form of flat_atomic_umax with no offset on all three targets. 2335define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { 2336; GCN1-LABEL: atomic_umax_i32_ret_addr64: 2337; GCN1: ; %bb.0: ; %entry 2338; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xf 2339; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 2340; GCN1-NEXT: s_load_dword s8, s[4:5], 0xd 2341; GCN1-NEXT: s_waitcnt lgkmcnt(0) 2342; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 2343; GCN1-NEXT: s_add_u32 s0, s0, s4 2344; GCN1-NEXT: s_addc_u32 s1, s1, s5 
2345; GCN1-NEXT: v_mov_b32_e32 v0, s0 2346; GCN1-NEXT: v_mov_b32_e32 v1, s1 2347; GCN1-NEXT: v_mov_b32_e32 v2, s8 2348; GCN1-NEXT: flat_atomic_umax v2, v[0:1], v2 glc 2349; GCN1-NEXT: s_waitcnt lgkmcnt(0) 2350; GCN1-NEXT: v_mov_b32_e32 v0, s2 2351; GCN1-NEXT: v_mov_b32_e32 v1, s3 2352; GCN1-NEXT: s_waitcnt vmcnt(0) 2353; GCN1-NEXT: flat_store_dword v[0:1], v2 2354; GCN1-NEXT: s_endpgm 2355; 2356; GCN2-LABEL: atomic_umax_i32_ret_addr64: 2357; GCN2: ; %bb.0: ; %entry 2358; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c 2359; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2360; GCN2-NEXT: s_load_dword s8, s[4:5], 0x34 2361; GCN2-NEXT: s_waitcnt lgkmcnt(0) 2362; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 2363; GCN2-NEXT: s_add_u32 s0, s0, s4 2364; GCN2-NEXT: s_addc_u32 s1, s1, s5 2365; GCN2-NEXT: v_mov_b32_e32 v0, s0 2366; GCN2-NEXT: v_mov_b32_e32 v1, s1 2367; GCN2-NEXT: v_mov_b32_e32 v2, s8 2368; GCN2-NEXT: flat_atomic_umax v2, v[0:1], v2 glc 2369; GCN2-NEXT: s_waitcnt lgkmcnt(0) 2370; GCN2-NEXT: v_mov_b32_e32 v0, s2 2371; GCN2-NEXT: v_mov_b32_e32 v1, s3 2372; GCN2-NEXT: s_waitcnt vmcnt(0) 2373; GCN2-NEXT: flat_store_dword v[0:1], v2 2374; GCN2-NEXT: s_endpgm 2375; 2376; GCN3-LABEL: atomic_umax_i32_ret_addr64: 2377; GCN3: ; %bb.0: ; %entry 2378; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c 2379; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2380; GCN3-NEXT: s_load_dword s8, s[4:5], 0x34 2381; GCN3-NEXT: s_waitcnt lgkmcnt(0) 2382; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 2383; GCN3-NEXT: s_add_u32 s0, s0, s4 2384; GCN3-NEXT: s_addc_u32 s1, s1, s5 2385; GCN3-NEXT: v_mov_b32_e32 v0, s0 2386; GCN3-NEXT: v_mov_b32_e32 v1, s1 2387; GCN3-NEXT: v_mov_b32_e32 v2, s8 2388; GCN3-NEXT: flat_atomic_umax v2, v[0:1], v2 glc 2389; GCN3-NEXT: s_waitcnt lgkmcnt(0) 2390; GCN3-NEXT: v_mov_b32_e32 v0, s2 2391; GCN3-NEXT: v_mov_b32_e32 v1, s3 2392; GCN3-NEXT: s_waitcnt vmcnt(0) 2393; GCN3-NEXT: flat_store_dword v[0:1], v2 2394; GCN3-NEXT: s_endpgm 2395entry: 2396 %ptr = getelementptr i32, 
ptr %out, i64 %index 2397 %val = atomicrmw volatile umax ptr %ptr, i32 %in syncscope("workgroup") seq_cst 2398 store i32 %val, ptr %out2 2399 ret void 2400} 2401; atomicrmw min on %out + 4 x i32 with the result unused: the checks expect a non-glc flat_atomic_smin, the 16-byte offset added with s_add_u32/s_addc_u32 on GCN1/GCN2 and folded into offset:16 on GCN3. 2402define amdgpu_kernel void @atomic_min_i32_offset(ptr %out, i32 %in) { 2403; GCN1-LABEL: atomic_min_i32_offset: 2404; GCN1: ; %bb.0: ; %entry 2405; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 2406; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb 2407; GCN1-NEXT: s_waitcnt lgkmcnt(0) 2408; GCN1-NEXT: s_add_u32 s0, s0, 16 2409; GCN1-NEXT: s_addc_u32 s1, s1, 0 2410; GCN1-NEXT: v_mov_b32_e32 v0, s0 2411; GCN1-NEXT: v_mov_b32_e32 v1, s1 2412; GCN1-NEXT: v_mov_b32_e32 v2, s2 2413; GCN1-NEXT: flat_atomic_smin v[0:1], v2 2414; GCN1-NEXT: s_waitcnt lgkmcnt(0) 2415; GCN1-NEXT: s_endpgm 2416; 2417; GCN2-LABEL: atomic_min_i32_offset: 2418; GCN2: ; %bb.0: ; %entry 2419; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 2420; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c 2421; GCN2-NEXT: s_waitcnt lgkmcnt(0) 2422; GCN2-NEXT: s_add_u32 s0, s0, 16 2423; GCN2-NEXT: s_addc_u32 s1, s1, 0 2424; GCN2-NEXT: v_mov_b32_e32 v0, s0 2425; GCN2-NEXT: v_mov_b32_e32 v1, s1 2426; GCN2-NEXT: v_mov_b32_e32 v2, s2 2427; GCN2-NEXT: flat_atomic_smin v[0:1], v2 2428; GCN2-NEXT: s_waitcnt lgkmcnt(0) 2429; GCN2-NEXT: s_endpgm 2430; 2431; GCN3-LABEL: atomic_min_i32_offset: 2432; GCN3: ; %bb.0: ; %entry 2433; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 2434; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c 2435; GCN3-NEXT: s_waitcnt lgkmcnt(0) 2436; GCN3-NEXT: v_mov_b32_e32 v0, s0 2437; GCN3-NEXT: v_mov_b32_e32 v1, s1 2438; GCN3-NEXT: v_mov_b32_e32 v2, s2 2439; GCN3-NEXT: flat_atomic_smin v[0:1], v2 offset:16 2440; GCN3-NEXT: s_waitcnt lgkmcnt(0) 2441; GCN3-NEXT: s_endpgm 2442entry: 2443 %gep = getelementptr i32, ptr %out, i32 4 2444 %val = atomicrmw volatile min ptr %gep, i32 %in syncscope("workgroup") seq_cst 2445 ret void 2446} 2447; atomicrmw min on %out + 4 x i32 with the old value stored to %out2; the IR body in this function stores %val to %out2 after the atomic. 2448define amdgpu_kernel void @atomic_min_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { 2449; GCN1-LABEL: 
atomic_min_i32_ret_offset: 2450; GCN1: ; %bb.0: ; %entry 2451; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 2452; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd 2453; GCN1-NEXT: s_waitcnt lgkmcnt(0) 2454; GCN1-NEXT: s_add_u32 s0, s0, 16 2455; GCN1-NEXT: s_addc_u32 s1, s1, 0 2456; GCN1-NEXT: v_mov_b32_e32 v0, s0 2457; GCN1-NEXT: v_mov_b32_e32 v1, s1 2458; GCN1-NEXT: v_mov_b32_e32 v2, s4 2459; GCN1-NEXT: flat_atomic_smin v2, v[0:1], v2 glc 2460; GCN1-NEXT: s_waitcnt lgkmcnt(0) 2461; GCN1-NEXT: v_mov_b32_e32 v0, s2 2462; GCN1-NEXT: v_mov_b32_e32 v1, s3 2463; GCN1-NEXT: s_waitcnt vmcnt(0) 2464; GCN1-NEXT: flat_store_dword v[0:1], v2 2465; GCN1-NEXT: s_endpgm 2466; 2467; GCN2-LABEL: atomic_min_i32_ret_offset: 2468; GCN2: ; %bb.0: ; %entry 2469; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2470; GCN2-NEXT: s_load_dword s4, s[4:5], 0x34 2471; GCN2-NEXT: s_waitcnt lgkmcnt(0) 2472; GCN2-NEXT: s_add_u32 s0, s0, 16 2473; GCN2-NEXT: s_addc_u32 s1, s1, 0 2474; GCN2-NEXT: v_mov_b32_e32 v0, s0 2475; GCN2-NEXT: v_mov_b32_e32 v1, s1 2476; GCN2-NEXT: v_mov_b32_e32 v2, s4 2477; GCN2-NEXT: flat_atomic_smin v2, v[0:1], v2 glc 2478; GCN2-NEXT: s_waitcnt lgkmcnt(0) 2479; GCN2-NEXT: v_mov_b32_e32 v0, s2 2480; GCN2-NEXT: v_mov_b32_e32 v1, s3 2481; GCN2-NEXT: s_waitcnt vmcnt(0) 2482; GCN2-NEXT: flat_store_dword v[0:1], v2 2483; GCN2-NEXT: s_endpgm 2484; 2485; GCN3-LABEL: atomic_min_i32_ret_offset: 2486; GCN3: ; %bb.0: ; %entry 2487; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2488; GCN3-NEXT: s_load_dword s6, s[4:5], 0x34 2489; GCN3-NEXT: s_waitcnt lgkmcnt(0) 2490; GCN3-NEXT: v_mov_b32_e32 v0, s0 2491; GCN3-NEXT: v_mov_b32_e32 v1, s1 2492; GCN3-NEXT: v_mov_b32_e32 v2, s6 2493; GCN3-NEXT: flat_atomic_smin v2, v[0:1], v2 offset:16 glc 2494; GCN3-NEXT: s_waitcnt lgkmcnt(0) 2495; GCN3-NEXT: v_mov_b32_e32 v0, s2 2496; GCN3-NEXT: v_mov_b32_e32 v1, s3 2497; GCN3-NEXT: s_waitcnt vmcnt(0) 2498; GCN3-NEXT: flat_store_dword v[0:1], v2 2499; GCN3-NEXT: s_endpgm 2500entry: 2501 %gep = getelementptr i32, 
ptr %out, i32 4
  %val = atomicrmw volatile min ptr %gep, i32 %in syncscope("workgroup") seq_cst
  store i32 %val, ptr %out2
  ret void
}

; Signed min with the result unused: atomicrmw on %out[%index] plus 4 i32
; elements (16 bytes), workgroup scope, seq_cst. The bonaire and tonga checks
; add the 16 with s_add_u32/s_addc_u32; the gfx900 checks fold it into offset:16.
define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_min_i32_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
; GCN1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9
; GCN1-NEXT: s_load_dword s4, s[4:5], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s2, s0
; GCN1-NEXT: s_addc_u32 s1, s3, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: flat_atomic_smin v[0:1], v2
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_min_i32_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
; GCN2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
; GCN2-NEXT: s_load_dword s4, s[4:5], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s2, s0
; GCN2-NEXT: s_addc_u32 s1, s3, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: flat_atomic_smin v[0:1], v2
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_min_i32_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
; GCN3-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
; GCN3-NEXT: s_load_dword s6, s[4:5], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s2, s0
; GCN3-NEXT: s_addc_u32 s1, s3, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_atomic_smin v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_endpgm
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  %gep = getelementptr i32, ptr %ptr, i32 4
  %val = atomicrmw volatile min ptr %gep, i32 %in syncscope("workgroup") seq_cst
  ret void
}

; Signed min whose old value is stored to %out2: atomicrmw on %out[%index] plus
; 4 i32 elements (16 bytes), workgroup scope, seq_cst. The checks expect the
; glc form of flat_atomic_smin and an s_waitcnt vmcnt(0) before the store.
define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_min_i32_ret_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xf
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN1-NEXT: s_load_dword s8, s[4:5], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 2
; GCN1-NEXT: s_add_u32 s0, s0, s4
; GCN1-NEXT: s_addc_u32 s1, s1, s5
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s8
; GCN1-NEXT: flat_atomic_smin v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_min_i32_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN2-NEXT: s_load_dword s8, s[4:5], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 2
; GCN2-NEXT: s_add_u32 s0, s0, s4
; GCN2-NEXT: s_addc_u32 s1, s1, s5
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s8
; GCN2-NEXT: flat_atomic_smin v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_min_i32_ret_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN3-NEXT: s_load_dword s8, s[4:5], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 2
; GCN3-NEXT: s_add_u32 s0, s0, s4
; GCN3-NEXT: s_addc_u32 s1, s1, s5
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s8
; GCN3-NEXT: flat_atomic_smin v2, v[0:1], v2 offset:16 glc
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s2
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  %gep = getelementptr i32, ptr %ptr, i32 4
  %val = atomicrmw volatile min ptr %gep, i32 %in syncscope("workgroup") seq_cst
  store i32 %val, ptr %out2
  ret void
}

; Signed min with the result unused, applied directly to %out (no index, no
; constant offset); workgroup scope, seq_cst. All three targets are checked for
; a plain flat_atomic_smin followed by s_waitcnt lgkmcnt(0).
define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_min_i32:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_smin v[0:1], v2
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_min_i32:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_smin v[0:1], v2
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_min_i32:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s2
; GCN3-NEXT: flat_atomic_smin v[0:1], v2
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_endpgm
entry:
  %val = atomicrmw volatile min ptr %out, i32 %in syncscope("workgroup") seq_cst
  ret void
}

; Signed min applied directly to %out with the old value stored to %out2;
; workgroup scope, seq_cst. The checks expect the glc form of flat_atomic_smin
; and an s_waitcnt vmcnt(0) before the flat_store_dword.
define amdgpu_kernel void @atomic_min_i32_ret(ptr %out, ptr %out2, i32 %in) {
; GCN1-LABEL: atomic_min_i32_ret:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: flat_atomic_smin v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_min_i32_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN2-NEXT: s_load_dword s4, s[4:5], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: flat_atomic_smin v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_min_i32_ret:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN3-NEXT: s_load_dword s6, s[4:5], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_atomic_smin v2, v[0:1], v2 glc
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s2
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
  %val = atomicrmw volatile min ptr %out, i32 %in syncscope("workgroup") seq_cst
  store i32 %val, ptr %out2
  ret void
}

; Signed min with the result unused on %out[%index] (no constant offset);
; workgroup scope, seq_cst. The checks expect the index to be shifted left by 2
; with s_lshl_b64 and then added to the base pointer.
define amdgpu_kernel void @atomic_min_i32_addr64(ptr %out, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_min_i32_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
; GCN1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9
; GCN1-NEXT: s_load_dword s4, s[4:5], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s2, s0
; GCN1-NEXT: s_addc_u32 s1, s3, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: flat_atomic_smin v[0:1], v2
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_min_i32_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
; GCN2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
; GCN2-NEXT: s_load_dword s4, s[4:5], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s2, s0
; GCN2-NEXT: s_addc_u32 s1, s3, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: flat_atomic_smin v[0:1], v2
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_min_i32_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
; GCN3-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
; GCN3-NEXT: s_load_dword s6, s[4:5], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s2, s0
; GCN3-NEXT: s_addc_u32 s1, s3, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_atomic_smin v[0:1], v2
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_endpgm
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  %val = atomicrmw volatile min ptr %ptr, i32 %in syncscope("workgroup") seq_cst
  ret void
}

; Signed min on %out[%index] (no constant offset) with the old value stored to
; %out2; workgroup scope, seq_cst. The checks expect the glc form of
; flat_atomic_smin and an s_waitcnt vmcnt(0) before the flat_store_dword.
define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_min_i32_ret_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xf
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN1-NEXT: s_load_dword s8, s[4:5], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 2
; GCN1-NEXT: s_add_u32 s0, s0, s4
; GCN1-NEXT: s_addc_u32 s1, s1, s5
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s8
; GCN1-NEXT: flat_atomic_smin v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_min_i32_ret_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN2-NEXT: s_load_dword s8, s[4:5], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 2
; GCN2-NEXT: s_add_u32 s0, s0, s4
; GCN2-NEXT: s_addc_u32 s1, s1, s5
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s8
; GCN2-NEXT: flat_atomic_smin v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_min_i32_ret_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN3-NEXT: s_load_dword s8, s[4:5], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 2
; GCN3-NEXT: s_add_u32 s0, s0, s4
; GCN3-NEXT: s_addc_u32 s1, s1, s5
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s8
; GCN3-NEXT: flat_atomic_smin v2, v[0:1], v2 glc
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s2
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  %val = atomicrmw volatile min ptr %ptr, i32 %in syncscope("workgroup") seq_cst
  store i32 %val, ptr %out2
  ret void
}

; Unsigned min with the result unused at %out plus 4 i32 elements (16 bytes);
; workgroup scope, seq_cst. The bonaire and tonga checks add the 16 with
; s_add_u32/s_addc_u32; the gfx900 checks fold it into offset:16.
define amdgpu_kernel void @atomic_umin_i32_offset(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_umin_i32_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_umin v[0:1], v2
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umin_i32_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_umin v[0:1], v2
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umin_i32_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s2
; GCN3-NEXT: flat_atomic_umin v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_endpgm
entry:
  %gep = getelementptr i32, ptr %out, i32 4
  %val = atomicrmw volatile umin ptr %gep, i32 %in syncscope("workgroup") seq_cst
  ret void
}

; Unsigned min at %out plus 4 i32 elements (16 bytes) with the old value stored
; to %out2; workgroup scope, seq_cst. The checks expect the glc form of
; flat_atomic_umin and an s_waitcnt vmcnt(0) before the flat_store_dword.
define amdgpu_kernel void @atomic_umin_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
; GCN1-LABEL: atomic_umin_i32_ret_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: flat_atomic_umin v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umin_i32_ret_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN2-NEXT: s_load_dword s4, s[4:5], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: flat_atomic_umin v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umin_i32_ret_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN3-NEXT: s_load_dword s6, s[4:5], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_atomic_umin v2, v[0:1], v2 offset:16 glc
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s2
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
  %gep = getelementptr i32, ptr %out, i32 4
  %val = atomicrmw volatile umin ptr %gep, i32 %in syncscope("workgroup") seq_cst
  store i32 %val, ptr %out2
  ret void
}

; Unsigned min with the result unused: atomicrmw on %out[%index] plus 4 i32
; elements (16 bytes), workgroup scope, seq_cst. The bonaire and tonga checks
; add the 16 with s_add_u32/s_addc_u32; the gfx900 checks fold it into offset:16.
define amdgpu_kernel void @atomic_umin_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_umin_i32_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
; GCN1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9
; GCN1-NEXT: s_load_dword s4, s[4:5], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s2, s0
; GCN1-NEXT: s_addc_u32 s1, s3, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: flat_atomic_umin v[0:1], v2
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umin_i32_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
; GCN2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
; GCN2-NEXT: s_load_dword s4, s[4:5], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s2, s0
; GCN2-NEXT: s_addc_u32 s1, s3, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: flat_atomic_umin v[0:1], v2
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umin_i32_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
; GCN3-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
; GCN3-NEXT: s_load_dword s6, s[4:5], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s2, s0
; GCN3-NEXT: s_addc_u32 s1, s3, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_atomic_umin v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_endpgm
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  %gep = getelementptr i32, ptr %ptr, i32 4
  %val = atomicrmw volatile umin ptr %gep, i32 %in syncscope("workgroup") seq_cst
  ret void
}

; Unsigned min whose old value is stored to %out2: atomicrmw on %out[%index]
; plus 4 i32 elements (16 bytes), workgroup scope, seq_cst. The checks expect
; the glc form of flat_atomic_umin and an s_waitcnt vmcnt(0) before the store.
define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_umin_i32_ret_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xf
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN1-NEXT: s_load_dword s8, s[4:5], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 2
; GCN1-NEXT: s_add_u32 s0, s0, s4
; GCN1-NEXT: s_addc_u32 s1, s1, s5
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s8
; GCN1-NEXT: flat_atomic_umin v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umin_i32_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN2-NEXT: s_load_dword s8, s[4:5], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 2
; GCN2-NEXT: s_add_u32 s0, s0, s4
; GCN2-NEXT: s_addc_u32 s1, s1, s5
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s8
; GCN2-NEXT: flat_atomic_umin v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umin_i32_ret_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN3-NEXT: s_load_dword s8, s[4:5], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 2
; GCN3-NEXT: s_add_u32 s0, s0, s4
; GCN3-NEXT: s_addc_u32 s1, s1, s5
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s8
; GCN3-NEXT: flat_atomic_umin v2, v[0:1], v2 offset:16 glc
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s2
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  %gep = getelementptr i32, ptr %ptr, i32 4
  %val = atomicrmw volatile umin ptr %gep, i32 %in syncscope("workgroup") seq_cst
  store i32 %val, ptr %out2
  ret void
}

; Unsigned min with the result unused, applied directly to %out (no index, no
; constant offset); workgroup scope, seq_cst. All three targets are checked for
; a plain flat_atomic_umin followed by s_waitcnt lgkmcnt(0).
define amdgpu_kernel void @atomic_umin_i32(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_umin_i32:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_umin v[0:1], v2
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umin_i32:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_umin v[0:1], v2
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umin_i32:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s2
; GCN3-NEXT: flat_atomic_umin v[0:1], v2
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_endpgm
entry:
  %val = atomicrmw volatile umin ptr %out, i32 %in syncscope("workgroup") seq_cst
  ret void
}

; Unsigned min applied directly to %out with the old value stored to %out2;
; workgroup scope, seq_cst. The checks expect the glc form of flat_atomic_umin
; and an s_waitcnt vmcnt(0) before the flat_store_dword.
define amdgpu_kernel void @atomic_umin_i32_ret(ptr %out, ptr %out2, i32 %in) {
; GCN1-LABEL: atomic_umin_i32_ret:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: flat_atomic_umin v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umin_i32_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN2-NEXT: s_load_dword s4, s[4:5], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: flat_atomic_umin v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umin_i32_ret:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN3-NEXT: s_load_dword s6, s[4:5], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_atomic_umin v2, v[0:1], v2 glc
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s2
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
  %val = atomicrmw volatile umin ptr %out, i32 %in syncscope("workgroup") seq_cst
  store i32 %val, ptr %out2
  ret void
}

; Unsigned min with the result unused on %out[%index] (no constant offset);
; workgroup scope, seq_cst. The checks expect the index to be shifted left by 2
; with s_lshl_b64 and then added to the base pointer.
define amdgpu_kernel void @atomic_umin_i32_addr64(ptr %out, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_umin_i32_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
; GCN1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9
; GCN1-NEXT: s_load_dword s4, s[4:5], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s2, s0
; GCN1-NEXT: s_addc_u32 s1, s3, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: flat_atomic_umin v[0:1], v2
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umin_i32_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
; GCN2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
; GCN2-NEXT: s_load_dword s4, s[4:5], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s2, s0
; GCN2-NEXT: s_addc_u32 s1, s3, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: flat_atomic_umin v[0:1], v2
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umin_i32_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
; GCN3-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
; GCN3-NEXT: s_load_dword s6, s[4:5], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s2, s0
; GCN3-NEXT: s_addc_u32 s1, s3, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_atomic_umin v[0:1], v2
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_endpgm
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  %val = atomicrmw volatile umin ptr %ptr, i32 %in syncscope("workgroup") seq_cst
  ret void
}

; Unsigned min on %out[%index] (no constant offset) with the old value stored
; to %out2; workgroup scope, seq_cst. The checks expect the glc form of
; flat_atomic_umin and an s_waitcnt vmcnt(0) before the flat_store_dword.
define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_umin_i32_ret_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xf
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN1-NEXT: s_load_dword s8, s[4:5], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 2
; GCN1-NEXT: s_add_u32 s0, s0, s4
; GCN1-NEXT: s_addc_u32 s1, s1, s5
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s8
; GCN1-NEXT: flat_atomic_umin v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umin_i32_ret_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN2-NEXT: s_load_dword s8, s[4:5], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 2
; GCN2-NEXT: s_add_u32 s0, s0, s4
; GCN2-NEXT: s_addc_u32 s1, s1, s5
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s8
; GCN2-NEXT: flat_atomic_umin v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umin_i32_ret_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN3-NEXT: s_load_dword s8, s[4:5], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 2
; GCN3-NEXT: s_add_u32 s0, s0, s4
; GCN3-NEXT: s_addc_u32 s1, s1, s5
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s8
; GCN3-NEXT: flat_atomic_umin v2, v[0:1], v2 glc
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s2
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  %val = atomicrmw volatile umin ptr %ptr, i32 %in syncscope("workgroup") seq_cst
  store i32 %val, ptr %out2
  ret void
}

; Bitwise or with the result unused at %out plus 4 i32 elements (16 bytes);
; agent scope, seq_cst. The checks expect s_waitcnt vmcnt(0) lgkmcnt(0) and
; buffer_wbinvl1_vol after the flat_atomic_or.
define amdgpu_kernel void @atomic_or_i32_offset(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_or_i32_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_or v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_or_i32_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s0, 16
3328; GCN2-NEXT: s_addc_u32 s1, s1, 0 3329; GCN2-NEXT: v_mov_b32_e32 v0, s0 3330; GCN2-NEXT: v_mov_b32_e32 v1, s1 3331; GCN2-NEXT: v_mov_b32_e32 v2, s2 3332; GCN2-NEXT: flat_atomic_or v[0:1], v2 3333; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3334; GCN2-NEXT: buffer_wbinvl1_vol 3335; GCN2-NEXT: s_endpgm 3336; 3337; GCN3-LABEL: atomic_or_i32_offset: 3338; GCN3: ; %bb.0: ; %entry 3339; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 3340; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c 3341; GCN3-NEXT: s_waitcnt lgkmcnt(0) 3342; GCN3-NEXT: v_mov_b32_e32 v0, s0 3343; GCN3-NEXT: v_mov_b32_e32 v1, s1 3344; GCN3-NEXT: v_mov_b32_e32 v2, s2 3345; GCN3-NEXT: flat_atomic_or v[0:1], v2 offset:16 3346; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3347; GCN3-NEXT: buffer_wbinvl1_vol 3348; GCN3-NEXT: s_endpgm 3349entry: 3350 %gep = getelementptr i32, ptr %out, i32 4 3351 %val = atomicrmw volatile or ptr %gep, i32 %in syncscope("agent") seq_cst 3352 ret void 3353} 3354 3355define amdgpu_kernel void @atomic_or_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { 3356; GCN1-LABEL: atomic_or_i32_ret_offset: 3357; GCN1: ; %bb.0: ; %entry 3358; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 3359; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd 3360; GCN1-NEXT: s_waitcnt lgkmcnt(0) 3361; GCN1-NEXT: s_add_u32 s0, s0, 16 3362; GCN1-NEXT: s_addc_u32 s1, s1, 0 3363; GCN1-NEXT: v_mov_b32_e32 v0, s0 3364; GCN1-NEXT: v_mov_b32_e32 v1, s1 3365; GCN1-NEXT: v_mov_b32_e32 v2, s4 3366; GCN1-NEXT: flat_atomic_or v2, v[0:1], v2 glc 3367; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3368; GCN1-NEXT: buffer_wbinvl1_vol 3369; GCN1-NEXT: v_mov_b32_e32 v0, s2 3370; GCN1-NEXT: v_mov_b32_e32 v1, s3 3371; GCN1-NEXT: flat_store_dword v[0:1], v2 3372; GCN1-NEXT: s_endpgm 3373; 3374; GCN2-LABEL: atomic_or_i32_ret_offset: 3375; GCN2: ; %bb.0: ; %entry 3376; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3377; GCN2-NEXT: s_load_dword s4, s[4:5], 0x34 3378; GCN2-NEXT: s_waitcnt lgkmcnt(0) 3379; GCN2-NEXT: s_add_u32 s0, s0, 16 3380; GCN2-NEXT: 
s_addc_u32 s1, s1, 0 3381; GCN2-NEXT: v_mov_b32_e32 v0, s0 3382; GCN2-NEXT: v_mov_b32_e32 v1, s1 3383; GCN2-NEXT: v_mov_b32_e32 v2, s4 3384; GCN2-NEXT: flat_atomic_or v2, v[0:1], v2 glc 3385; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3386; GCN2-NEXT: buffer_wbinvl1_vol 3387; GCN2-NEXT: v_mov_b32_e32 v0, s2 3388; GCN2-NEXT: v_mov_b32_e32 v1, s3 3389; GCN2-NEXT: flat_store_dword v[0:1], v2 3390; GCN2-NEXT: s_endpgm 3391; 3392; GCN3-LABEL: atomic_or_i32_ret_offset: 3393; GCN3: ; %bb.0: ; %entry 3394; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3395; GCN3-NEXT: s_load_dword s6, s[4:5], 0x34 3396; GCN3-NEXT: s_waitcnt lgkmcnt(0) 3397; GCN3-NEXT: v_mov_b32_e32 v0, s0 3398; GCN3-NEXT: v_mov_b32_e32 v1, s1 3399; GCN3-NEXT: v_mov_b32_e32 v2, s6 3400; GCN3-NEXT: flat_atomic_or v2, v[0:1], v2 offset:16 glc 3401; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3402; GCN3-NEXT: buffer_wbinvl1_vol 3403; GCN3-NEXT: v_mov_b32_e32 v0, s2 3404; GCN3-NEXT: v_mov_b32_e32 v1, s3 3405; GCN3-NEXT: flat_store_dword v[0:1], v2 3406; GCN3-NEXT: s_endpgm 3407entry: 3408 %gep = getelementptr i32, ptr %out, i32 4 3409 %val = atomicrmw volatile or ptr %gep, i32 %in syncscope("agent") seq_cst 3410 store i32 %val, ptr %out2 3411 ret void 3412} 3413 ; Volatile agent-scope seq_cst atomicrmw or, result unused, at %out + %index x i32 + 4 x i32. Checks expect the i64 index scaled by s_lshl_b64 and added with s_add_u32/s_addc_u32 on all three targets; only gfx900 folds the constant into offset:16. 3414define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { 3415; GCN1-LABEL: atomic_or_i32_addr64_offset: 3416; GCN1: ; %bb.0: ; %entry 3417; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 3418; GCN1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9 3419; GCN1-NEXT: s_load_dword s4, s[4:5], 0xb 3420; GCN1-NEXT: s_waitcnt lgkmcnt(0) 3421; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 3422; GCN1-NEXT: s_add_u32 s0, s2, s0 3423; GCN1-NEXT: s_addc_u32 s1, s3, s1 3424; GCN1-NEXT: s_add_u32 s0, s0, 16 3425; GCN1-NEXT: s_addc_u32 s1, s1, 0 3426; GCN1-NEXT: v_mov_b32_e32 v0, s0 3427; GCN1-NEXT: v_mov_b32_e32 v1, s1 3428; GCN1-NEXT: v_mov_b32_e32 v2, s4 3429; GCN1-NEXT: flat_atomic_or v[0:1], v2 3430; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3431; 
GCN1-NEXT: buffer_wbinvl1_vol 3432; GCN1-NEXT: s_endpgm 3433; 3434; GCN2-LABEL: atomic_or_i32_addr64_offset: 3435; GCN2: ; %bb.0: ; %entry 3436; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 3437; GCN2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 3438; GCN2-NEXT: s_load_dword s4, s[4:5], 0x2c 3439; GCN2-NEXT: s_waitcnt lgkmcnt(0) 3440; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 3441; GCN2-NEXT: s_add_u32 s0, s2, s0 3442; GCN2-NEXT: s_addc_u32 s1, s3, s1 3443; GCN2-NEXT: s_add_u32 s0, s0, 16 3444; GCN2-NEXT: s_addc_u32 s1, s1, 0 3445; GCN2-NEXT: v_mov_b32_e32 v0, s0 3446; GCN2-NEXT: v_mov_b32_e32 v1, s1 3447; GCN2-NEXT: v_mov_b32_e32 v2, s4 3448; GCN2-NEXT: flat_atomic_or v[0:1], v2 3449; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3450; GCN2-NEXT: buffer_wbinvl1_vol 3451; GCN2-NEXT: s_endpgm 3452; 3453; GCN3-LABEL: atomic_or_i32_addr64_offset: 3454; GCN3: ; %bb.0: ; %entry 3455; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 3456; GCN3-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 3457; GCN3-NEXT: s_load_dword s6, s[4:5], 0x2c 3458; GCN3-NEXT: s_waitcnt lgkmcnt(0) 3459; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 3460; GCN3-NEXT: s_add_u32 s0, s2, s0 3461; GCN3-NEXT: s_addc_u32 s1, s3, s1 3462; GCN3-NEXT: v_mov_b32_e32 v0, s0 3463; GCN3-NEXT: v_mov_b32_e32 v1, s1 3464; GCN3-NEXT: v_mov_b32_e32 v2, s6 3465; GCN3-NEXT: flat_atomic_or v[0:1], v2 offset:16 3466; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3467; GCN3-NEXT: buffer_wbinvl1_vol 3468; GCN3-NEXT: s_endpgm 3469entry: 3470 %ptr = getelementptr i32, ptr %out, i64 %index 3471 %gep = getelementptr i32, ptr %ptr, i32 4 3472 %val = atomicrmw volatile or ptr %gep, i32 %in syncscope("agent") seq_cst 3473 ret void 3474} 3475 ; Volatile agent-scope seq_cst atomicrmw or at %out + %index x i32 + 4 x i32; the old value is stored to %out2. Checks expect the index scaled by s_lshl_b64 on all targets, the glc form of flat_atomic_or with a v2 destination, then flat_store_dword; only gfx900 folds the constant into offset:16. 3476define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { 3477; GCN1-LABEL: atomic_or_i32_ret_addr64_offset: 3478; GCN1: ; %bb.0: ; %entry 3479; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xf 3480; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 3481; GCN1-NEXT: 
s_load_dword s8, s[4:5], 0xd 3482; GCN1-NEXT: s_waitcnt lgkmcnt(0) 3483; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 3484; GCN1-NEXT: s_add_u32 s0, s0, s4 3485; GCN1-NEXT: s_addc_u32 s1, s1, s5 3486; GCN1-NEXT: s_add_u32 s0, s0, 16 3487; GCN1-NEXT: s_addc_u32 s1, s1, 0 3488; GCN1-NEXT: v_mov_b32_e32 v0, s0 3489; GCN1-NEXT: v_mov_b32_e32 v1, s1 3490; GCN1-NEXT: v_mov_b32_e32 v2, s8 3491; GCN1-NEXT: flat_atomic_or v2, v[0:1], v2 glc 3492; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3493; GCN1-NEXT: buffer_wbinvl1_vol 3494; GCN1-NEXT: v_mov_b32_e32 v0, s2 3495; GCN1-NEXT: v_mov_b32_e32 v1, s3 3496; GCN1-NEXT: flat_store_dword v[0:1], v2 3497; GCN1-NEXT: s_endpgm 3498; 3499; GCN2-LABEL: atomic_or_i32_ret_addr64_offset: 3500; GCN2: ; %bb.0: ; %entry 3501; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c 3502; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3503; GCN2-NEXT: s_load_dword s8, s[4:5], 0x34 3504; GCN2-NEXT: s_waitcnt lgkmcnt(0) 3505; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 3506; GCN2-NEXT: s_add_u32 s0, s0, s4 3507; GCN2-NEXT: s_addc_u32 s1, s1, s5 3508; GCN2-NEXT: s_add_u32 s0, s0, 16 3509; GCN2-NEXT: s_addc_u32 s1, s1, 0 3510; GCN2-NEXT: v_mov_b32_e32 v0, s0 3511; GCN2-NEXT: v_mov_b32_e32 v1, s1 3512; GCN2-NEXT: v_mov_b32_e32 v2, s8 3513; GCN2-NEXT: flat_atomic_or v2, v[0:1], v2 glc 3514; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3515; GCN2-NEXT: buffer_wbinvl1_vol 3516; GCN2-NEXT: v_mov_b32_e32 v0, s2 3517; GCN2-NEXT: v_mov_b32_e32 v1, s3 3518; GCN2-NEXT: flat_store_dword v[0:1], v2 3519; GCN2-NEXT: s_endpgm 3520; 3521; GCN3-LABEL: atomic_or_i32_ret_addr64_offset: 3522; GCN3: ; %bb.0: ; %entry 3523; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c 3524; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3525; GCN3-NEXT: s_load_dword s8, s[4:5], 0x34 3526; GCN3-NEXT: s_waitcnt lgkmcnt(0) 3527; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 3528; GCN3-NEXT: s_add_u32 s0, s0, s4 3529; GCN3-NEXT: s_addc_u32 s1, s1, s5 3530; GCN3-NEXT: v_mov_b32_e32 v0, s0 3531; GCN3-NEXT: 
v_mov_b32_e32 v1, s1 3532; GCN3-NEXT: v_mov_b32_e32 v2, s8 3533; GCN3-NEXT: flat_atomic_or v2, v[0:1], v2 offset:16 glc 3534; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3535; GCN3-NEXT: buffer_wbinvl1_vol 3536; GCN3-NEXT: v_mov_b32_e32 v0, s2 3537; GCN3-NEXT: v_mov_b32_e32 v1, s3 3538; GCN3-NEXT: flat_store_dword v[0:1], v2 3539; GCN3-NEXT: s_endpgm 3540entry: 3541 %ptr = getelementptr i32, ptr %out, i64 %index 3542 %gep = getelementptr i32, ptr %ptr, i32 4 3543 %val = atomicrmw volatile or ptr %gep, i32 %in syncscope("agent") seq_cst 3544 store i32 %val, ptr %out2 3545 ret void 3546} 3547 ; Volatile agent-scope seq_cst atomicrmw or, result unused, directly on %out. Checks expect the no-return flat_atomic_or with no address arithmetic, followed by s_waitcnt and buffer_wbinvl1_vol. 3548define amdgpu_kernel void @atomic_or_i32(ptr %out, i32 %in) { 3549; GCN1-LABEL: atomic_or_i32: 3550; GCN1: ; %bb.0: ; %entry 3551; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 3552; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb 3553; GCN1-NEXT: s_waitcnt lgkmcnt(0) 3554; GCN1-NEXT: v_mov_b32_e32 v0, s0 3555; GCN1-NEXT: v_mov_b32_e32 v1, s1 3556; GCN1-NEXT: v_mov_b32_e32 v2, s2 3557; GCN1-NEXT: flat_atomic_or v[0:1], v2 3558; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3559; GCN1-NEXT: buffer_wbinvl1_vol 3560; GCN1-NEXT: s_endpgm 3561; 3562; GCN2-LABEL: atomic_or_i32: 3563; GCN2: ; %bb.0: ; %entry 3564; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 3565; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c 3566; GCN2-NEXT: s_waitcnt lgkmcnt(0) 3567; GCN2-NEXT: v_mov_b32_e32 v0, s0 3568; GCN2-NEXT: v_mov_b32_e32 v1, s1 3569; GCN2-NEXT: v_mov_b32_e32 v2, s2 3570; GCN2-NEXT: flat_atomic_or v[0:1], v2 3571; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3572; GCN2-NEXT: buffer_wbinvl1_vol 3573; GCN2-NEXT: s_endpgm 3574; 3575; GCN3-LABEL: atomic_or_i32: 3576; GCN3: ; %bb.0: ; %entry 3577; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 3578; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c 3579; GCN3-NEXT: s_waitcnt lgkmcnt(0) 3580; GCN3-NEXT: v_mov_b32_e32 v0, s0 3581; GCN3-NEXT: v_mov_b32_e32 v1, s1 3582; GCN3-NEXT: v_mov_b32_e32 v2, s2 3583; GCN3-NEXT: flat_atomic_or v[0:1], v2 3584; GCN3-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) 3585; GCN3-NEXT: buffer_wbinvl1_vol 3586; GCN3-NEXT: s_endpgm 3587entry: 3588 %val = atomicrmw volatile or ptr %out, i32 %in syncscope("agent") seq_cst 3589 ret void 3590} 3591 ; Volatile agent-scope seq_cst atomicrmw or directly on %out; the old value is stored to %out2. Checks expect the glc form of flat_atomic_or with a v2 destination, then buffer_wbinvl1_vol and flat_store_dword. 3592define amdgpu_kernel void @atomic_or_i32_ret(ptr %out, ptr %out2, i32 %in) { 3593; GCN1-LABEL: atomic_or_i32_ret: 3594; GCN1: ; %bb.0: ; %entry 3595; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 3596; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd 3597; GCN1-NEXT: s_waitcnt lgkmcnt(0) 3598; GCN1-NEXT: v_mov_b32_e32 v0, s0 3599; GCN1-NEXT: v_mov_b32_e32 v1, s1 3600; GCN1-NEXT: v_mov_b32_e32 v2, s4 3601; GCN1-NEXT: flat_atomic_or v2, v[0:1], v2 glc 3602; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3603; GCN1-NEXT: buffer_wbinvl1_vol 3604; GCN1-NEXT: v_mov_b32_e32 v0, s2 3605; GCN1-NEXT: v_mov_b32_e32 v1, s3 3606; GCN1-NEXT: flat_store_dword v[0:1], v2 3607; GCN1-NEXT: s_endpgm 3608; 3609; GCN2-LABEL: atomic_or_i32_ret: 3610; GCN2: ; %bb.0: ; %entry 3611; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3612; GCN2-NEXT: s_load_dword s4, s[4:5], 0x34 3613; GCN2-NEXT: s_waitcnt lgkmcnt(0) 3614; GCN2-NEXT: v_mov_b32_e32 v0, s0 3615; GCN2-NEXT: v_mov_b32_e32 v1, s1 3616; GCN2-NEXT: v_mov_b32_e32 v2, s4 3617; GCN2-NEXT: flat_atomic_or v2, v[0:1], v2 glc 3618; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3619; GCN2-NEXT: buffer_wbinvl1_vol 3620; GCN2-NEXT: v_mov_b32_e32 v0, s2 3621; GCN2-NEXT: v_mov_b32_e32 v1, s3 3622; GCN2-NEXT: flat_store_dword v[0:1], v2 3623; GCN2-NEXT: s_endpgm 3624; 3625; GCN3-LABEL: atomic_or_i32_ret: 3626; GCN3: ; %bb.0: ; %entry 3627; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3628; GCN3-NEXT: s_load_dword s6, s[4:5], 0x34 3629; GCN3-NEXT: s_waitcnt lgkmcnt(0) 3630; GCN3-NEXT: v_mov_b32_e32 v0, s0 3631; GCN3-NEXT: v_mov_b32_e32 v1, s1 3632; GCN3-NEXT: v_mov_b32_e32 v2, s6 3633; GCN3-NEXT: flat_atomic_or v2, v[0:1], v2 glc 3634; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3635; GCN3-NEXT: buffer_wbinvl1_vol 3636; GCN3-NEXT: v_mov_b32_e32 v0, s2 3637; GCN3-NEXT: 
v_mov_b32_e32 v1, s3 3638; GCN3-NEXT: flat_store_dword v[0:1], v2 3639; GCN3-NEXT: s_endpgm 3640entry: 3641 %val = atomicrmw volatile or ptr %out, i32 %in syncscope("agent") seq_cst 3642 store i32 %val, ptr %out2 3643 ret void 3644} 3645 ; Volatile agent-scope seq_cst atomicrmw or, result unused, at %out + %index x i32 with no constant offset. Checks expect the i64 index scaled by s_lshl_b64 and added with s_add_u32/s_addc_u32, then the no-return flat_atomic_or without an offset field. 3646define amdgpu_kernel void @atomic_or_i32_addr64(ptr %out, i32 %in, i64 %index) { 3647; GCN1-LABEL: atomic_or_i32_addr64: 3648; GCN1: ; %bb.0: ; %entry 3649; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 3650; GCN1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9 3651; GCN1-NEXT: s_load_dword s4, s[4:5], 0xb 3652; GCN1-NEXT: s_waitcnt lgkmcnt(0) 3653; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 3654; GCN1-NEXT: s_add_u32 s0, s2, s0 3655; GCN1-NEXT: s_addc_u32 s1, s3, s1 3656; GCN1-NEXT: v_mov_b32_e32 v0, s0 3657; GCN1-NEXT: v_mov_b32_e32 v1, s1 3658; GCN1-NEXT: v_mov_b32_e32 v2, s4 3659; GCN1-NEXT: flat_atomic_or v[0:1], v2 3660; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3661; GCN1-NEXT: buffer_wbinvl1_vol 3662; GCN1-NEXT: s_endpgm 3663; 3664; GCN2-LABEL: atomic_or_i32_addr64: 3665; GCN2: ; %bb.0: ; %entry 3666; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 3667; GCN2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 3668; GCN2-NEXT: s_load_dword s4, s[4:5], 0x2c 3669; GCN2-NEXT: s_waitcnt lgkmcnt(0) 3670; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 3671; GCN2-NEXT: s_add_u32 s0, s2, s0 3672; GCN2-NEXT: s_addc_u32 s1, s3, s1 3673; GCN2-NEXT: v_mov_b32_e32 v0, s0 3674; GCN2-NEXT: v_mov_b32_e32 v1, s1 3675; GCN2-NEXT: v_mov_b32_e32 v2, s4 3676; GCN2-NEXT: flat_atomic_or v[0:1], v2 3677; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3678; GCN2-NEXT: buffer_wbinvl1_vol 3679; GCN2-NEXT: s_endpgm 3680; 3681; GCN3-LABEL: atomic_or_i32_addr64: 3682; GCN3: ; %bb.0: ; %entry 3683; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 3684; GCN3-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 3685; GCN3-NEXT: s_load_dword s6, s[4:5], 0x2c 3686; GCN3-NEXT: s_waitcnt lgkmcnt(0) 3687; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 3688; GCN3-NEXT: s_add_u32 s0, s2, s0 3689; GCN3-NEXT: 
s_addc_u32 s1, s3, s1 3690; GCN3-NEXT: v_mov_b32_e32 v0, s0 3691; GCN3-NEXT: v_mov_b32_e32 v1, s1 3692; GCN3-NEXT: v_mov_b32_e32 v2, s6 3693; GCN3-NEXT: flat_atomic_or v[0:1], v2 3694; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3695; GCN3-NEXT: buffer_wbinvl1_vol 3696; GCN3-NEXT: s_endpgm 3697entry: 3698 %ptr = getelementptr i32, ptr %out, i64 %index 3699 %val = atomicrmw volatile or ptr %ptr, i32 %in syncscope("agent") seq_cst 3700 ret void 3701} 3702 ; Volatile agent-scope seq_cst atomicrmw or at %out + %index x i32 with no constant offset; the old value is stored to %out2. Checks expect the index scaled by s_lshl_b64, the glc form of flat_atomic_or with a v2 destination, then buffer_wbinvl1_vol and flat_store_dword. 3703define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { 3704; GCN1-LABEL: atomic_or_i32_ret_addr64: 3705; GCN1: ; %bb.0: ; %entry 3706; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xf 3707; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 3708; GCN1-NEXT: s_load_dword s8, s[4:5], 0xd 3709; GCN1-NEXT: s_waitcnt lgkmcnt(0) 3710; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 3711; GCN1-NEXT: s_add_u32 s0, s0, s4 3712; GCN1-NEXT: s_addc_u32 s1, s1, s5 3713; GCN1-NEXT: v_mov_b32_e32 v0, s0 3714; GCN1-NEXT: v_mov_b32_e32 v1, s1 3715; GCN1-NEXT: v_mov_b32_e32 v2, s8 3716; GCN1-NEXT: flat_atomic_or v2, v[0:1], v2 glc 3717; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3718; GCN1-NEXT: buffer_wbinvl1_vol 3719; GCN1-NEXT: v_mov_b32_e32 v0, s2 3720; GCN1-NEXT: v_mov_b32_e32 v1, s3 3721; GCN1-NEXT: flat_store_dword v[0:1], v2 3722; GCN1-NEXT: s_endpgm 3723; 3724; GCN2-LABEL: atomic_or_i32_ret_addr64: 3725; GCN2: ; %bb.0: ; %entry 3726; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c 3727; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3728; GCN2-NEXT: s_load_dword s8, s[4:5], 0x34 3729; GCN2-NEXT: s_waitcnt lgkmcnt(0) 3730; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 3731; GCN2-NEXT: s_add_u32 s0, s0, s4 3732; GCN2-NEXT: s_addc_u32 s1, s1, s5 3733; GCN2-NEXT: v_mov_b32_e32 v0, s0 3734; GCN2-NEXT: v_mov_b32_e32 v1, s1 3735; GCN2-NEXT: v_mov_b32_e32 v2, s8 3736; GCN2-NEXT: flat_atomic_or v2, v[0:1], v2 glc 3737; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3738; GCN2-NEXT: buffer_wbinvl1_vol 3739; GCN2-NEXT: 
v_mov_b32_e32 v0, s2 3740; GCN2-NEXT: v_mov_b32_e32 v1, s3 3741; GCN2-NEXT: flat_store_dword v[0:1], v2 3742; GCN2-NEXT: s_endpgm 3743; 3744; GCN3-LABEL: atomic_or_i32_ret_addr64: 3745; GCN3: ; %bb.0: ; %entry 3746; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c 3747; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3748; GCN3-NEXT: s_load_dword s8, s[4:5], 0x34 3749; GCN3-NEXT: s_waitcnt lgkmcnt(0) 3750; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 3751; GCN3-NEXT: s_add_u32 s0, s0, s4 3752; GCN3-NEXT: s_addc_u32 s1, s1, s5 3753; GCN3-NEXT: v_mov_b32_e32 v0, s0 3754; GCN3-NEXT: v_mov_b32_e32 v1, s1 3755; GCN3-NEXT: v_mov_b32_e32 v2, s8 3756; GCN3-NEXT: flat_atomic_or v2, v[0:1], v2 glc 3757; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3758; GCN3-NEXT: buffer_wbinvl1_vol 3759; GCN3-NEXT: v_mov_b32_e32 v0, s2 3760; GCN3-NEXT: v_mov_b32_e32 v1, s3 3761; GCN3-NEXT: flat_store_dword v[0:1], v2 3762; GCN3-NEXT: s_endpgm 3763entry: 3764 %ptr = getelementptr i32, ptr %out, i64 %index 3765 %val = atomicrmw volatile or ptr %ptr, i32 %in syncscope("agent") seq_cst 3766 store i32 %val, ptr %out2 3767 ret void 3768} 3769 ; Volatile agent-scope seq_cst atomicrmw xchg, result unused, at %out + 4 x i32. Checks expect the no-return flat_atomic_swap; gfx900 folds the constant into offset:16, bonaire/tonga add 16 with s_add_u32/s_addc_u32. 3770define amdgpu_kernel void @atomic_xchg_i32_offset(ptr %out, i32 %in) { 3771; GCN1-LABEL: atomic_xchg_i32_offset: 3772; GCN1: ; %bb.0: ; %entry 3773; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 3774; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb 3775; GCN1-NEXT: s_waitcnt lgkmcnt(0) 3776; GCN1-NEXT: s_add_u32 s0, s0, 16 3777; GCN1-NEXT: s_addc_u32 s1, s1, 0 3778; GCN1-NEXT: v_mov_b32_e32 v0, s0 3779; GCN1-NEXT: v_mov_b32_e32 v1, s1 3780; GCN1-NEXT: v_mov_b32_e32 v2, s2 3781; GCN1-NEXT: flat_atomic_swap v[0:1], v2 3782; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3783; GCN1-NEXT: buffer_wbinvl1_vol 3784; GCN1-NEXT: s_endpgm 3785; 3786; GCN2-LABEL: atomic_xchg_i32_offset: 3787; GCN2: ; %bb.0: ; %entry 3788; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 3789; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c 3790; GCN2-NEXT: s_waitcnt lgkmcnt(0) 3791; GCN2-NEXT: s_add_u32 s0, s0, 16 
3792; GCN2-NEXT: s_addc_u32 s1, s1, 0 3793; GCN2-NEXT: v_mov_b32_e32 v0, s0 3794; GCN2-NEXT: v_mov_b32_e32 v1, s1 3795; GCN2-NEXT: v_mov_b32_e32 v2, s2 3796; GCN2-NEXT: flat_atomic_swap v[0:1], v2 3797; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3798; GCN2-NEXT: buffer_wbinvl1_vol 3799; GCN2-NEXT: s_endpgm 3800; 3801; GCN3-LABEL: atomic_xchg_i32_offset: 3802; GCN3: ; %bb.0: ; %entry 3803; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 3804; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c 3805; GCN3-NEXT: s_waitcnt lgkmcnt(0) 3806; GCN3-NEXT: v_mov_b32_e32 v0, s0 3807; GCN3-NEXT: v_mov_b32_e32 v1, s1 3808; GCN3-NEXT: v_mov_b32_e32 v2, s2 3809; GCN3-NEXT: flat_atomic_swap v[0:1], v2 offset:16 3810; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3811; GCN3-NEXT: buffer_wbinvl1_vol 3812; GCN3-NEXT: s_endpgm 3813entry: 3814 %gep = getelementptr i32, ptr %out, i32 4 3815 %val = atomicrmw volatile xchg ptr %gep, i32 %in syncscope("agent") seq_cst 3816 ret void 3817} 3818 ; Volatile agent-scope seq_cst atomicrmw xchg of a float operand, result unused, at %out + 4 x float. Checks expect the float to be exchanged with the no-return flat_atomic_swap; gfx900 folds the constant into offset:16, bonaire/tonga add 16 with s_add_u32/s_addc_u32. 3819define amdgpu_kernel void @atomic_xchg_f32_offset(ptr %out, float %in) { 3820; GCN1-LABEL: atomic_xchg_f32_offset: 3821; GCN1: ; %bb.0: ; %entry 3822; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 3823; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb 3824; GCN1-NEXT: s_waitcnt lgkmcnt(0) 3825; GCN1-NEXT: s_add_u32 s0, s0, 16 3826; GCN1-NEXT: s_addc_u32 s1, s1, 0 3827; GCN1-NEXT: v_mov_b32_e32 v0, s0 3828; GCN1-NEXT: v_mov_b32_e32 v1, s1 3829; GCN1-NEXT: v_mov_b32_e32 v2, s2 3830; GCN1-NEXT: flat_atomic_swap v[0:1], v2 3831; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3832; GCN1-NEXT: buffer_wbinvl1_vol 3833; GCN1-NEXT: s_endpgm 3834; 3835; GCN2-LABEL: atomic_xchg_f32_offset: 3836; GCN2: ; %bb.0: ; %entry 3837; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 3838; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c 3839; GCN2-NEXT: s_waitcnt lgkmcnt(0) 3840; GCN2-NEXT: s_add_u32 s0, s0, 16 3841; GCN2-NEXT: s_addc_u32 s1, s1, 0 3842; GCN2-NEXT: v_mov_b32_e32 v0, s0 3843; GCN2-NEXT: v_mov_b32_e32 v1, s1 3844; GCN2-NEXT: v_mov_b32_e32 v2, s2 
3845; GCN2-NEXT: flat_atomic_swap v[0:1], v2 3846; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3847; GCN2-NEXT: buffer_wbinvl1_vol 3848; GCN2-NEXT: s_endpgm 3849; 3850; GCN3-LABEL: atomic_xchg_f32_offset: 3851; GCN3: ; %bb.0: ; %entry 3852; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 3853; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c 3854; GCN3-NEXT: s_waitcnt lgkmcnt(0) 3855; GCN3-NEXT: v_mov_b32_e32 v0, s0 3856; GCN3-NEXT: v_mov_b32_e32 v1, s1 3857; GCN3-NEXT: v_mov_b32_e32 v2, s2 3858; GCN3-NEXT: flat_atomic_swap v[0:1], v2 offset:16 3859; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3860; GCN3-NEXT: buffer_wbinvl1_vol 3861; GCN3-NEXT: s_endpgm 3862entry: 3863 %gep = getelementptr float, ptr %out, i32 4 3864 %val = atomicrmw volatile xchg ptr %gep, float %in syncscope("agent") seq_cst 3865 ret void 3866} 3867 ; Volatile agent-scope seq_cst atomicrmw xchg at %out + 4 x i32; the old value is stored to %out2. Checks expect the glc form of flat_atomic_swap with a v2 destination, then flat_store_dword; gfx900 folds the constant into offset:16, bonaire/tonga add 16 with s_add_u32/s_addc_u32. 3868define amdgpu_kernel void @atomic_xchg_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { 3869; GCN1-LABEL: atomic_xchg_i32_ret_offset: 3870; GCN1: ; %bb.0: ; %entry 3871; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 3872; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd 3873; GCN1-NEXT: s_waitcnt lgkmcnt(0) 3874; GCN1-NEXT: s_add_u32 s0, s0, 16 3875; GCN1-NEXT: s_addc_u32 s1, s1, 0 3876; GCN1-NEXT: v_mov_b32_e32 v0, s0 3877; GCN1-NEXT: v_mov_b32_e32 v1, s1 3878; GCN1-NEXT: v_mov_b32_e32 v2, s4 3879; GCN1-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 3880; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3881; GCN1-NEXT: buffer_wbinvl1_vol 3882; GCN1-NEXT: v_mov_b32_e32 v0, s2 3883; GCN1-NEXT: v_mov_b32_e32 v1, s3 3884; GCN1-NEXT: flat_store_dword v[0:1], v2 3885; GCN1-NEXT: s_endpgm 3886; 3887; GCN2-LABEL: atomic_xchg_i32_ret_offset: 3888; GCN2: ; %bb.0: ; %entry 3889; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3890; GCN2-NEXT: s_load_dword s4, s[4:5], 0x34 3891; GCN2-NEXT: s_waitcnt lgkmcnt(0) 3892; GCN2-NEXT: s_add_u32 s0, s0, 16 3893; GCN2-NEXT: s_addc_u32 s1, s1, 0 3894; GCN2-NEXT: v_mov_b32_e32 v0, s0 3895; GCN2-NEXT: v_mov_b32_e32 v1, s1 3896; GCN2-NEXT: v_mov_b32_e32 v2, 
s4 3897; GCN2-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 3898; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3899; GCN2-NEXT: buffer_wbinvl1_vol 3900; GCN2-NEXT: v_mov_b32_e32 v0, s2 3901; GCN2-NEXT: v_mov_b32_e32 v1, s3 3902; GCN2-NEXT: flat_store_dword v[0:1], v2 3903; GCN2-NEXT: s_endpgm 3904; 3905; GCN3-LABEL: atomic_xchg_i32_ret_offset: 3906; GCN3: ; %bb.0: ; %entry 3907; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3908; GCN3-NEXT: s_load_dword s6, s[4:5], 0x34 3909; GCN3-NEXT: s_waitcnt lgkmcnt(0) 3910; GCN3-NEXT: v_mov_b32_e32 v0, s0 3911; GCN3-NEXT: v_mov_b32_e32 v1, s1 3912; GCN3-NEXT: v_mov_b32_e32 v2, s6 3913; GCN3-NEXT: flat_atomic_swap v2, v[0:1], v2 offset:16 glc 3914; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3915; GCN3-NEXT: buffer_wbinvl1_vol 3916; GCN3-NEXT: v_mov_b32_e32 v0, s2 3917; GCN3-NEXT: v_mov_b32_e32 v1, s3 3918; GCN3-NEXT: flat_store_dword v[0:1], v2 3919; GCN3-NEXT: s_endpgm 3920entry: 3921 %gep = getelementptr i32, ptr %out, i32 4 3922 %val = atomicrmw volatile xchg ptr %gep, i32 %in syncscope("agent") seq_cst 3923 store i32 %val, ptr %out2 3924 ret void 3925} 3926 ; Volatile agent-scope seq_cst atomicrmw xchg, result unused, at %out + %index x i32 + 4 x i32. Checks expect the i64 index scaled by s_lshl_b64 and added with s_add_u32/s_addc_u32 on all three targets; only gfx900 folds the constant into offset:16 on flat_atomic_swap. 3927define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { 3928; GCN1-LABEL: atomic_xchg_i32_addr64_offset: 3929; GCN1: ; %bb.0: ; %entry 3930; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 3931; GCN1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9 3932; GCN1-NEXT: s_load_dword s4, s[4:5], 0xb 3933; GCN1-NEXT: s_waitcnt lgkmcnt(0) 3934; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 3935; GCN1-NEXT: s_add_u32 s0, s2, s0 3936; GCN1-NEXT: s_addc_u32 s1, s3, s1 3937; GCN1-NEXT: s_add_u32 s0, s0, 16 3938; GCN1-NEXT: s_addc_u32 s1, s1, 0 3939; GCN1-NEXT: v_mov_b32_e32 v0, s0 3940; GCN1-NEXT: v_mov_b32_e32 v1, s1 3941; GCN1-NEXT: v_mov_b32_e32 v2, s4 3942; GCN1-NEXT: flat_atomic_swap v[0:1], v2 3943; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3944; GCN1-NEXT: buffer_wbinvl1_vol 3945; GCN1-NEXT: s_endpgm 3946; 3947; GCN2-LABEL: atomic_xchg_i32_addr64_offset: 3948; 
GCN2: ; %bb.0: ; %entry 3949; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 3950; GCN2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 3951; GCN2-NEXT: s_load_dword s4, s[4:5], 0x2c 3952; GCN2-NEXT: s_waitcnt lgkmcnt(0) 3953; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 3954; GCN2-NEXT: s_add_u32 s0, s2, s0 3955; GCN2-NEXT: s_addc_u32 s1, s3, s1 3956; GCN2-NEXT: s_add_u32 s0, s0, 16 3957; GCN2-NEXT: s_addc_u32 s1, s1, 0 3958; GCN2-NEXT: v_mov_b32_e32 v0, s0 3959; GCN2-NEXT: v_mov_b32_e32 v1, s1 3960; GCN2-NEXT: v_mov_b32_e32 v2, s4 3961; GCN2-NEXT: flat_atomic_swap v[0:1], v2 3962; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3963; GCN2-NEXT: buffer_wbinvl1_vol 3964; GCN2-NEXT: s_endpgm 3965; 3966; GCN3-LABEL: atomic_xchg_i32_addr64_offset: 3967; GCN3: ; %bb.0: ; %entry 3968; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 3969; GCN3-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 3970; GCN3-NEXT: s_load_dword s6, s[4:5], 0x2c 3971; GCN3-NEXT: s_waitcnt lgkmcnt(0) 3972; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 3973; GCN3-NEXT: s_add_u32 s0, s2, s0 3974; GCN3-NEXT: s_addc_u32 s1, s3, s1 3975; GCN3-NEXT: v_mov_b32_e32 v0, s0 3976; GCN3-NEXT: v_mov_b32_e32 v1, s1 3977; GCN3-NEXT: v_mov_b32_e32 v2, s6 3978; GCN3-NEXT: flat_atomic_swap v[0:1], v2 offset:16 3979; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3980; GCN3-NEXT: buffer_wbinvl1_vol 3981; GCN3-NEXT: s_endpgm 3982entry: 3983 %ptr = getelementptr i32, ptr %out, i64 %index 3984 %gep = getelementptr i32, ptr %ptr, i32 4 3985 %val = atomicrmw volatile xchg ptr %gep, i32 %in syncscope("agent") seq_cst 3986 ret void 3987} 3988 ; Volatile agent-scope seq_cst atomicrmw xchg at %out + %index x i32 + 4 x i32; the old value is stored to %out2. Checks expect the index scaled by s_lshl_b64 on all targets, the glc form of flat_atomic_swap with a v2 destination, then flat_store_dword; only gfx900 folds the constant into offset:16. 3989define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { 3990; GCN1-LABEL: atomic_xchg_i32_ret_addr64_offset: 3991; GCN1: ; %bb.0: ; %entry 3992; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xf 3993; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 3994; GCN1-NEXT: s_load_dword s8, s[4:5], 0xd 3995; GCN1-NEXT: s_waitcnt lgkmcnt(0) 3996; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 
2 3997; GCN1-NEXT: s_add_u32 s0, s0, s4 3998; GCN1-NEXT: s_addc_u32 s1, s1, s5 3999; GCN1-NEXT: s_add_u32 s0, s0, 16 4000; GCN1-NEXT: s_addc_u32 s1, s1, 0 4001; GCN1-NEXT: v_mov_b32_e32 v0, s0 4002; GCN1-NEXT: v_mov_b32_e32 v1, s1 4003; GCN1-NEXT: v_mov_b32_e32 v2, s8 4004; GCN1-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 4005; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4006; GCN1-NEXT: buffer_wbinvl1_vol 4007; GCN1-NEXT: v_mov_b32_e32 v0, s2 4008; GCN1-NEXT: v_mov_b32_e32 v1, s3 4009; GCN1-NEXT: flat_store_dword v[0:1], v2 4010; GCN1-NEXT: s_endpgm 4011; 4012; GCN2-LABEL: atomic_xchg_i32_ret_addr64_offset: 4013; GCN2: ; %bb.0: ; %entry 4014; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c 4015; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 4016; GCN2-NEXT: s_load_dword s8, s[4:5], 0x34 4017; GCN2-NEXT: s_waitcnt lgkmcnt(0) 4018; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 4019; GCN2-NEXT: s_add_u32 s0, s0, s4 4020; GCN2-NEXT: s_addc_u32 s1, s1, s5 4021; GCN2-NEXT: s_add_u32 s0, s0, 16 4022; GCN2-NEXT: s_addc_u32 s1, s1, 0 4023; GCN2-NEXT: v_mov_b32_e32 v0, s0 4024; GCN2-NEXT: v_mov_b32_e32 v1, s1 4025; GCN2-NEXT: v_mov_b32_e32 v2, s8 4026; GCN2-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 4027; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4028; GCN2-NEXT: buffer_wbinvl1_vol 4029; GCN2-NEXT: v_mov_b32_e32 v0, s2 4030; GCN2-NEXT: v_mov_b32_e32 v1, s3 4031; GCN2-NEXT: flat_store_dword v[0:1], v2 4032; GCN2-NEXT: s_endpgm 4033; 4034; GCN3-LABEL: atomic_xchg_i32_ret_addr64_offset: 4035; GCN3: ; %bb.0: ; %entry 4036; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c 4037; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 4038; GCN3-NEXT: s_load_dword s8, s[4:5], 0x34 4039; GCN3-NEXT: s_waitcnt lgkmcnt(0) 4040; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 4041; GCN3-NEXT: s_add_u32 s0, s0, s4 4042; GCN3-NEXT: s_addc_u32 s1, s1, s5 4043; GCN3-NEXT: v_mov_b32_e32 v0, s0 4044; GCN3-NEXT: v_mov_b32_e32 v1, s1 4045; GCN3-NEXT: v_mov_b32_e32 v2, s8 4046; GCN3-NEXT: flat_atomic_swap v2, v[0:1], v2 
offset:16 glc 4047; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4048; GCN3-NEXT: buffer_wbinvl1_vol 4049; GCN3-NEXT: v_mov_b32_e32 v0, s2 4050; GCN3-NEXT: v_mov_b32_e32 v1, s3 4051; GCN3-NEXT: flat_store_dword v[0:1], v2 4052; GCN3-NEXT: s_endpgm 4053entry: 4054 %ptr = getelementptr i32, ptr %out, i64 %index 4055 %gep = getelementptr i32, ptr %ptr, i32 4 4056 %val = atomicrmw volatile xchg ptr %gep, i32 %in syncscope("agent") seq_cst 4057 store i32 %val, ptr %out2 4058 ret void 4059} 4060 ; Volatile agent-scope seq_cst atomicrmw xchg, result unused, directly on %out. Checks expect the no-return flat_atomic_swap with no address arithmetic, followed by s_waitcnt and buffer_wbinvl1_vol. 4061define amdgpu_kernel void @atomic_xchg_i32(ptr %out, i32 %in) { 4062; GCN1-LABEL: atomic_xchg_i32: 4063; GCN1: ; %bb.0: ; %entry 4064; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 4065; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb 4066; GCN1-NEXT: s_waitcnt lgkmcnt(0) 4067; GCN1-NEXT: v_mov_b32_e32 v0, s0 4068; GCN1-NEXT: v_mov_b32_e32 v1, s1 4069; GCN1-NEXT: v_mov_b32_e32 v2, s2 4070; GCN1-NEXT: flat_atomic_swap v[0:1], v2 4071; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4072; GCN1-NEXT: buffer_wbinvl1_vol 4073; GCN1-NEXT: s_endpgm 4074; 4075; GCN2-LABEL: atomic_xchg_i32: 4076; GCN2: ; %bb.0: ; %entry 4077; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 4078; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c 4079; GCN2-NEXT: s_waitcnt lgkmcnt(0) 4080; GCN2-NEXT: v_mov_b32_e32 v0, s0 4081; GCN2-NEXT: v_mov_b32_e32 v1, s1 4082; GCN2-NEXT: v_mov_b32_e32 v2, s2 4083; GCN2-NEXT: flat_atomic_swap v[0:1], v2 4084; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4085; GCN2-NEXT: buffer_wbinvl1_vol 4086; GCN2-NEXT: s_endpgm 4087; 4088; GCN3-LABEL: atomic_xchg_i32: 4089; GCN3: ; %bb.0: ; %entry 4090; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 4091; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c 4092; GCN3-NEXT: s_waitcnt lgkmcnt(0) 4093; GCN3-NEXT: v_mov_b32_e32 v0, s0 4094; GCN3-NEXT: v_mov_b32_e32 v1, s1 4095; GCN3-NEXT: v_mov_b32_e32 v2, s2 4096; GCN3-NEXT: flat_atomic_swap v[0:1], v2 4097; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4098; GCN3-NEXT: buffer_wbinvl1_vol 4099; GCN3-NEXT: s_endpgm 4100entry: 4101 
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") seq_cst 4102 ret void 4103} 4104 ; Volatile agent-scope seq_cst atomicrmw xchg directly on %out; the old value is stored to %out2. Checks expect the glc form of flat_atomic_swap with a v2 destination, then buffer_wbinvl1_vol and flat_store_dword. 4105define amdgpu_kernel void @atomic_xchg_i32_ret(ptr %out, ptr %out2, i32 %in) { 4106; GCN1-LABEL: atomic_xchg_i32_ret: 4107; GCN1: ; %bb.0: ; %entry 4108; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 4109; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd 4110; GCN1-NEXT: s_waitcnt lgkmcnt(0) 4111; GCN1-NEXT: v_mov_b32_e32 v0, s0 4112; GCN1-NEXT: v_mov_b32_e32 v1, s1 4113; GCN1-NEXT: v_mov_b32_e32 v2, s4 4114; GCN1-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 4115; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4116; GCN1-NEXT: buffer_wbinvl1_vol 4117; GCN1-NEXT: v_mov_b32_e32 v0, s2 4118; GCN1-NEXT: v_mov_b32_e32 v1, s3 4119; GCN1-NEXT: flat_store_dword v[0:1], v2 4120; GCN1-NEXT: s_endpgm 4121; 4122; GCN2-LABEL: atomic_xchg_i32_ret: 4123; GCN2: ; %bb.0: ; %entry 4124; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 4125; GCN2-NEXT: s_load_dword s4, s[4:5], 0x34 4126; GCN2-NEXT: s_waitcnt lgkmcnt(0) 4127; GCN2-NEXT: v_mov_b32_e32 v0, s0 4128; GCN2-NEXT: v_mov_b32_e32 v1, s1 4129; GCN2-NEXT: v_mov_b32_e32 v2, s4 4130; GCN2-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 4131; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4132; GCN2-NEXT: buffer_wbinvl1_vol 4133; GCN2-NEXT: v_mov_b32_e32 v0, s2 4134; GCN2-NEXT: v_mov_b32_e32 v1, s3 4135; GCN2-NEXT: flat_store_dword v[0:1], v2 4136; GCN2-NEXT: s_endpgm 4137; 4138; GCN3-LABEL: atomic_xchg_i32_ret: 4139; GCN3: ; %bb.0: ; %entry 4140; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 4141; GCN3-NEXT: s_load_dword s6, s[4:5], 0x34 4142; GCN3-NEXT: s_waitcnt lgkmcnt(0) 4143; GCN3-NEXT: v_mov_b32_e32 v0, s0 4144; GCN3-NEXT: v_mov_b32_e32 v1, s1 4145; GCN3-NEXT: v_mov_b32_e32 v2, s6 4146; GCN3-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 4147; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4148; GCN3-NEXT: buffer_wbinvl1_vol 4149; GCN3-NEXT: v_mov_b32_e32 v0, s2 4150; GCN3-NEXT: v_mov_b32_e32 v1, s3 4151; GCN3-NEXT: flat_store_dword v[0:1], v2 4152; GCN3-NEXT: 
s_endpgm 4153entry: 4154 %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") seq_cst 4155 store i32 %val, ptr %out2 4156 ret void 4157} 4158 ; Volatile agent-scope seq_cst atomicrmw xchg, result unused, at %out + %index x i32 with no constant offset. Checks expect the i64 index scaled by s_lshl_b64 and added with s_add_u32/s_addc_u32, then the no-return flat_atomic_swap without an offset field. 4159define amdgpu_kernel void @atomic_xchg_i32_addr64(ptr %out, i32 %in, i64 %index) { 4160; GCN1-LABEL: atomic_xchg_i32_addr64: 4161; GCN1: ; %bb.0: ; %entry 4162; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 4163; GCN1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9 4164; GCN1-NEXT: s_load_dword s4, s[4:5], 0xb 4165; GCN1-NEXT: s_waitcnt lgkmcnt(0) 4166; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 4167; GCN1-NEXT: s_add_u32 s0, s2, s0 4168; GCN1-NEXT: s_addc_u32 s1, s3, s1 4169; GCN1-NEXT: v_mov_b32_e32 v0, s0 4170; GCN1-NEXT: v_mov_b32_e32 v1, s1 4171; GCN1-NEXT: v_mov_b32_e32 v2, s4 4172; GCN1-NEXT: flat_atomic_swap v[0:1], v2 4173; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4174; GCN1-NEXT: buffer_wbinvl1_vol 4175; GCN1-NEXT: s_endpgm 4176; 4177; GCN2-LABEL: atomic_xchg_i32_addr64: 4178; GCN2: ; %bb.0: ; %entry 4179; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 4180; GCN2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 4181; GCN2-NEXT: s_load_dword s4, s[4:5], 0x2c 4182; GCN2-NEXT: s_waitcnt lgkmcnt(0) 4183; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 4184; GCN2-NEXT: s_add_u32 s0, s2, s0 4185; GCN2-NEXT: s_addc_u32 s1, s3, s1 4186; GCN2-NEXT: v_mov_b32_e32 v0, s0 4187; GCN2-NEXT: v_mov_b32_e32 v1, s1 4188; GCN2-NEXT: v_mov_b32_e32 v2, s4 4189; GCN2-NEXT: flat_atomic_swap v[0:1], v2 4190; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4191; GCN2-NEXT: buffer_wbinvl1_vol 4192; GCN2-NEXT: s_endpgm 4193; 4194; GCN3-LABEL: atomic_xchg_i32_addr64: 4195; GCN3: ; %bb.0: ; %entry 4196; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 4197; GCN3-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 4198; GCN3-NEXT: s_load_dword s6, s[4:5], 0x2c 4199; GCN3-NEXT: s_waitcnt lgkmcnt(0) 4200; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 4201; GCN3-NEXT: s_add_u32 s0, s2, s0 4202; GCN3-NEXT: s_addc_u32 s1, s3, s1 4203; GCN3-NEXT: v_mov_b32_e32 v0, s0 4204; 
GCN3-NEXT: v_mov_b32_e32 v1, s1 4205; GCN3-NEXT: v_mov_b32_e32 v2, s6 4206; GCN3-NEXT: flat_atomic_swap v[0:1], v2 4207; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4208; GCN3-NEXT: buffer_wbinvl1_vol 4209; GCN3-NEXT: s_endpgm 4210entry: 4211 %ptr = getelementptr i32, ptr %out, i64 %index 4212 %val = atomicrmw volatile xchg ptr %ptr, i32 %in syncscope("agent") seq_cst 4213 ret void 4214} 4215 ; Volatile agent-scope seq_cst atomicrmw xchg at %out + %index x i32 with no constant offset; the old value is stored to %out2. Checks expect the index scaled by s_lshl_b64, the glc form of flat_atomic_swap with a v2 destination, then buffer_wbinvl1_vol and flat_store_dword. 4216define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { 4217; GCN1-LABEL: atomic_xchg_i32_ret_addr64: 4218; GCN1: ; %bb.0: ; %entry 4219; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xf 4220; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 4221; GCN1-NEXT: s_load_dword s8, s[4:5], 0xd 4222; GCN1-NEXT: s_waitcnt lgkmcnt(0) 4223; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 4224; GCN1-NEXT: s_add_u32 s0, s0, s4 4225; GCN1-NEXT: s_addc_u32 s1, s1, s5 4226; GCN1-NEXT: v_mov_b32_e32 v0, s0 4227; GCN1-NEXT: v_mov_b32_e32 v1, s1 4228; GCN1-NEXT: v_mov_b32_e32 v2, s8 4229; GCN1-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 4230; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4231; GCN1-NEXT: buffer_wbinvl1_vol 4232; GCN1-NEXT: v_mov_b32_e32 v0, s2 4233; GCN1-NEXT: v_mov_b32_e32 v1, s3 4234; GCN1-NEXT: flat_store_dword v[0:1], v2 4235; GCN1-NEXT: s_endpgm 4236; 4237; GCN2-LABEL: atomic_xchg_i32_ret_addr64: 4238; GCN2: ; %bb.0: ; %entry 4239; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c 4240; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 4241; GCN2-NEXT: s_load_dword s8, s[4:5], 0x34 4242; GCN2-NEXT: s_waitcnt lgkmcnt(0) 4243; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 4244; GCN2-NEXT: s_add_u32 s0, s0, s4 4245; GCN2-NEXT: s_addc_u32 s1, s1, s5 4246; GCN2-NEXT: v_mov_b32_e32 v0, s0 4247; GCN2-NEXT: v_mov_b32_e32 v1, s1 4248; GCN2-NEXT: v_mov_b32_e32 v2, s8 4249; GCN2-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 4250; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4251; GCN2-NEXT: buffer_wbinvl1_vol 4252; GCN2-NEXT: v_mov_b32_e32 v0, s2 4253; GCN2-NEXT: v_mov_b32_e32 v1, 
s3 4254; GCN2-NEXT: flat_store_dword v[0:1], v2 4255; GCN2-NEXT: s_endpgm 4256; 4257; GCN3-LABEL: atomic_xchg_i32_ret_addr64: 4258; GCN3: ; %bb.0: ; %entry 4259; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c 4260; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 4261; GCN3-NEXT: s_load_dword s8, s[4:5], 0x34 4262; GCN3-NEXT: s_waitcnt lgkmcnt(0) 4263; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 4264; GCN3-NEXT: s_add_u32 s0, s0, s4 4265; GCN3-NEXT: s_addc_u32 s1, s1, s5 4266; GCN3-NEXT: v_mov_b32_e32 v0, s0 4267; GCN3-NEXT: v_mov_b32_e32 v1, s1 4268; GCN3-NEXT: v_mov_b32_e32 v2, s8 4269; GCN3-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 4270; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4271; GCN3-NEXT: buffer_wbinvl1_vol 4272; GCN3-NEXT: v_mov_b32_e32 v0, s2 4273; GCN3-NEXT: v_mov_b32_e32 v1, s3 4274; GCN3-NEXT: flat_store_dword v[0:1], v2 4275; GCN3-NEXT: s_endpgm 4276entry: 4277 %ptr = getelementptr i32, ptr %out, i64 %index 4278 %val = atomicrmw volatile xchg ptr %ptr, i32 %in syncscope("agent") seq_cst 4279 store i32 %val, ptr %out2 4280 ret void 4281} 4282 4283; CMP_SWAP 4284 4285define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr %out, i32 %in, i32 %old) { 4286; GCN1-LABEL: atomic_cmpxchg_i32_offset: 4287; GCN1: ; %bb.0: ; %entry 4288; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 4289; GCN1-NEXT: s_waitcnt lgkmcnt(0) 4290; GCN1-NEXT: s_add_u32 s0, s0, 16 4291; GCN1-NEXT: s_addc_u32 s1, s1, 0 4292; GCN1-NEXT: v_mov_b32_e32 v0, s0 4293; GCN1-NEXT: v_mov_b32_e32 v2, s2 4294; GCN1-NEXT: v_mov_b32_e32 v1, s1 4295; GCN1-NEXT: v_mov_b32_e32 v3, s3 4296; GCN1-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4297; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4298; GCN1-NEXT: buffer_wbinvl1_vol 4299; GCN1-NEXT: s_endpgm 4300; 4301; GCN2-LABEL: atomic_cmpxchg_i32_offset: 4302; GCN2: ; %bb.0: ; %entry 4303; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 4304; GCN2-NEXT: s_waitcnt lgkmcnt(0) 4305; GCN2-NEXT: s_add_u32 s0, s0, 16 4306; GCN2-NEXT: s_addc_u32 s1, s1, 0 4307; GCN2-NEXT: 
v_mov_b32_e32 v0, s0 4308; GCN2-NEXT: v_mov_b32_e32 v2, s2 4309; GCN2-NEXT: v_mov_b32_e32 v1, s1 4310; GCN2-NEXT: v_mov_b32_e32 v3, s3 4311; GCN2-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4312; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4313; GCN2-NEXT: buffer_wbinvl1_vol 4314; GCN2-NEXT: s_endpgm 4315; 4316; GCN3-LABEL: atomic_cmpxchg_i32_offset: 4317; GCN3: ; %bb.0: ; %entry 4318; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 4319; GCN3-NEXT: s_waitcnt lgkmcnt(0) 4320; GCN3-NEXT: v_mov_b32_e32 v0, s0 4321; GCN3-NEXT: v_mov_b32_e32 v2, s2 4322; GCN3-NEXT: v_mov_b32_e32 v1, s1 4323; GCN3-NEXT: v_mov_b32_e32 v3, s3 4324; GCN3-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 4325; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4326; GCN3-NEXT: buffer_wbinvl1_vol 4327; GCN3-NEXT: s_endpgm 4328entry: 4329 %gep = getelementptr i32, ptr %out, i32 4 4330 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst 4331 ret void 4332} 4333 4334define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(ptr %out, ptr %out2, i32 %in, i32 %old) { 4335; GCN1-LABEL: atomic_cmpxchg_i32_ret_offset: 4336; GCN1: ; %bb.0: ; %entry 4337; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 4338; GCN1-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 4339; GCN1-NEXT: s_waitcnt lgkmcnt(0) 4340; GCN1-NEXT: s_add_u32 s0, s0, 16 4341; GCN1-NEXT: s_addc_u32 s1, s1, 0 4342; GCN1-NEXT: v_mov_b32_e32 v0, s0 4343; GCN1-NEXT: v_mov_b32_e32 v2, s4 4344; GCN1-NEXT: v_mov_b32_e32 v1, s1 4345; GCN1-NEXT: v_mov_b32_e32 v3, s5 4346; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4347; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4348; GCN1-NEXT: buffer_wbinvl1_vol 4349; GCN1-NEXT: v_mov_b32_e32 v0, s2 4350; GCN1-NEXT: v_mov_b32_e32 v1, s3 4351; GCN1-NEXT: flat_store_dword v[0:1], v2 4352; GCN1-NEXT: s_endpgm 4353; 4354; GCN2-LABEL: atomic_cmpxchg_i32_ret_offset: 4355; GCN2: ; %bb.0: ; %entry 4356; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 4357; GCN2-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 
4358; GCN2-NEXT: s_waitcnt lgkmcnt(0) 4359; GCN2-NEXT: s_add_u32 s0, s0, 16 4360; GCN2-NEXT: s_addc_u32 s1, s1, 0 4361; GCN2-NEXT: v_mov_b32_e32 v0, s0 4362; GCN2-NEXT: v_mov_b32_e32 v2, s4 4363; GCN2-NEXT: v_mov_b32_e32 v1, s1 4364; GCN2-NEXT: v_mov_b32_e32 v3, s5 4365; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4366; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4367; GCN2-NEXT: buffer_wbinvl1_vol 4368; GCN2-NEXT: v_mov_b32_e32 v0, s2 4369; GCN2-NEXT: v_mov_b32_e32 v1, s3 4370; GCN2-NEXT: flat_store_dword v[0:1], v2 4371; GCN2-NEXT: s_endpgm 4372; 4373; GCN3-LABEL: atomic_cmpxchg_i32_ret_offset: 4374; GCN3: ; %bb.0: ; %entry 4375; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 4376; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 4377; GCN3-NEXT: s_waitcnt lgkmcnt(0) 4378; GCN3-NEXT: v_mov_b32_e32 v0, s0 4379; GCN3-NEXT: v_mov_b32_e32 v2, s6 4380; GCN3-NEXT: v_mov_b32_e32 v1, s1 4381; GCN3-NEXT: v_mov_b32_e32 v3, s7 4382; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 4383; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4384; GCN3-NEXT: buffer_wbinvl1_vol 4385; GCN3-NEXT: v_mov_b32_e32 v0, s2 4386; GCN3-NEXT: v_mov_b32_e32 v1, s3 4387; GCN3-NEXT: flat_store_dword v[0:1], v2 4388; GCN3-NEXT: s_endpgm 4389entry: 4390 %gep = getelementptr i32, ptr %out, i32 4 4391 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst 4392 %flag = extractvalue { i32, i1 } %val, 0 4393 store i32 %flag, ptr %out2 4394 ret void 4395} 4396 4397define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr %out, i32 %in, i64 %index, i32 %old) { 4398; GCN1-LABEL: atomic_cmpxchg_i32_addr64_offset: 4399; GCN1: ; %bb.0: ; %entry 4400; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 4401; GCN1-NEXT: s_load_dword s6, s[4:5], 0xb 4402; GCN1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9 4403; GCN1-NEXT: s_load_dword s4, s[4:5], 0xf 4404; GCN1-NEXT: s_waitcnt lgkmcnt(0) 4405; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 4406; GCN1-NEXT: v_mov_b32_e32 v0, s6 
4407; GCN1-NEXT: s_add_u32 s0, s2, s0 4408; GCN1-NEXT: s_addc_u32 s1, s3, s1 4409; GCN1-NEXT: s_add_u32 s0, s0, 16 4410; GCN1-NEXT: s_addc_u32 s1, s1, 0 4411; GCN1-NEXT: v_mov_b32_e32 v3, s1 4412; GCN1-NEXT: v_mov_b32_e32 v1, s4 4413; GCN1-NEXT: v_mov_b32_e32 v2, s0 4414; GCN1-NEXT: flat_atomic_cmpswap v[2:3], v[0:1] 4415; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4416; GCN1-NEXT: buffer_wbinvl1_vol 4417; GCN1-NEXT: s_endpgm 4418; 4419; GCN2-LABEL: atomic_cmpxchg_i32_addr64_offset: 4420; GCN2: ; %bb.0: ; %entry 4421; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 4422; GCN2-NEXT: s_load_dword s6, s[4:5], 0x2c 4423; GCN2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 4424; GCN2-NEXT: s_load_dword s4, s[4:5], 0x3c 4425; GCN2-NEXT: s_waitcnt lgkmcnt(0) 4426; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 4427; GCN2-NEXT: v_mov_b32_e32 v0, s6 4428; GCN2-NEXT: s_add_u32 s0, s2, s0 4429; GCN2-NEXT: s_addc_u32 s1, s3, s1 4430; GCN2-NEXT: s_add_u32 s0, s0, 16 4431; GCN2-NEXT: s_addc_u32 s1, s1, 0 4432; GCN2-NEXT: v_mov_b32_e32 v3, s1 4433; GCN2-NEXT: v_mov_b32_e32 v1, s4 4434; GCN2-NEXT: v_mov_b32_e32 v2, s0 4435; GCN2-NEXT: flat_atomic_cmpswap v[2:3], v[0:1] 4436; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4437; GCN2-NEXT: buffer_wbinvl1_vol 4438; GCN2-NEXT: s_endpgm 4439; 4440; GCN3-LABEL: atomic_cmpxchg_i32_addr64_offset: 4441; GCN3: ; %bb.0: ; %entry 4442; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 4443; GCN3-NEXT: s_load_dword s6, s[4:5], 0x2c 4444; GCN3-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 4445; GCN3-NEXT: s_load_dword s7, s[4:5], 0x3c 4446; GCN3-NEXT: s_waitcnt lgkmcnt(0) 4447; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 4448; GCN3-NEXT: v_mov_b32_e32 v0, s6 4449; GCN3-NEXT: s_add_u32 s0, s2, s0 4450; GCN3-NEXT: s_addc_u32 s1, s3, s1 4451; GCN3-NEXT: v_mov_b32_e32 v3, s1 4452; GCN3-NEXT: v_mov_b32_e32 v1, s7 4453; GCN3-NEXT: v_mov_b32_e32 v2, s0 4454; GCN3-NEXT: flat_atomic_cmpswap v[2:3], v[0:1] offset:16 4455; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4456; GCN3-NEXT: 
buffer_wbinvl1_vol 4457; GCN3-NEXT: s_endpgm 4458entry: 4459 %ptr = getelementptr i32, ptr %out, i64 %index 4460 %gep = getelementptr i32, ptr %ptr, i32 4 4461 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst 4462 ret void 4463} 4464 4465define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index, i32 %old) { 4466; GCN1-LABEL: atomic_cmpxchg_i32_ret_addr64_offset: 4467; GCN1: ; %bb.0: ; %entry 4468; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xf 4469; GCN1-NEXT: s_load_dword s8, s[4:5], 0xd 4470; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 4471; GCN1-NEXT: s_load_dword s9, s[4:5], 0x11 4472; GCN1-NEXT: s_waitcnt lgkmcnt(0) 4473; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 4474; GCN1-NEXT: v_mov_b32_e32 v0, s8 4475; GCN1-NEXT: s_add_u32 s0, s0, s4 4476; GCN1-NEXT: s_addc_u32 s1, s1, s5 4477; GCN1-NEXT: s_add_u32 s0, s0, 16 4478; GCN1-NEXT: s_addc_u32 s1, s1, 0 4479; GCN1-NEXT: v_mov_b32_e32 v3, s1 4480; GCN1-NEXT: v_mov_b32_e32 v1, s9 4481; GCN1-NEXT: v_mov_b32_e32 v2, s0 4482; GCN1-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] glc 4483; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4484; GCN1-NEXT: buffer_wbinvl1_vol 4485; GCN1-NEXT: v_mov_b32_e32 v0, s2 4486; GCN1-NEXT: v_mov_b32_e32 v1, s3 4487; GCN1-NEXT: flat_store_dword v[0:1], v2 4488; GCN1-NEXT: s_endpgm 4489; 4490; GCN2-LABEL: atomic_cmpxchg_i32_ret_addr64_offset: 4491; GCN2: ; %bb.0: ; %entry 4492; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c 4493; GCN2-NEXT: s_load_dword s8, s[4:5], 0x34 4494; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 4495; GCN2-NEXT: s_load_dword s9, s[4:5], 0x44 4496; GCN2-NEXT: s_waitcnt lgkmcnt(0) 4497; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 4498; GCN2-NEXT: v_mov_b32_e32 v0, s8 4499; GCN2-NEXT: s_add_u32 s0, s0, s4 4500; GCN2-NEXT: s_addc_u32 s1, s1, s5 4501; GCN2-NEXT: s_add_u32 s0, s0, 16 4502; GCN2-NEXT: s_addc_u32 s1, s1, 0 4503; GCN2-NEXT: v_mov_b32_e32 v3, s1 4504; GCN2-NEXT: v_mov_b32_e32 v1, s9 
4505; GCN2-NEXT: v_mov_b32_e32 v2, s0 4506; GCN2-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] glc 4507; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4508; GCN2-NEXT: buffer_wbinvl1_vol 4509; GCN2-NEXT: v_mov_b32_e32 v0, s2 4510; GCN2-NEXT: v_mov_b32_e32 v1, s3 4511; GCN2-NEXT: flat_store_dword v[0:1], v2 4512; GCN2-NEXT: s_endpgm 4513; 4514; GCN3-LABEL: atomic_cmpxchg_i32_ret_addr64_offset: 4515; GCN3: ; %bb.0: ; %entry 4516; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c 4517; GCN3-NEXT: s_load_dword s8, s[4:5], 0x34 4518; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 4519; GCN3-NEXT: s_load_dword s9, s[4:5], 0x44 4520; GCN3-NEXT: s_waitcnt lgkmcnt(0) 4521; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 4522; GCN3-NEXT: v_mov_b32_e32 v0, s8 4523; GCN3-NEXT: s_add_u32 s0, s0, s4 4524; GCN3-NEXT: s_addc_u32 s1, s1, s5 4525; GCN3-NEXT: v_mov_b32_e32 v3, s1 4526; GCN3-NEXT: v_mov_b32_e32 v1, s9 4527; GCN3-NEXT: v_mov_b32_e32 v2, s0 4528; GCN3-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] offset:16 glc 4529; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4530; GCN3-NEXT: buffer_wbinvl1_vol 4531; GCN3-NEXT: v_mov_b32_e32 v0, s2 4532; GCN3-NEXT: v_mov_b32_e32 v1, s3 4533; GCN3-NEXT: flat_store_dword v[0:1], v2 4534; GCN3-NEXT: s_endpgm 4535entry: 4536 %ptr = getelementptr i32, ptr %out, i64 %index 4537 %gep = getelementptr i32, ptr %ptr, i32 4 4538 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst 4539 %flag = extractvalue { i32, i1 } %val, 0 4540 store i32 %flag, ptr %out2 4541 ret void 4542} 4543 4544define amdgpu_kernel void @atomic_cmpxchg_i32(ptr %out, i32 %in, i32 %old) { 4545; GCN1-LABEL: atomic_cmpxchg_i32: 4546; GCN1: ; %bb.0: ; %entry 4547; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 4548; GCN1-NEXT: s_waitcnt lgkmcnt(0) 4549; GCN1-NEXT: v_mov_b32_e32 v0, s0 4550; GCN1-NEXT: v_mov_b32_e32 v2, s2 4551; GCN1-NEXT: v_mov_b32_e32 v1, s1 4552; GCN1-NEXT: v_mov_b32_e32 v3, s3 4553; GCN1-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4554; GCN1-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) 4555; GCN1-NEXT: buffer_wbinvl1_vol 4556; GCN1-NEXT: s_endpgm 4557; 4558; GCN2-LABEL: atomic_cmpxchg_i32: 4559; GCN2: ; %bb.0: ; %entry 4560; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 4561; GCN2-NEXT: s_waitcnt lgkmcnt(0) 4562; GCN2-NEXT: v_mov_b32_e32 v0, s0 4563; GCN2-NEXT: v_mov_b32_e32 v2, s2 4564; GCN2-NEXT: v_mov_b32_e32 v1, s1 4565; GCN2-NEXT: v_mov_b32_e32 v3, s3 4566; GCN2-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4567; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4568; GCN2-NEXT: buffer_wbinvl1_vol 4569; GCN2-NEXT: s_endpgm 4570; 4571; GCN3-LABEL: atomic_cmpxchg_i32: 4572; GCN3: ; %bb.0: ; %entry 4573; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 4574; GCN3-NEXT: s_waitcnt lgkmcnt(0) 4575; GCN3-NEXT: v_mov_b32_e32 v0, s0 4576; GCN3-NEXT: v_mov_b32_e32 v2, s2 4577; GCN3-NEXT: v_mov_b32_e32 v1, s1 4578; GCN3-NEXT: v_mov_b32_e32 v3, s3 4579; GCN3-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4580; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4581; GCN3-NEXT: buffer_wbinvl1_vol 4582; GCN3-NEXT: s_endpgm 4583entry: 4584 %val = cmpxchg volatile ptr %out, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst 4585 ret void 4586} 4587 4588define amdgpu_kernel void @atomic_cmpxchg_i32_ret(ptr %out, ptr %out2, i32 %in, i32 %old) { 4589; GCN1-LABEL: atomic_cmpxchg_i32_ret: 4590; GCN1: ; %bb.0: ; %entry 4591; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 4592; GCN1-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 4593; GCN1-NEXT: s_waitcnt lgkmcnt(0) 4594; GCN1-NEXT: v_mov_b32_e32 v0, s0 4595; GCN1-NEXT: v_mov_b32_e32 v2, s4 4596; GCN1-NEXT: v_mov_b32_e32 v1, s1 4597; GCN1-NEXT: v_mov_b32_e32 v3, s5 4598; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4599; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4600; GCN1-NEXT: buffer_wbinvl1_vol 4601; GCN1-NEXT: v_mov_b32_e32 v0, s2 4602; GCN1-NEXT: v_mov_b32_e32 v1, s3 4603; GCN1-NEXT: flat_store_dword v[0:1], v2 4604; GCN1-NEXT: s_endpgm 4605; 4606; GCN2-LABEL: atomic_cmpxchg_i32_ret: 4607; GCN2: ; %bb.0: ; 
%entry 4608; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 4609; GCN2-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 4610; GCN2-NEXT: s_waitcnt lgkmcnt(0) 4611; GCN2-NEXT: v_mov_b32_e32 v0, s0 4612; GCN2-NEXT: v_mov_b32_e32 v2, s4 4613; GCN2-NEXT: v_mov_b32_e32 v1, s1 4614; GCN2-NEXT: v_mov_b32_e32 v3, s5 4615; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4616; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4617; GCN2-NEXT: buffer_wbinvl1_vol 4618; GCN2-NEXT: v_mov_b32_e32 v0, s2 4619; GCN2-NEXT: v_mov_b32_e32 v1, s3 4620; GCN2-NEXT: flat_store_dword v[0:1], v2 4621; GCN2-NEXT: s_endpgm 4622; 4623; GCN3-LABEL: atomic_cmpxchg_i32_ret: 4624; GCN3: ; %bb.0: ; %entry 4625; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 4626; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 4627; GCN3-NEXT: s_waitcnt lgkmcnt(0) 4628; GCN3-NEXT: v_mov_b32_e32 v0, s0 4629; GCN3-NEXT: v_mov_b32_e32 v2, s6 4630; GCN3-NEXT: v_mov_b32_e32 v1, s1 4631; GCN3-NEXT: v_mov_b32_e32 v3, s7 4632; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4633; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4634; GCN3-NEXT: buffer_wbinvl1_vol 4635; GCN3-NEXT: v_mov_b32_e32 v0, s2 4636; GCN3-NEXT: v_mov_b32_e32 v1, s3 4637; GCN3-NEXT: flat_store_dword v[0:1], v2 4638; GCN3-NEXT: s_endpgm 4639entry: 4640 %val = cmpxchg volatile ptr %out, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst 4641 %flag = extractvalue { i32, i1 } %val, 0 4642 store i32 %flag, ptr %out2 4643 ret void 4644} 4645 4646define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr %out, i32 %in, i64 %index, i32 %old) { 4647; GCN1-LABEL: atomic_cmpxchg_i32_addr64: 4648; GCN1: ; %bb.0: ; %entry 4649; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 4650; GCN1-NEXT: s_load_dword s6, s[4:5], 0xb 4651; GCN1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9 4652; GCN1-NEXT: s_load_dword s4, s[4:5], 0xf 4653; GCN1-NEXT: s_waitcnt lgkmcnt(0) 4654; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 4655; GCN1-NEXT: v_mov_b32_e32 v0, s6 4656; GCN1-NEXT: s_add_u32 s0, s2, s0 4657; 
GCN1-NEXT: s_addc_u32 s1, s3, s1 4658; GCN1-NEXT: v_mov_b32_e32 v3, s1 4659; GCN1-NEXT: v_mov_b32_e32 v1, s4 4660; GCN1-NEXT: v_mov_b32_e32 v2, s0 4661; GCN1-NEXT: flat_atomic_cmpswap v[2:3], v[0:1] 4662; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4663; GCN1-NEXT: buffer_wbinvl1_vol 4664; GCN1-NEXT: s_endpgm 4665; 4666; GCN2-LABEL: atomic_cmpxchg_i32_addr64: 4667; GCN2: ; %bb.0: ; %entry 4668; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 4669; GCN2-NEXT: s_load_dword s6, s[4:5], 0x2c 4670; GCN2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 4671; GCN2-NEXT: s_load_dword s4, s[4:5], 0x3c 4672; GCN2-NEXT: s_waitcnt lgkmcnt(0) 4673; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 4674; GCN2-NEXT: v_mov_b32_e32 v0, s6 4675; GCN2-NEXT: s_add_u32 s0, s2, s0 4676; GCN2-NEXT: s_addc_u32 s1, s3, s1 4677; GCN2-NEXT: v_mov_b32_e32 v3, s1 4678; GCN2-NEXT: v_mov_b32_e32 v1, s4 4679; GCN2-NEXT: v_mov_b32_e32 v2, s0 4680; GCN2-NEXT: flat_atomic_cmpswap v[2:3], v[0:1] 4681; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4682; GCN2-NEXT: buffer_wbinvl1_vol 4683; GCN2-NEXT: s_endpgm 4684; 4685; GCN3-LABEL: atomic_cmpxchg_i32_addr64: 4686; GCN3: ; %bb.0: ; %entry 4687; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 4688; GCN3-NEXT: s_load_dword s6, s[4:5], 0x2c 4689; GCN3-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 4690; GCN3-NEXT: s_load_dword s7, s[4:5], 0x3c 4691; GCN3-NEXT: s_waitcnt lgkmcnt(0) 4692; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 4693; GCN3-NEXT: v_mov_b32_e32 v0, s6 4694; GCN3-NEXT: s_add_u32 s0, s2, s0 4695; GCN3-NEXT: s_addc_u32 s1, s3, s1 4696; GCN3-NEXT: v_mov_b32_e32 v3, s1 4697; GCN3-NEXT: v_mov_b32_e32 v1, s7 4698; GCN3-NEXT: v_mov_b32_e32 v2, s0 4699; GCN3-NEXT: flat_atomic_cmpswap v[2:3], v[0:1] 4700; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4701; GCN3-NEXT: buffer_wbinvl1_vol 4702; GCN3-NEXT: s_endpgm 4703entry: 4704 %ptr = getelementptr i32, ptr %out, i64 %index 4705 %val = cmpxchg volatile ptr %ptr, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst 4706 ret void 4707} 4708 
4709define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index, i32 %old) { 4710; GCN1-LABEL: atomic_cmpxchg_i32_ret_addr64: 4711; GCN1: ; %bb.0: ; %entry 4712; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xf 4713; GCN1-NEXT: s_load_dword s8, s[4:5], 0xd 4714; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 4715; GCN1-NEXT: s_load_dword s9, s[4:5], 0x11 4716; GCN1-NEXT: s_waitcnt lgkmcnt(0) 4717; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 4718; GCN1-NEXT: v_mov_b32_e32 v0, s8 4719; GCN1-NEXT: s_add_u32 s0, s0, s4 4720; GCN1-NEXT: s_addc_u32 s1, s1, s5 4721; GCN1-NEXT: v_mov_b32_e32 v3, s1 4722; GCN1-NEXT: v_mov_b32_e32 v1, s9 4723; GCN1-NEXT: v_mov_b32_e32 v2, s0 4724; GCN1-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] glc 4725; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4726; GCN1-NEXT: buffer_wbinvl1_vol 4727; GCN1-NEXT: v_mov_b32_e32 v0, s2 4728; GCN1-NEXT: v_mov_b32_e32 v1, s3 4729; GCN1-NEXT: flat_store_dword v[0:1], v2 4730; GCN1-NEXT: s_endpgm 4731; 4732; GCN2-LABEL: atomic_cmpxchg_i32_ret_addr64: 4733; GCN2: ; %bb.0: ; %entry 4734; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c 4735; GCN2-NEXT: s_load_dword s8, s[4:5], 0x34 4736; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 4737; GCN2-NEXT: s_load_dword s9, s[4:5], 0x44 4738; GCN2-NEXT: s_waitcnt lgkmcnt(0) 4739; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 4740; GCN2-NEXT: v_mov_b32_e32 v0, s8 4741; GCN2-NEXT: s_add_u32 s0, s0, s4 4742; GCN2-NEXT: s_addc_u32 s1, s1, s5 4743; GCN2-NEXT: v_mov_b32_e32 v3, s1 4744; GCN2-NEXT: v_mov_b32_e32 v1, s9 4745; GCN2-NEXT: v_mov_b32_e32 v2, s0 4746; GCN2-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] glc 4747; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4748; GCN2-NEXT: buffer_wbinvl1_vol 4749; GCN2-NEXT: v_mov_b32_e32 v0, s2 4750; GCN2-NEXT: v_mov_b32_e32 v1, s3 4751; GCN2-NEXT: flat_store_dword v[0:1], v2 4752; GCN2-NEXT: s_endpgm 4753; 4754; GCN3-LABEL: atomic_cmpxchg_i32_ret_addr64: 4755; GCN3: ; %bb.0: ; %entry 4756; GCN3-NEXT: s_load_dwordx2 
s[6:7], s[4:5], 0x3c 4757; GCN3-NEXT: s_load_dword s8, s[4:5], 0x34 4758; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 4759; GCN3-NEXT: s_load_dword s9, s[4:5], 0x44 4760; GCN3-NEXT: s_waitcnt lgkmcnt(0) 4761; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 4762; GCN3-NEXT: v_mov_b32_e32 v0, s8 4763; GCN3-NEXT: s_add_u32 s0, s0, s4 4764; GCN3-NEXT: s_addc_u32 s1, s1, s5 4765; GCN3-NEXT: v_mov_b32_e32 v3, s1 4766; GCN3-NEXT: v_mov_b32_e32 v1, s9 4767; GCN3-NEXT: v_mov_b32_e32 v2, s0 4768; GCN3-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] glc 4769; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4770; GCN3-NEXT: buffer_wbinvl1_vol 4771; GCN3-NEXT: v_mov_b32_e32 v0, s2 4772; GCN3-NEXT: v_mov_b32_e32 v1, s3 4773; GCN3-NEXT: flat_store_dword v[0:1], v2 4774; GCN3-NEXT: s_endpgm 4775entry: 4776 %ptr = getelementptr i32, ptr %out, i64 %index 4777 %val = cmpxchg volatile ptr %ptr, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst 4778 %flag = extractvalue { i32, i1 } %val, 0 4779 store i32 %flag, ptr %out2 4780 ret void 4781} 4782 4783define amdgpu_kernel void @atomic_xor_i32_offset(ptr %out, i32 %in) { 4784; GCN1-LABEL: atomic_xor_i32_offset: 4785; GCN1: ; %bb.0: ; %entry 4786; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 4787; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb 4788; GCN1-NEXT: s_waitcnt lgkmcnt(0) 4789; GCN1-NEXT: s_add_u32 s0, s0, 16 4790; GCN1-NEXT: s_addc_u32 s1, s1, 0 4791; GCN1-NEXT: v_mov_b32_e32 v0, s0 4792; GCN1-NEXT: v_mov_b32_e32 v1, s1 4793; GCN1-NEXT: v_mov_b32_e32 v2, s2 4794; GCN1-NEXT: flat_atomic_xor v[0:1], v2 4795; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4796; GCN1-NEXT: buffer_wbinvl1_vol 4797; GCN1-NEXT: s_endpgm 4798; 4799; GCN2-LABEL: atomic_xor_i32_offset: 4800; GCN2: ; %bb.0: ; %entry 4801; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 4802; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c 4803; GCN2-NEXT: s_waitcnt lgkmcnt(0) 4804; GCN2-NEXT: s_add_u32 s0, s0, 16 4805; GCN2-NEXT: s_addc_u32 s1, s1, 0 4806; GCN2-NEXT: v_mov_b32_e32 v0, s0 4807; GCN2-NEXT: 
v_mov_b32_e32 v1, s1 4808; GCN2-NEXT: v_mov_b32_e32 v2, s2 4809; GCN2-NEXT: flat_atomic_xor v[0:1], v2 4810; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4811; GCN2-NEXT: buffer_wbinvl1_vol 4812; GCN2-NEXT: s_endpgm 4813; 4814; GCN3-LABEL: atomic_xor_i32_offset: 4815; GCN3: ; %bb.0: ; %entry 4816; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 4817; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c 4818; GCN3-NEXT: s_waitcnt lgkmcnt(0) 4819; GCN3-NEXT: v_mov_b32_e32 v0, s0 4820; GCN3-NEXT: v_mov_b32_e32 v1, s1 4821; GCN3-NEXT: v_mov_b32_e32 v2, s2 4822; GCN3-NEXT: flat_atomic_xor v[0:1], v2 offset:16 4823; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4824; GCN3-NEXT: buffer_wbinvl1_vol 4825; GCN3-NEXT: s_endpgm 4826entry: 4827 %gep = getelementptr i32, ptr %out, i32 4 4828 %val = atomicrmw volatile xor ptr %gep, i32 %in syncscope("agent") seq_cst 4829 ret void 4830} 4831 4832define amdgpu_kernel void @atomic_xor_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { 4833; GCN1-LABEL: atomic_xor_i32_ret_offset: 4834; GCN1: ; %bb.0: ; %entry 4835; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 4836; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd 4837; GCN1-NEXT: s_waitcnt lgkmcnt(0) 4838; GCN1-NEXT: s_add_u32 s0, s0, 16 4839; GCN1-NEXT: s_addc_u32 s1, s1, 0 4840; GCN1-NEXT: v_mov_b32_e32 v0, s0 4841; GCN1-NEXT: v_mov_b32_e32 v1, s1 4842; GCN1-NEXT: v_mov_b32_e32 v2, s4 4843; GCN1-NEXT: flat_atomic_xor v2, v[0:1], v2 glc 4844; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4845; GCN1-NEXT: buffer_wbinvl1_vol 4846; GCN1-NEXT: v_mov_b32_e32 v0, s2 4847; GCN1-NEXT: v_mov_b32_e32 v1, s3 4848; GCN1-NEXT: flat_store_dword v[0:1], v2 4849; GCN1-NEXT: s_endpgm 4850; 4851; GCN2-LABEL: atomic_xor_i32_ret_offset: 4852; GCN2: ; %bb.0: ; %entry 4853; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 4854; GCN2-NEXT: s_load_dword s4, s[4:5], 0x34 4855; GCN2-NEXT: s_waitcnt lgkmcnt(0) 4856; GCN2-NEXT: s_add_u32 s0, s0, 16 4857; GCN2-NEXT: s_addc_u32 s1, s1, 0 4858; GCN2-NEXT: v_mov_b32_e32 v0, s0 4859; GCN2-NEXT: 
v_mov_b32_e32 v1, s1 4860; GCN2-NEXT: v_mov_b32_e32 v2, s4 4861; GCN2-NEXT: flat_atomic_xor v2, v[0:1], v2 glc 4862; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4863; GCN2-NEXT: buffer_wbinvl1_vol 4864; GCN2-NEXT: v_mov_b32_e32 v0, s2 4865; GCN2-NEXT: v_mov_b32_e32 v1, s3 4866; GCN2-NEXT: flat_store_dword v[0:1], v2 4867; GCN2-NEXT: s_endpgm 4868; 4869; GCN3-LABEL: atomic_xor_i32_ret_offset: 4870; GCN3: ; %bb.0: ; %entry 4871; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 4872; GCN3-NEXT: s_load_dword s6, s[4:5], 0x34 4873; GCN3-NEXT: s_waitcnt lgkmcnt(0) 4874; GCN3-NEXT: v_mov_b32_e32 v0, s0 4875; GCN3-NEXT: v_mov_b32_e32 v1, s1 4876; GCN3-NEXT: v_mov_b32_e32 v2, s6 4877; GCN3-NEXT: flat_atomic_xor v2, v[0:1], v2 offset:16 glc 4878; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4879; GCN3-NEXT: buffer_wbinvl1_vol 4880; GCN3-NEXT: v_mov_b32_e32 v0, s2 4881; GCN3-NEXT: v_mov_b32_e32 v1, s3 4882; GCN3-NEXT: flat_store_dword v[0:1], v2 4883; GCN3-NEXT: s_endpgm 4884entry: 4885 %gep = getelementptr i32, ptr %out, i32 4 4886 %val = atomicrmw volatile xor ptr %gep, i32 %in syncscope("agent") seq_cst 4887 store i32 %val, ptr %out2 4888 ret void 4889} 4890 4891define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { 4892; GCN1-LABEL: atomic_xor_i32_addr64_offset: 4893; GCN1: ; %bb.0: ; %entry 4894; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 4895; GCN1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9 4896; GCN1-NEXT: s_load_dword s4, s[4:5], 0xb 4897; GCN1-NEXT: s_waitcnt lgkmcnt(0) 4898; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 4899; GCN1-NEXT: s_add_u32 s0, s2, s0 4900; GCN1-NEXT: s_addc_u32 s1, s3, s1 4901; GCN1-NEXT: s_add_u32 s0, s0, 16 4902; GCN1-NEXT: s_addc_u32 s1, s1, 0 4903; GCN1-NEXT: v_mov_b32_e32 v0, s0 4904; GCN1-NEXT: v_mov_b32_e32 v1, s1 4905; GCN1-NEXT: v_mov_b32_e32 v2, s4 4906; GCN1-NEXT: flat_atomic_xor v[0:1], v2 4907; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4908; GCN1-NEXT: buffer_wbinvl1_vol 4909; GCN1-NEXT: s_endpgm 4910; 4911; 
GCN2-LABEL: atomic_xor_i32_addr64_offset: 4912; GCN2: ; %bb.0: ; %entry 4913; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 4914; GCN2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 4915; GCN2-NEXT: s_load_dword s4, s[4:5], 0x2c 4916; GCN2-NEXT: s_waitcnt lgkmcnt(0) 4917; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 4918; GCN2-NEXT: s_add_u32 s0, s2, s0 4919; GCN2-NEXT: s_addc_u32 s1, s3, s1 4920; GCN2-NEXT: s_add_u32 s0, s0, 16 4921; GCN2-NEXT: s_addc_u32 s1, s1, 0 4922; GCN2-NEXT: v_mov_b32_e32 v0, s0 4923; GCN2-NEXT: v_mov_b32_e32 v1, s1 4924; GCN2-NEXT: v_mov_b32_e32 v2, s4 4925; GCN2-NEXT: flat_atomic_xor v[0:1], v2 4926; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4927; GCN2-NEXT: buffer_wbinvl1_vol 4928; GCN2-NEXT: s_endpgm 4929; 4930; GCN3-LABEL: atomic_xor_i32_addr64_offset: 4931; GCN3: ; %bb.0: ; %entry 4932; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 4933; GCN3-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 4934; GCN3-NEXT: s_load_dword s6, s[4:5], 0x2c 4935; GCN3-NEXT: s_waitcnt lgkmcnt(0) 4936; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 4937; GCN3-NEXT: s_add_u32 s0, s2, s0 4938; GCN3-NEXT: s_addc_u32 s1, s3, s1 4939; GCN3-NEXT: v_mov_b32_e32 v0, s0 4940; GCN3-NEXT: v_mov_b32_e32 v1, s1 4941; GCN3-NEXT: v_mov_b32_e32 v2, s6 4942; GCN3-NEXT: flat_atomic_xor v[0:1], v2 offset:16 4943; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4944; GCN3-NEXT: buffer_wbinvl1_vol 4945; GCN3-NEXT: s_endpgm 4946entry: 4947 %ptr = getelementptr i32, ptr %out, i64 %index 4948 %gep = getelementptr i32, ptr %ptr, i32 4 4949 %val = atomicrmw volatile xor ptr %gep, i32 %in syncscope("agent") seq_cst 4950 ret void 4951} 4952 4953define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { 4954; GCN1-LABEL: atomic_xor_i32_ret_addr64_offset: 4955; GCN1: ; %bb.0: ; %entry 4956; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xf 4957; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 4958; GCN1-NEXT: s_load_dword s8, s[4:5], 0xd 4959; GCN1-NEXT: s_waitcnt lgkmcnt(0) 
4960; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 4961; GCN1-NEXT: s_add_u32 s0, s0, s4 4962; GCN1-NEXT: s_addc_u32 s1, s1, s5 4963; GCN1-NEXT: s_add_u32 s0, s0, 16 4964; GCN1-NEXT: s_addc_u32 s1, s1, 0 4965; GCN1-NEXT: v_mov_b32_e32 v0, s0 4966; GCN1-NEXT: v_mov_b32_e32 v1, s1 4967; GCN1-NEXT: v_mov_b32_e32 v2, s8 4968; GCN1-NEXT: flat_atomic_xor v2, v[0:1], v2 glc 4969; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4970; GCN1-NEXT: buffer_wbinvl1_vol 4971; GCN1-NEXT: v_mov_b32_e32 v0, s2 4972; GCN1-NEXT: v_mov_b32_e32 v1, s3 4973; GCN1-NEXT: flat_store_dword v[0:1], v2 4974; GCN1-NEXT: s_endpgm 4975; 4976; GCN2-LABEL: atomic_xor_i32_ret_addr64_offset: 4977; GCN2: ; %bb.0: ; %entry 4978; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c 4979; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 4980; GCN2-NEXT: s_load_dword s8, s[4:5], 0x34 4981; GCN2-NEXT: s_waitcnt lgkmcnt(0) 4982; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 4983; GCN2-NEXT: s_add_u32 s0, s0, s4 4984; GCN2-NEXT: s_addc_u32 s1, s1, s5 4985; GCN2-NEXT: s_add_u32 s0, s0, 16 4986; GCN2-NEXT: s_addc_u32 s1, s1, 0 4987; GCN2-NEXT: v_mov_b32_e32 v0, s0 4988; GCN2-NEXT: v_mov_b32_e32 v1, s1 4989; GCN2-NEXT: v_mov_b32_e32 v2, s8 4990; GCN2-NEXT: flat_atomic_xor v2, v[0:1], v2 glc 4991; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4992; GCN2-NEXT: buffer_wbinvl1_vol 4993; GCN2-NEXT: v_mov_b32_e32 v0, s2 4994; GCN2-NEXT: v_mov_b32_e32 v1, s3 4995; GCN2-NEXT: flat_store_dword v[0:1], v2 4996; GCN2-NEXT: s_endpgm 4997; 4998; GCN3-LABEL: atomic_xor_i32_ret_addr64_offset: 4999; GCN3: ; %bb.0: ; %entry 5000; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c 5001; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 5002; GCN3-NEXT: s_load_dword s8, s[4:5], 0x34 5003; GCN3-NEXT: s_waitcnt lgkmcnt(0) 5004; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 5005; GCN3-NEXT: s_add_u32 s0, s0, s4 5006; GCN3-NEXT: s_addc_u32 s1, s1, s5 5007; GCN3-NEXT: v_mov_b32_e32 v0, s0 5008; GCN3-NEXT: v_mov_b32_e32 v1, s1 5009; GCN3-NEXT: v_mov_b32_e32 v2, s8 5010; 
GCN3-NEXT: flat_atomic_xor v2, v[0:1], v2 offset:16 glc 5011; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5012; GCN3-NEXT: buffer_wbinvl1_vol 5013; GCN3-NEXT: v_mov_b32_e32 v0, s2 5014; GCN3-NEXT: v_mov_b32_e32 v1, s3 5015; GCN3-NEXT: flat_store_dword v[0:1], v2 5016; GCN3-NEXT: s_endpgm 5017entry: 5018 %ptr = getelementptr i32, ptr %out, i64 %index 5019 %gep = getelementptr i32, ptr %ptr, i32 4 5020 %val = atomicrmw volatile xor ptr %gep, i32 %in syncscope("agent") seq_cst 5021 store i32 %val, ptr %out2 5022 ret void 5023} 5024 5025define amdgpu_kernel void @atomic_xor_i32(ptr %out, i32 %in) { 5026; GCN1-LABEL: atomic_xor_i32: 5027; GCN1: ; %bb.0: ; %entry 5028; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 5029; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb 5030; GCN1-NEXT: s_waitcnt lgkmcnt(0) 5031; GCN1-NEXT: v_mov_b32_e32 v0, s0 5032; GCN1-NEXT: v_mov_b32_e32 v1, s1 5033; GCN1-NEXT: v_mov_b32_e32 v2, s2 5034; GCN1-NEXT: flat_atomic_xor v[0:1], v2 5035; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5036; GCN1-NEXT: buffer_wbinvl1_vol 5037; GCN1-NEXT: s_endpgm 5038; 5039; GCN2-LABEL: atomic_xor_i32: 5040; GCN2: ; %bb.0: ; %entry 5041; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 5042; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c 5043; GCN2-NEXT: s_waitcnt lgkmcnt(0) 5044; GCN2-NEXT: v_mov_b32_e32 v0, s0 5045; GCN2-NEXT: v_mov_b32_e32 v1, s1 5046; GCN2-NEXT: v_mov_b32_e32 v2, s2 5047; GCN2-NEXT: flat_atomic_xor v[0:1], v2 5048; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5049; GCN2-NEXT: buffer_wbinvl1_vol 5050; GCN2-NEXT: s_endpgm 5051; 5052; GCN3-LABEL: atomic_xor_i32: 5053; GCN3: ; %bb.0: ; %entry 5054; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 5055; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c 5056; GCN3-NEXT: s_waitcnt lgkmcnt(0) 5057; GCN3-NEXT: v_mov_b32_e32 v0, s0 5058; GCN3-NEXT: v_mov_b32_e32 v1, s1 5059; GCN3-NEXT: v_mov_b32_e32 v2, s2 5060; GCN3-NEXT: flat_atomic_xor v[0:1], v2 5061; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5062; GCN3-NEXT: buffer_wbinvl1_vol 5063; 
GCN3-NEXT: s_endpgm 5064entry: 5065 %val = atomicrmw volatile xor ptr %out, i32 %in syncscope("agent") seq_cst 5066 ret void 5067} 5068 5069define amdgpu_kernel void @atomic_xor_i32_ret(ptr %out, ptr %out2, i32 %in) { 5070; GCN1-LABEL: atomic_xor_i32_ret: 5071; GCN1: ; %bb.0: ; %entry 5072; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 5073; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd 5074; GCN1-NEXT: s_waitcnt lgkmcnt(0) 5075; GCN1-NEXT: v_mov_b32_e32 v0, s0 5076; GCN1-NEXT: v_mov_b32_e32 v1, s1 5077; GCN1-NEXT: v_mov_b32_e32 v2, s4 5078; GCN1-NEXT: flat_atomic_xor v2, v[0:1], v2 glc 5079; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5080; GCN1-NEXT: buffer_wbinvl1_vol 5081; GCN1-NEXT: v_mov_b32_e32 v0, s2 5082; GCN1-NEXT: v_mov_b32_e32 v1, s3 5083; GCN1-NEXT: flat_store_dword v[0:1], v2 5084; GCN1-NEXT: s_endpgm 5085; 5086; GCN2-LABEL: atomic_xor_i32_ret: 5087; GCN2: ; %bb.0: ; %entry 5088; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 5089; GCN2-NEXT: s_load_dword s4, s[4:5], 0x34 5090; GCN2-NEXT: s_waitcnt lgkmcnt(0) 5091; GCN2-NEXT: v_mov_b32_e32 v0, s0 5092; GCN2-NEXT: v_mov_b32_e32 v1, s1 5093; GCN2-NEXT: v_mov_b32_e32 v2, s4 5094; GCN2-NEXT: flat_atomic_xor v2, v[0:1], v2 glc 5095; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5096; GCN2-NEXT: buffer_wbinvl1_vol 5097; GCN2-NEXT: v_mov_b32_e32 v0, s2 5098; GCN2-NEXT: v_mov_b32_e32 v1, s3 5099; GCN2-NEXT: flat_store_dword v[0:1], v2 5100; GCN2-NEXT: s_endpgm 5101; 5102; GCN3-LABEL: atomic_xor_i32_ret: 5103; GCN3: ; %bb.0: ; %entry 5104; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 5105; GCN3-NEXT: s_load_dword s6, s[4:5], 0x34 5106; GCN3-NEXT: s_waitcnt lgkmcnt(0) 5107; GCN3-NEXT: v_mov_b32_e32 v0, s0 5108; GCN3-NEXT: v_mov_b32_e32 v1, s1 5109; GCN3-NEXT: v_mov_b32_e32 v2, s6 5110; GCN3-NEXT: flat_atomic_xor v2, v[0:1], v2 glc 5111; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5112; GCN3-NEXT: buffer_wbinvl1_vol 5113; GCN3-NEXT: v_mov_b32_e32 v0, s2 5114; GCN3-NEXT: v_mov_b32_e32 v1, s3 5115; GCN3-NEXT: flat_store_dword 
v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
  %val = atomicrmw volatile xor ptr %out, i32 %in syncscope("agent") seq_cst
  store i32 %val, ptr %out2
  ret void
}

; Agent-scope seq_cst volatile atomicrmw xor of %in on element %index of %out
; (getelementptr i32 with an i64 index); the result is unused. The checks
; expect the index to be shifted left by 2 with s_lshl_b64 and added to the
; base with s_add_u32/s_addc_u32 before a flat_atomic_xor without glc.
define amdgpu_kernel void @atomic_xor_i32_addr64(ptr %out, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_xor_i32_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
; GCN1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9
; GCN1-NEXT: s_load_dword s4, s[4:5], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s2, s0
; GCN1-NEXT: s_addc_u32 s1, s3, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: flat_atomic_xor v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_xor_i32_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
; GCN2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
; GCN2-NEXT: s_load_dword s4, s[4:5], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s2, s0
; GCN2-NEXT: s_addc_u32 s1, s3, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: flat_atomic_xor v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_xor_i32_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
; GCN3-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
; GCN3-NEXT: s_load_dword s6, s[4:5], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s2, s0
; GCN3-NEXT: s_addc_u32 s1, s3, s1
; GCN3-NEXT: v_mov_b32_e32
v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_atomic_xor v[0:1], v2
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  %val = atomicrmw volatile xor ptr %ptr, i32 %in syncscope("agent") seq_cst
  ret void
}

; Agent-scope seq_cst volatile atomicrmw xor of %in on element %index of %out,
; with the result stored to %out2. The checks expect an s_lshl_b64 by 2 plus
; s_add_u32/s_addc_u32 address computation, flat_atomic_xor with glc, and a
; flat_store_dword of the destination register (v2).
define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_xor_i32_ret_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xf
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN1-NEXT: s_load_dword s8, s[4:5], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 2
; GCN1-NEXT: s_add_u32 s0, s0, s4
; GCN1-NEXT: s_addc_u32 s1, s1, s5
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s8
; GCN1-NEXT: flat_atomic_xor v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_xor_i32_ret_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN2-NEXT: s_load_dword s8, s[4:5], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 2
; GCN2-NEXT: s_add_u32 s0, s0, s4
; GCN2-NEXT: s_addc_u32 s1, s1, s5
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s8
; GCN2-NEXT: flat_atomic_xor v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32
v1, s3
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_xor_i32_ret_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN3-NEXT: s_load_dword s8, s[4:5], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 2
; GCN3-NEXT: s_add_u32 s0, s0, s4
; GCN3-NEXT: s_addc_u32 s1, s1, s5
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s8
; GCN3-NEXT: flat_atomic_xor v2, v[0:1], v2 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s2
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  %val = atomicrmw volatile xor ptr %ptr, i32 %in syncscope("agent") seq_cst
  store i32 %val, ptr %out2
  ret void
}

; seq_cst atomic i32 load from %in advanced by 4 i32 elements, with the loaded
; value stored to %out. The GCN1 and GCN2 checks add 16 to the address with
; s_add_u32/s_addc_u32, while the GCN3 checks use a 16-byte immediate offset
; on the flat_load_dword. All prefixes expect glc on the load.
define amdgpu_kernel void @atomic_load_i32_offset(ptr %in, ptr %out) {
; GCN1-LABEL: atomic_load_i32_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dword v2, v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_load_i32_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT:
v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_i32_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s2
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
  %gep = getelementptr i32, ptr %in, i32 4
  %val = load atomic i32, ptr %gep seq_cst, align 4
  store i32 %val, ptr %out
  ret void
}

; seq_cst atomic i32 load directly from %in, with the loaded value stored to
; %out. Every prefix expects flat_load_dword with glc, an s_waitcnt,
; buffer_wbinvl1_vol, and then the flat_store_dword.
define amdgpu_kernel void @atomic_load_i32(ptr %in, ptr %out) {
; GCN1-LABEL: atomic_load_i32:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dword v2, v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_load_i32:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT:
v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_i32:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dword v2, v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s2
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
  %val = load atomic i32, ptr %in seq_cst, align 4
  store i32 %val, ptr %out
  ret void
}

; seq_cst atomic i32 load from element %index of %in advanced by a further 4
; i32 elements, with the loaded value stored to %out. The GCN1 and GCN2 checks
; add the shifted index and then 16 using scalar adds, while the GCN3 checks
; add only the shifted index and put a 16-byte immediate offset on the load.
define amdgpu_kernel void @atomic_load_i32_addr64_offset(ptr %in, ptr %out, i64 %index) {
; GCN1-LABEL: atomic_load_i32_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 2
; GCN1-NEXT: s_add_u32 s0, s0, s4
; GCN1-NEXT: s_addc_u32 s1, s1, s5
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dword v2, v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_load_i32_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 2
; GCN2-NEXT: s_add_u32 s0, s0, s4
; GCN2-NEXT: s_addc_u32 s1, s1, s5
;
GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_i32_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 2
; GCN3-NEXT: s_add_u32 s0, s0, s4
; GCN3-NEXT: s_addc_u32 s1, s1, s5
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s2
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
  %ptr = getelementptr i32, ptr %in, i64 %index
  %gep = getelementptr i32, ptr %ptr, i32 4
  %val = load atomic i32, ptr %gep seq_cst, align 4
  store i32 %val, ptr %out
  ret void
}

; seq_cst atomic i32 load from element %index of %in, with the loaded value
; stored to %out. The checks expect s_lshl_b64 by 2 and s_add_u32/s_addc_u32
; to form the address, then flat_load_dword with glc.
define amdgpu_kernel void @atomic_load_i32_addr64(ptr %in, ptr %out, i64 %index) {
; GCN1-LABEL: atomic_load_i32_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 2
; GCN1-NEXT: s_add_u32 s0, s0, s4
; GCN1-NEXT: s_addc_u32 s1, s1, s5
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dword v2, v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
;
GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_load_i32_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 2
; GCN2-NEXT: s_add_u32 s0, s0, s4
; GCN2-NEXT: s_addc_u32 s1, s1, s5
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_i32_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 2
; GCN3-NEXT: s_add_u32 s0, s0, s4
; GCN3-NEXT: s_addc_u32 s1, s1, s5
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dword v2, v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s2
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
  %ptr = getelementptr i32, ptr %in, i64 %index
  %val = load atomic i32, ptr %ptr seq_cst, align 4
  store i32 %val, ptr %out
  ret void
}

; seq_cst atomic i32 store of %in to %out advanced by 4 i32 elements. The GCN1
; and GCN2 checks add 16 to the address with s_add_u32/s_addc_u32, while the
; GCN3 checks use a 16-byte immediate offset on the flat_store_dword. In every
; prefix the store is followed directly by s_endpgm.
define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, ptr %out) {
; GCN1-LABEL: atomic_store_i32_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb
; GCN1-NEXT: s_load_dword s2, s[4:5], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_store_i32_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c
; GCN2-NEXT: s_load_dword s2, s[4:5], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_store_i32_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c
; GCN3-NEXT: s_load_dword s2, s[4:5], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s2
; GCN3-NEXT: flat_store_dword v[0:1], v2 offset:16
; GCN3-NEXT: s_endpgm
entry:
  %gep = getelementptr i32, ptr %out, i32 4
  store atomic i32 %in, ptr %gep seq_cst, align 4
  ret void
}

; seq_cst atomic i32 store of %in directly to %out. Every prefix expects a
; single flat_store_dword followed directly by s_endpgm.
define amdgpu_kernel void @atomic_store_i32(i32 %in, ptr %out) {
; GCN1-LABEL: atomic_store_i32:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb
; GCN1-NEXT: s_load_dword s2, s[4:5], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_store_i32:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c
; GCN2-NEXT: s_load_dword s2, s[4:5], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s0
;
GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_store_i32:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c
; GCN3-NEXT: s_load_dword s2, s[4:5], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s2
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
  store atomic i32 %in, ptr %out seq_cst, align 4
  ret void
}

; seq_cst atomic i32 store of %in to element %index of %out advanced by a
; further 4 i32 elements. The GCN1 and GCN2 checks shown here add the shifted
; index and then 16 using scalar adds before the flat_store_dword.
define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, ptr %out, i64 %index) {
; GCN1-LABEL: atomic_store_i32_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
; GCN1-NEXT: s_load_dword s4, s[4:5], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[2:3], s[2:3], 2
; GCN1-NEXT: s_add_u32 s0, s0, s2
; GCN1-NEXT: s_addc_u32 s1, s1, s3
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_store_i32_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; GCN2-NEXT: s_load_dword s4, s[4:5], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[2:3], s[2:3], 2
; GCN2-NEXT: s_add_u32 s0, s0, s2
; GCN2-NEXT: s_addc_u32 s1, s1, s3
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_store_i32_addr64_offset:
; GCN3: ;
%bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; GCN3-NEXT: s_load_dword s6, s[4:5], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[2:3], s[2:3], 2
; GCN3-NEXT: s_add_u32 s0, s0, s2
; GCN3-NEXT: s_addc_u32 s1, s1, s3
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_store_dword v[0:1], v2 offset:16
; GCN3-NEXT: s_endpgm
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  %gep = getelementptr i32, ptr %ptr, i32 4
  store atomic i32 %in, ptr %gep seq_cst, align 4
  ret void
}

; seq_cst atomic i32 store of %in to element %index of %out. The checks expect
; s_lshl_b64 by 2 and s_add_u32/s_addc_u32 to form the address, then a
; flat_store_dword.
define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, ptr %out, i64 %index) {
; GCN1-LABEL: atomic_store_i32_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
; GCN1-NEXT: s_load_dword s4, s[4:5], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[2:3], s[2:3], 2
; GCN1-NEXT: s_add_u32 s0, s0, s2
; GCN1-NEXT: s_addc_u32 s1, s1, s3
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_store_i32_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; GCN2-NEXT: s_load_dword s4, s[4:5], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[2:3], s[2:3], 2
; GCN2-NEXT: s_add_u32 s0, s0, s2
; GCN2-NEXT: s_addc_u32 s1, s1, s3
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_store_i32_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; GCN3-NEXT: s_load_dword s6, s[4:5], 0x24
; GCN3-NEXT: s_waitcnt
lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[2:3], s[2:3], 2
; GCN3-NEXT: s_add_u32 s0, s0, s2
; GCN3-NEXT: s_addc_u32 s1, s1, s3
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  store atomic i32 %in, ptr %ptr seq_cst, align 4
  ret void
}

; seq_cst atomic float load from %in advanced by 4 float elements, with the
; loaded value stored to %out. The GCN1 and GCN2 checks shown here add 16 to
; the address with s_add_u32/s_addc_u32 and expect flat_load_dword with glc.
define amdgpu_kernel void @atomic_load_f32_offset(ptr %in, ptr %out) {
; GCN1-LABEL: atomic_load_f32_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dword v2, v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_load_f32_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_f32_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT:
flat_load_dword v2, v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s2
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
  %gep = getelementptr float, ptr %in, i32 4
  %val = load atomic float, ptr %gep seq_cst, align 4
  store float %val, ptr %out
  ret void
}

; seq_cst atomic float load directly from %in, with the loaded value stored to
; %out. The checks expect flat_load_dword with glc, an s_waitcnt and
; buffer_wbinvl1_vol before the value is stored.
define amdgpu_kernel void @atomic_load_f32(ptr %in, ptr %out) {
; GCN1-LABEL: atomic_load_f32:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dword v2, v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_load_f32:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_f32:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dword v2, v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s2
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT:
flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
  %val = load atomic float, ptr %in seq_cst, align 4
  store float %val, ptr %out
  ret void
}

; seq_cst atomic float load from element %index of %in advanced by a further 4
; float elements, with the loaded value stored to %out. The GCN1 and GCN2
; checks shown here add the shifted index and then 16 using scalar adds before
; a flat_load_dword with glc.
define amdgpu_kernel void @atomic_load_f32_addr64_offset(ptr %in, ptr %out, i64 %index) {
; GCN1-LABEL: atomic_load_f32_addr64_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 2
; GCN1-NEXT: s_add_u32 s0, s0, s4
; GCN1-NEXT: s_addc_u32 s1, s1, s5
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dword v2, v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_load_f32_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 2
; GCN2-NEXT: s_add_u32 s0, s0, s4
; GCN2-NEXT: s_addc_u32 s1, s1, s5
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_f32_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GCN3-NEXT: s_load_dwordx4
s[0:3], s[4:5], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 2
; GCN3-NEXT: s_add_u32 s0, s0, s4
; GCN3-NEXT: s_addc_u32 s1, s1, s5
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s2
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
  %ptr = getelementptr float, ptr %in, i64 %index
  %gep = getelementptr float, ptr %ptr, i32 4
  %val = load atomic float, ptr %gep seq_cst, align 4
  store float %val, ptr %out
  ret void
}

; seq_cst atomic float load from element %index of %in, with the loaded value
; stored to %out. The checks expect s_lshl_b64 by 2 and s_add_u32/s_addc_u32
; to form the address, then flat_load_dword with glc.
define amdgpu_kernel void @atomic_load_f32_addr64(ptr %in, ptr %out, i64 %index) {
; GCN1-LABEL: atomic_load_f32_addr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 2
; GCN1-NEXT: s_add_u32 s0, s0, s4
; GCN1-NEXT: s_addc_u32 s1, s1, s5
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_dword v2, v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_load_f32_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 2
; GCN2-NEXT: s_add_u32 s0, s0, s4
; GCN2-NEXT: s_addc_u32 s1, s1, s5
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT:
flat_load_dword v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_f32_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 2
; GCN3-NEXT: s_add_u32 s0, s0, s4
; GCN3-NEXT: s_addc_u32 s1, s1, s5
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dword v2, v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s2
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
  %ptr = getelementptr float, ptr %in, i64 %index
  %val = load atomic float, ptr %ptr seq_cst, align 4
  store float %val, ptr %out
  ret void
}

; seq_cst atomic float store of %in to %out advanced by 4 float elements. The
; GCN1 and GCN2 checks shown here add 16 to the address with
; s_add_u32/s_addc_u32 before the flat_store_dword.
define amdgpu_kernel void @atomic_store_f32_offset(float %in, ptr %out) {
; GCN1-LABEL: atomic_store_f32_offset:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb
; GCN1-NEXT: s_load_dword s2, s[4:5], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_store_f32_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c
; GCN2-NEXT: s_load_dword s2, s[4:5], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT:
v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_store_f32_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c
; GCN3-NEXT: s_load_dword s2, s[4:5], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s2
; GCN3-NEXT: flat_store_dword v[0:1], v2 offset:16
; GCN3-NEXT: s_endpgm
entry:
  %gep = getelementptr float, ptr %out, i32 4
  store atomic float %in, ptr %gep seq_cst, align 4
  ret void
}

; seq_cst atomic float store of %in directly to %out. Every prefix expects a
; single flat_store_dword followed directly by s_endpgm.
define amdgpu_kernel void @atomic_store_f32(float %in, ptr %out) {
; GCN1-LABEL: atomic_store_f32:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb
; GCN1-NEXT: s_load_dword s2, s[4:5], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_store_f32:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c
; GCN2-NEXT: s_load_dword s2, s[4:5], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_store_f32:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c
; GCN3-NEXT: s_load_dword s2, s[4:5], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s2
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
  store atomic
float %in, ptr %out seq_cst, align 4 5963 ret void 5964} 5965 5966define amdgpu_kernel void @atomic_store_f32_addr64_offset(float %in, ptr %out, i64 %index) { 5967; GCN1-LABEL: atomic_store_f32_addr64_offset: 5968; GCN1: ; %bb.0: ; %entry 5969; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb 5970; GCN1-NEXT: s_load_dword s4, s[4:5], 0x9 5971; GCN1-NEXT: s_waitcnt lgkmcnt(0) 5972; GCN1-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 5973; GCN1-NEXT: s_add_u32 s0, s0, s2 5974; GCN1-NEXT: s_addc_u32 s1, s1, s3 5975; GCN1-NEXT: s_add_u32 s0, s0, 16 5976; GCN1-NEXT: s_addc_u32 s1, s1, 0 5977; GCN1-NEXT: v_mov_b32_e32 v0, s0 5978; GCN1-NEXT: v_mov_b32_e32 v1, s1 5979; GCN1-NEXT: v_mov_b32_e32 v2, s4 5980; GCN1-NEXT: flat_store_dword v[0:1], v2 5981; GCN1-NEXT: s_endpgm 5982; 5983; GCN2-LABEL: atomic_store_f32_addr64_offset: 5984; GCN2: ; %bb.0: ; %entry 5985; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c 5986; GCN2-NEXT: s_load_dword s4, s[4:5], 0x24 5987; GCN2-NEXT: s_waitcnt lgkmcnt(0) 5988; GCN2-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 5989; GCN2-NEXT: s_add_u32 s0, s0, s2 5990; GCN2-NEXT: s_addc_u32 s1, s1, s3 5991; GCN2-NEXT: s_add_u32 s0, s0, 16 5992; GCN2-NEXT: s_addc_u32 s1, s1, 0 5993; GCN2-NEXT: v_mov_b32_e32 v0, s0 5994; GCN2-NEXT: v_mov_b32_e32 v1, s1 5995; GCN2-NEXT: v_mov_b32_e32 v2, s4 5996; GCN2-NEXT: flat_store_dword v[0:1], v2 5997; GCN2-NEXT: s_endpgm 5998; 5999; GCN3-LABEL: atomic_store_f32_addr64_offset: 6000; GCN3: ; %bb.0: ; %entry 6001; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c 6002; GCN3-NEXT: s_load_dword s6, s[4:5], 0x24 6003; GCN3-NEXT: s_waitcnt lgkmcnt(0) 6004; GCN3-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 6005; GCN3-NEXT: s_add_u32 s0, s0, s2 6006; GCN3-NEXT: s_addc_u32 s1, s1, s3 6007; GCN3-NEXT: v_mov_b32_e32 v0, s0 6008; GCN3-NEXT: v_mov_b32_e32 v1, s1 6009; GCN3-NEXT: v_mov_b32_e32 v2, s6 6010; GCN3-NEXT: flat_store_dword v[0:1], v2 offset:16 6011; GCN3-NEXT: s_endpgm 6012entry: 6013 %ptr = getelementptr float, ptr %out, i64 %index 6014 %gep = getelementptr 
float, ptr %ptr, i32 4 6015 store atomic float %in, ptr %gep seq_cst, align 4 6016 ret void 6017} 6018 6019define amdgpu_kernel void @atomic_store_f32_addr64(float %in, ptr %out, i64 %index) { 6020; GCN1-LABEL: atomic_store_f32_addr64: 6021; GCN1: ; %bb.0: ; %entry 6022; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb 6023; GCN1-NEXT: s_load_dword s4, s[4:5], 0x9 6024; GCN1-NEXT: s_waitcnt lgkmcnt(0) 6025; GCN1-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 6026; GCN1-NEXT: s_add_u32 s0, s0, s2 6027; GCN1-NEXT: s_addc_u32 s1, s1, s3 6028; GCN1-NEXT: v_mov_b32_e32 v0, s0 6029; GCN1-NEXT: v_mov_b32_e32 v1, s1 6030; GCN1-NEXT: v_mov_b32_e32 v2, s4 6031; GCN1-NEXT: flat_store_dword v[0:1], v2 6032; GCN1-NEXT: s_endpgm 6033; 6034; GCN2-LABEL: atomic_store_f32_addr64: 6035; GCN2: ; %bb.0: ; %entry 6036; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c 6037; GCN2-NEXT: s_load_dword s4, s[4:5], 0x24 6038; GCN2-NEXT: s_waitcnt lgkmcnt(0) 6039; GCN2-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 6040; GCN2-NEXT: s_add_u32 s0, s0, s2 6041; GCN2-NEXT: s_addc_u32 s1, s1, s3 6042; GCN2-NEXT: v_mov_b32_e32 v0, s0 6043; GCN2-NEXT: v_mov_b32_e32 v1, s1 6044; GCN2-NEXT: v_mov_b32_e32 v2, s4 6045; GCN2-NEXT: flat_store_dword v[0:1], v2 6046; GCN2-NEXT: s_endpgm 6047; 6048; GCN3-LABEL: atomic_store_f32_addr64: 6049; GCN3: ; %bb.0: ; %entry 6050; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c 6051; GCN3-NEXT: s_load_dword s6, s[4:5], 0x24 6052; GCN3-NEXT: s_waitcnt lgkmcnt(0) 6053; GCN3-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 6054; GCN3-NEXT: s_add_u32 s0, s0, s2 6055; GCN3-NEXT: s_addc_u32 s1, s1, s3 6056; GCN3-NEXT: v_mov_b32_e32 v0, s0 6057; GCN3-NEXT: v_mov_b32_e32 v1, s1 6058; GCN3-NEXT: v_mov_b32_e32 v2, s6 6059; GCN3-NEXT: flat_store_dword v[0:1], v2 6060; GCN3-NEXT: s_endpgm 6061entry: 6062 %ptr = getelementptr float, ptr %out, i64 %index 6063 store atomic float %in, ptr %ptr seq_cst, align 4 6064 ret void 6065} 6066 6067define amdgpu_kernel void @atomic_load_i8_offset(ptr %in, ptr %out) { 6068; GCN1-LABEL: 
atomic_load_i8_offset: 6069; GCN1: ; %bb.0: ; %entry 6070; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 6071; GCN1-NEXT: s_waitcnt lgkmcnt(0) 6072; GCN1-NEXT: s_add_u32 s0, s0, 16 6073; GCN1-NEXT: s_addc_u32 s1, s1, 0 6074; GCN1-NEXT: v_mov_b32_e32 v0, s0 6075; GCN1-NEXT: v_mov_b32_e32 v1, s1 6076; GCN1-NEXT: flat_load_ubyte v2, v[0:1] glc 6077; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6078; GCN1-NEXT: buffer_wbinvl1_vol 6079; GCN1-NEXT: v_mov_b32_e32 v0, s2 6080; GCN1-NEXT: v_mov_b32_e32 v1, s3 6081; GCN1-NEXT: flat_store_byte v[0:1], v2 6082; GCN1-NEXT: s_endpgm 6083; 6084; GCN2-LABEL: atomic_load_i8_offset: 6085; GCN2: ; %bb.0: ; %entry 6086; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 6087; GCN2-NEXT: s_waitcnt lgkmcnt(0) 6088; GCN2-NEXT: s_add_u32 s0, s0, 16 6089; GCN2-NEXT: s_addc_u32 s1, s1, 0 6090; GCN2-NEXT: v_mov_b32_e32 v0, s0 6091; GCN2-NEXT: v_mov_b32_e32 v1, s1 6092; GCN2-NEXT: flat_load_ubyte v2, v[0:1] glc 6093; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6094; GCN2-NEXT: buffer_wbinvl1_vol 6095; GCN2-NEXT: v_mov_b32_e32 v0, s2 6096; GCN2-NEXT: v_mov_b32_e32 v1, s3 6097; GCN2-NEXT: flat_store_byte v[0:1], v2 6098; GCN2-NEXT: s_endpgm 6099; 6100; GCN3-LABEL: atomic_load_i8_offset: 6101; GCN3: ; %bb.0: ; %entry 6102; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 6103; GCN3-NEXT: s_waitcnt lgkmcnt(0) 6104; GCN3-NEXT: v_mov_b32_e32 v0, s0 6105; GCN3-NEXT: v_mov_b32_e32 v1, s1 6106; GCN3-NEXT: flat_load_ubyte v2, v[0:1] offset:16 glc 6107; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6108; GCN3-NEXT: buffer_wbinvl1_vol 6109; GCN3-NEXT: v_mov_b32_e32 v0, s2 6110; GCN3-NEXT: v_mov_b32_e32 v1, s3 6111; GCN3-NEXT: flat_store_byte v[0:1], v2 6112; GCN3-NEXT: s_endpgm 6113entry: 6114 %gep = getelementptr i8, ptr %in, i64 16 6115 %val = load atomic i8, ptr %gep seq_cst, align 1 6116 store i8 %val, ptr %out 6117 ret void 6118} 6119 6120define amdgpu_kernel void @atomic_load_i8(ptr %in, ptr %out) { 6121; GCN1-LABEL: atomic_load_i8: 6122; GCN1: ; %bb.0: ; %entry 
6123; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 6124; GCN1-NEXT: s_waitcnt lgkmcnt(0) 6125; GCN1-NEXT: v_mov_b32_e32 v0, s0 6126; GCN1-NEXT: v_mov_b32_e32 v1, s1 6127; GCN1-NEXT: flat_load_ubyte v2, v[0:1] glc 6128; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6129; GCN1-NEXT: buffer_wbinvl1_vol 6130; GCN1-NEXT: v_mov_b32_e32 v0, s2 6131; GCN1-NEXT: v_mov_b32_e32 v1, s3 6132; GCN1-NEXT: flat_store_byte v[0:1], v2 6133; GCN1-NEXT: s_endpgm 6134; 6135; GCN2-LABEL: atomic_load_i8: 6136; GCN2: ; %bb.0: ; %entry 6137; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 6138; GCN2-NEXT: s_waitcnt lgkmcnt(0) 6139; GCN2-NEXT: v_mov_b32_e32 v0, s0 6140; GCN2-NEXT: v_mov_b32_e32 v1, s1 6141; GCN2-NEXT: flat_load_ubyte v2, v[0:1] glc 6142; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6143; GCN2-NEXT: buffer_wbinvl1_vol 6144; GCN2-NEXT: v_mov_b32_e32 v0, s2 6145; GCN2-NEXT: v_mov_b32_e32 v1, s3 6146; GCN2-NEXT: flat_store_byte v[0:1], v2 6147; GCN2-NEXT: s_endpgm 6148; 6149; GCN3-LABEL: atomic_load_i8: 6150; GCN3: ; %bb.0: ; %entry 6151; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 6152; GCN3-NEXT: s_waitcnt lgkmcnt(0) 6153; GCN3-NEXT: v_mov_b32_e32 v0, s0 6154; GCN3-NEXT: v_mov_b32_e32 v1, s1 6155; GCN3-NEXT: flat_load_ubyte v2, v[0:1] glc 6156; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6157; GCN3-NEXT: buffer_wbinvl1_vol 6158; GCN3-NEXT: v_mov_b32_e32 v0, s2 6159; GCN3-NEXT: v_mov_b32_e32 v1, s3 6160; GCN3-NEXT: flat_store_byte v[0:1], v2 6161; GCN3-NEXT: s_endpgm 6162entry: 6163 %val = load atomic i8, ptr %in seq_cst, align 1 6164 store i8 %val, ptr %out 6165 ret void 6166} 6167 6168define amdgpu_kernel void @atomic_load_i8_addr64_offset(ptr %in, ptr %out, i64 %index) { 6169; GCN1-LABEL: atomic_load_i8_addr64_offset: 6170; GCN1: ; %bb.0: ; %entry 6171; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 6172; GCN1-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 6173; GCN1-NEXT: s_waitcnt lgkmcnt(0) 6174; GCN1-NEXT: s_add_u32 s0, s0, s4 6175; GCN1-NEXT: s_addc_u32 s1, s1, s5 6176; GCN1-NEXT: 
s_add_u32 s0, s0, 16 6177; GCN1-NEXT: s_addc_u32 s1, s1, 0 6178; GCN1-NEXT: v_mov_b32_e32 v0, s0 6179; GCN1-NEXT: v_mov_b32_e32 v1, s1 6180; GCN1-NEXT: flat_load_ubyte v2, v[0:1] glc 6181; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6182; GCN1-NEXT: buffer_wbinvl1_vol 6183; GCN1-NEXT: v_mov_b32_e32 v0, s2 6184; GCN1-NEXT: v_mov_b32_e32 v1, s3 6185; GCN1-NEXT: flat_store_byte v[0:1], v2 6186; GCN1-NEXT: s_endpgm 6187; 6188; GCN2-LABEL: atomic_load_i8_addr64_offset: 6189; GCN2: ; %bb.0: ; %entry 6190; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 6191; GCN2-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 6192; GCN2-NEXT: s_waitcnt lgkmcnt(0) 6193; GCN2-NEXT: s_add_u32 s0, s0, s4 6194; GCN2-NEXT: s_addc_u32 s1, s1, s5 6195; GCN2-NEXT: s_add_u32 s0, s0, 16 6196; GCN2-NEXT: s_addc_u32 s1, s1, 0 6197; GCN2-NEXT: v_mov_b32_e32 v0, s0 6198; GCN2-NEXT: v_mov_b32_e32 v1, s1 6199; GCN2-NEXT: flat_load_ubyte v2, v[0:1] glc 6200; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6201; GCN2-NEXT: buffer_wbinvl1_vol 6202; GCN2-NEXT: v_mov_b32_e32 v0, s2 6203; GCN2-NEXT: v_mov_b32_e32 v1, s3 6204; GCN2-NEXT: flat_store_byte v[0:1], v2 6205; GCN2-NEXT: s_endpgm 6206; 6207; GCN3-LABEL: atomic_load_i8_addr64_offset: 6208; GCN3: ; %bb.0: ; %entry 6209; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 6210; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 6211; GCN3-NEXT: s_waitcnt lgkmcnt(0) 6212; GCN3-NEXT: s_add_u32 s0, s0, s6 6213; GCN3-NEXT: s_addc_u32 s1, s1, s7 6214; GCN3-NEXT: v_mov_b32_e32 v0, s0 6215; GCN3-NEXT: v_mov_b32_e32 v1, s1 6216; GCN3-NEXT: flat_load_ubyte v2, v[0:1] offset:16 glc 6217; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6218; GCN3-NEXT: buffer_wbinvl1_vol 6219; GCN3-NEXT: v_mov_b32_e32 v0, s2 6220; GCN3-NEXT: v_mov_b32_e32 v1, s3 6221; GCN3-NEXT: flat_store_byte v[0:1], v2 6222; GCN3-NEXT: s_endpgm 6223entry: 6224 %ptr = getelementptr i8, ptr %in, i64 %index 6225 %gep = getelementptr i8, ptr %ptr, i64 16 6226 %val = load atomic i8, ptr %gep seq_cst, align 1 6227 store i8 %val, ptr 
%out 6228 ret void 6229} 6230 6231define amdgpu_kernel void @atomic_store_i8_offset(i8 %in, ptr %out) { 6232; GCN1-LABEL: atomic_store_i8_offset: 6233; GCN1: ; %bb.0: ; %entry 6234; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb 6235; GCN1-NEXT: s_load_dword s2, s[4:5], 0x9 6236; GCN1-NEXT: s_waitcnt lgkmcnt(0) 6237; GCN1-NEXT: s_add_u32 s0, s0, 16 6238; GCN1-NEXT: s_addc_u32 s1, s1, 0 6239; GCN1-NEXT: v_mov_b32_e32 v0, s0 6240; GCN1-NEXT: v_mov_b32_e32 v1, s1 6241; GCN1-NEXT: v_mov_b32_e32 v2, s2 6242; GCN1-NEXT: flat_store_byte v[0:1], v2 6243; GCN1-NEXT: s_endpgm 6244; 6245; GCN2-LABEL: atomic_store_i8_offset: 6246; GCN2: ; %bb.0: ; %entry 6247; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c 6248; GCN2-NEXT: s_load_dword s2, s[4:5], 0x24 6249; GCN2-NEXT: s_waitcnt lgkmcnt(0) 6250; GCN2-NEXT: s_add_u32 s0, s0, 16 6251; GCN2-NEXT: s_addc_u32 s1, s1, 0 6252; GCN2-NEXT: v_mov_b32_e32 v0, s0 6253; GCN2-NEXT: v_mov_b32_e32 v1, s1 6254; GCN2-NEXT: v_mov_b32_e32 v2, s2 6255; GCN2-NEXT: flat_store_byte v[0:1], v2 6256; GCN2-NEXT: s_endpgm 6257; 6258; GCN3-LABEL: atomic_store_i8_offset: 6259; GCN3: ; %bb.0: ; %entry 6260; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c 6261; GCN3-NEXT: s_load_dword s2, s[4:5], 0x24 6262; GCN3-NEXT: s_waitcnt lgkmcnt(0) 6263; GCN3-NEXT: v_mov_b32_e32 v0, s0 6264; GCN3-NEXT: v_mov_b32_e32 v1, s1 6265; GCN3-NEXT: v_mov_b32_e32 v2, s2 6266; GCN3-NEXT: flat_store_byte v[0:1], v2 offset:16 6267; GCN3-NEXT: s_endpgm 6268entry: 6269 %gep = getelementptr i8, ptr %out, i64 16 6270 store atomic i8 %in, ptr %gep seq_cst, align 1 6271 ret void 6272} 6273 6274define amdgpu_kernel void @atomic_store_i8(i8 %in, ptr %out) { 6275; GCN1-LABEL: atomic_store_i8: 6276; GCN1: ; %bb.0: ; %entry 6277; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb 6278; GCN1-NEXT: s_load_dword s2, s[4:5], 0x9 6279; GCN1-NEXT: s_waitcnt lgkmcnt(0) 6280; GCN1-NEXT: v_mov_b32_e32 v0, s0 6281; GCN1-NEXT: v_mov_b32_e32 v1, s1 6282; GCN1-NEXT: v_mov_b32_e32 v2, s2 6283; GCN1-NEXT: 
flat_store_byte v[0:1], v2 6284; GCN1-NEXT: s_endpgm 6285; 6286; GCN2-LABEL: atomic_store_i8: 6287; GCN2: ; %bb.0: ; %entry 6288; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c 6289; GCN2-NEXT: s_load_dword s2, s[4:5], 0x24 6290; GCN2-NEXT: s_waitcnt lgkmcnt(0) 6291; GCN2-NEXT: v_mov_b32_e32 v0, s0 6292; GCN2-NEXT: v_mov_b32_e32 v1, s1 6293; GCN2-NEXT: v_mov_b32_e32 v2, s2 6294; GCN2-NEXT: flat_store_byte v[0:1], v2 6295; GCN2-NEXT: s_endpgm 6296; 6297; GCN3-LABEL: atomic_store_i8: 6298; GCN3: ; %bb.0: ; %entry 6299; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c 6300; GCN3-NEXT: s_load_dword s2, s[4:5], 0x24 6301; GCN3-NEXT: s_waitcnt lgkmcnt(0) 6302; GCN3-NEXT: v_mov_b32_e32 v0, s0 6303; GCN3-NEXT: v_mov_b32_e32 v1, s1 6304; GCN3-NEXT: v_mov_b32_e32 v2, s2 6305; GCN3-NEXT: flat_store_byte v[0:1], v2 6306; GCN3-NEXT: s_endpgm 6307entry: 6308 store atomic i8 %in, ptr %out seq_cst, align 1 6309 ret void 6310} 6311 6312define amdgpu_kernel void @atomic_store_i8_addr64_offset(i8 %in, ptr %out, i64 %index) { 6313; GCN1-LABEL: atomic_store_i8_addr64_offset: 6314; GCN1: ; %bb.0: ; %entry 6315; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb 6316; GCN1-NEXT: s_load_dword s4, s[4:5], 0x9 6317; GCN1-NEXT: s_waitcnt lgkmcnt(0) 6318; GCN1-NEXT: s_add_u32 s0, s0, s2 6319; GCN1-NEXT: s_addc_u32 s1, s1, s3 6320; GCN1-NEXT: s_add_u32 s0, s0, 16 6321; GCN1-NEXT: s_addc_u32 s1, s1, 0 6322; GCN1-NEXT: v_mov_b32_e32 v0, s0 6323; GCN1-NEXT: v_mov_b32_e32 v1, s1 6324; GCN1-NEXT: v_mov_b32_e32 v2, s4 6325; GCN1-NEXT: flat_store_byte v[0:1], v2 6326; GCN1-NEXT: s_endpgm 6327; 6328; GCN2-LABEL: atomic_store_i8_addr64_offset: 6329; GCN2: ; %bb.0: ; %entry 6330; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c 6331; GCN2-NEXT: s_load_dword s4, s[4:5], 0x24 6332; GCN2-NEXT: s_waitcnt lgkmcnt(0) 6333; GCN2-NEXT: s_add_u32 s0, s0, s2 6334; GCN2-NEXT: s_addc_u32 s1, s1, s3 6335; GCN2-NEXT: s_add_u32 s0, s0, 16 6336; GCN2-NEXT: s_addc_u32 s1, s1, 0 6337; GCN2-NEXT: v_mov_b32_e32 v0, s0 6338; 
GCN2-NEXT: v_mov_b32_e32 v1, s1 6339; GCN2-NEXT: v_mov_b32_e32 v2, s4 6340; GCN2-NEXT: flat_store_byte v[0:1], v2 6341; GCN2-NEXT: s_endpgm 6342; 6343; GCN3-LABEL: atomic_store_i8_addr64_offset: 6344; GCN3: ; %bb.0: ; %entry 6345; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c 6346; GCN3-NEXT: s_load_dword s6, s[4:5], 0x24 6347; GCN3-NEXT: s_waitcnt lgkmcnt(0) 6348; GCN3-NEXT: s_add_u32 s0, s0, s2 6349; GCN3-NEXT: s_addc_u32 s1, s1, s3 6350; GCN3-NEXT: v_mov_b32_e32 v0, s0 6351; GCN3-NEXT: v_mov_b32_e32 v1, s1 6352; GCN3-NEXT: v_mov_b32_e32 v2, s6 6353; GCN3-NEXT: flat_store_byte v[0:1], v2 offset:16 6354; GCN3-NEXT: s_endpgm 6355entry: 6356 %ptr = getelementptr i8, ptr %out, i64 %index 6357 %gep = getelementptr i8, ptr %ptr, i64 16 6358 store atomic i8 %in, ptr %gep seq_cst, align 1 6359 ret void 6360} 6361 6362define amdgpu_kernel void @atomic_load_i16_offset(ptr %in, ptr %out) { 6363; GCN1-LABEL: atomic_load_i16_offset: 6364; GCN1: ; %bb.0: ; %entry 6365; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 6366; GCN1-NEXT: s_waitcnt lgkmcnt(0) 6367; GCN1-NEXT: s_add_u32 s0, s0, 16 6368; GCN1-NEXT: s_addc_u32 s1, s1, 0 6369; GCN1-NEXT: v_mov_b32_e32 v0, s0 6370; GCN1-NEXT: v_mov_b32_e32 v1, s1 6371; GCN1-NEXT: flat_load_ushort v2, v[0:1] glc 6372; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6373; GCN1-NEXT: buffer_wbinvl1_vol 6374; GCN1-NEXT: v_mov_b32_e32 v0, s2 6375; GCN1-NEXT: v_mov_b32_e32 v1, s3 6376; GCN1-NEXT: flat_store_short v[0:1], v2 6377; GCN1-NEXT: s_endpgm 6378; 6379; GCN2-LABEL: atomic_load_i16_offset: 6380; GCN2: ; %bb.0: ; %entry 6381; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 6382; GCN2-NEXT: s_waitcnt lgkmcnt(0) 6383; GCN2-NEXT: s_add_u32 s0, s0, 16 6384; GCN2-NEXT: s_addc_u32 s1, s1, 0 6385; GCN2-NEXT: v_mov_b32_e32 v0, s0 6386; GCN2-NEXT: v_mov_b32_e32 v1, s1 6387; GCN2-NEXT: flat_load_ushort v2, v[0:1] glc 6388; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6389; GCN2-NEXT: buffer_wbinvl1_vol 6390; GCN2-NEXT: v_mov_b32_e32 v0, s2 6391; GCN2-NEXT: 
v_mov_b32_e32 v1, s3 6392; GCN2-NEXT: flat_store_short v[0:1], v2 6393; GCN2-NEXT: s_endpgm 6394; 6395; GCN3-LABEL: atomic_load_i16_offset: 6396; GCN3: ; %bb.0: ; %entry 6397; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 6398; GCN3-NEXT: s_waitcnt lgkmcnt(0) 6399; GCN3-NEXT: v_mov_b32_e32 v0, s0 6400; GCN3-NEXT: v_mov_b32_e32 v1, s1 6401; GCN3-NEXT: flat_load_ushort v2, v[0:1] offset:16 glc 6402; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6403; GCN3-NEXT: buffer_wbinvl1_vol 6404; GCN3-NEXT: v_mov_b32_e32 v0, s2 6405; GCN3-NEXT: v_mov_b32_e32 v1, s3 6406; GCN3-NEXT: flat_store_short v[0:1], v2 6407; GCN3-NEXT: s_endpgm 6408entry: 6409 %gep = getelementptr i16, ptr %in, i64 8 6410 %val = load atomic i16, ptr %gep seq_cst, align 2 6411 store i16 %val, ptr %out 6412 ret void 6413} 6414 6415define amdgpu_kernel void @atomic_load_i16(ptr %in, ptr %out) { 6416; GCN1-LABEL: atomic_load_i16: 6417; GCN1: ; %bb.0: ; %entry 6418; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 6419; GCN1-NEXT: s_waitcnt lgkmcnt(0) 6420; GCN1-NEXT: v_mov_b32_e32 v0, s0 6421; GCN1-NEXT: v_mov_b32_e32 v1, s1 6422; GCN1-NEXT: flat_load_ushort v2, v[0:1] glc 6423; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6424; GCN1-NEXT: buffer_wbinvl1_vol 6425; GCN1-NEXT: v_mov_b32_e32 v0, s2 6426; GCN1-NEXT: v_mov_b32_e32 v1, s3 6427; GCN1-NEXT: flat_store_short v[0:1], v2 6428; GCN1-NEXT: s_endpgm 6429; 6430; GCN2-LABEL: atomic_load_i16: 6431; GCN2: ; %bb.0: ; %entry 6432; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 6433; GCN2-NEXT: s_waitcnt lgkmcnt(0) 6434; GCN2-NEXT: v_mov_b32_e32 v0, s0 6435; GCN2-NEXT: v_mov_b32_e32 v1, s1 6436; GCN2-NEXT: flat_load_ushort v2, v[0:1] glc 6437; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6438; GCN2-NEXT: buffer_wbinvl1_vol 6439; GCN2-NEXT: v_mov_b32_e32 v0, s2 6440; GCN2-NEXT: v_mov_b32_e32 v1, s3 6441; GCN2-NEXT: flat_store_short v[0:1], v2 6442; GCN2-NEXT: s_endpgm 6443; 6444; GCN3-LABEL: atomic_load_i16: 6445; GCN3: ; %bb.0: ; %entry 6446; GCN3-NEXT: s_load_dwordx4 s[0:3], 
s[4:5], 0x24 6447; GCN3-NEXT: s_waitcnt lgkmcnt(0) 6448; GCN3-NEXT: v_mov_b32_e32 v0, s0 6449; GCN3-NEXT: v_mov_b32_e32 v1, s1 6450; GCN3-NEXT: flat_load_ushort v2, v[0:1] glc 6451; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6452; GCN3-NEXT: buffer_wbinvl1_vol 6453; GCN3-NEXT: v_mov_b32_e32 v0, s2 6454; GCN3-NEXT: v_mov_b32_e32 v1, s3 6455; GCN3-NEXT: flat_store_short v[0:1], v2 6456; GCN3-NEXT: s_endpgm 6457entry: 6458 %val = load atomic i16, ptr %in seq_cst, align 2 6459 store i16 %val, ptr %out 6460 ret void 6461} 6462 6463define amdgpu_kernel void @atomic_load_i16_addr64_offset(ptr %in, ptr %out, i64 %index) { 6464; GCN1-LABEL: atomic_load_i16_addr64_offset: 6465; GCN1: ; %bb.0: ; %entry 6466; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd 6467; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 6468; GCN1-NEXT: s_waitcnt lgkmcnt(0) 6469; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 1 6470; GCN1-NEXT: s_add_u32 s0, s0, s4 6471; GCN1-NEXT: s_addc_u32 s1, s1, s5 6472; GCN1-NEXT: s_add_u32 s0, s0, 16 6473; GCN1-NEXT: s_addc_u32 s1, s1, 0 6474; GCN1-NEXT: v_mov_b32_e32 v0, s0 6475; GCN1-NEXT: v_mov_b32_e32 v1, s1 6476; GCN1-NEXT: flat_load_ushort v2, v[0:1] glc 6477; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6478; GCN1-NEXT: buffer_wbinvl1_vol 6479; GCN1-NEXT: v_mov_b32_e32 v0, s2 6480; GCN1-NEXT: v_mov_b32_e32 v1, s3 6481; GCN1-NEXT: flat_store_short v[0:1], v2 6482; GCN1-NEXT: s_endpgm 6483; 6484; GCN2-LABEL: atomic_load_i16_addr64_offset: 6485; GCN2: ; %bb.0: ; %entry 6486; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 6487; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 6488; GCN2-NEXT: s_waitcnt lgkmcnt(0) 6489; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 1 6490; GCN2-NEXT: s_add_u32 s0, s0, s4 6491; GCN2-NEXT: s_addc_u32 s1, s1, s5 6492; GCN2-NEXT: s_add_u32 s0, s0, 16 6493; GCN2-NEXT: s_addc_u32 s1, s1, 0 6494; GCN2-NEXT: v_mov_b32_e32 v0, s0 6495; GCN2-NEXT: v_mov_b32_e32 v1, s1 6496; GCN2-NEXT: flat_load_ushort v2, v[0:1] glc 6497; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
6498; GCN2-NEXT: buffer_wbinvl1_vol 6499; GCN2-NEXT: v_mov_b32_e32 v0, s2 6500; GCN2-NEXT: v_mov_b32_e32 v1, s3 6501; GCN2-NEXT: flat_store_short v[0:1], v2 6502; GCN2-NEXT: s_endpgm 6503; 6504; GCN3-LABEL: atomic_load_i16_addr64_offset: 6505; GCN3: ; %bb.0: ; %entry 6506; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 6507; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 6508; GCN3-NEXT: s_waitcnt lgkmcnt(0) 6509; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 1 6510; GCN3-NEXT: s_add_u32 s0, s0, s4 6511; GCN3-NEXT: s_addc_u32 s1, s1, s5 6512; GCN3-NEXT: v_mov_b32_e32 v0, s0 6513; GCN3-NEXT: v_mov_b32_e32 v1, s1 6514; GCN3-NEXT: flat_load_ushort v2, v[0:1] offset:16 glc 6515; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6516; GCN3-NEXT: buffer_wbinvl1_vol 6517; GCN3-NEXT: v_mov_b32_e32 v0, s2 6518; GCN3-NEXT: v_mov_b32_e32 v1, s3 6519; GCN3-NEXT: flat_store_short v[0:1], v2 6520; GCN3-NEXT: s_endpgm 6521entry: 6522 %ptr = getelementptr i16, ptr %in, i64 %index 6523 %gep = getelementptr i16, ptr %ptr, i64 8 6524 %val = load atomic i16, ptr %gep seq_cst, align 2 6525 store i16 %val, ptr %out 6526 ret void 6527} 6528 6529define amdgpu_kernel void @atomic_store_i16_offset(i16 %in, ptr %out) { 6530; GCN1-LABEL: atomic_store_i16_offset: 6531; GCN1: ; %bb.0: ; %entry 6532; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb 6533; GCN1-NEXT: s_load_dword s2, s[4:5], 0x9 6534; GCN1-NEXT: s_waitcnt lgkmcnt(0) 6535; GCN1-NEXT: s_add_u32 s0, s0, 16 6536; GCN1-NEXT: s_addc_u32 s1, s1, 0 6537; GCN1-NEXT: v_mov_b32_e32 v0, s0 6538; GCN1-NEXT: v_mov_b32_e32 v1, s1 6539; GCN1-NEXT: v_mov_b32_e32 v2, s2 6540; GCN1-NEXT: flat_store_short v[0:1], v2 6541; GCN1-NEXT: s_endpgm 6542; 6543; GCN2-LABEL: atomic_store_i16_offset: 6544; GCN2: ; %bb.0: ; %entry 6545; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c 6546; GCN2-NEXT: s_load_dword s2, s[4:5], 0x24 6547; GCN2-NEXT: s_waitcnt lgkmcnt(0) 6548; GCN2-NEXT: s_add_u32 s0, s0, 16 6549; GCN2-NEXT: s_addc_u32 s1, s1, 0 6550; GCN2-NEXT: v_mov_b32_e32 v0, s0 
6551; GCN2-NEXT: v_mov_b32_e32 v1, s1 6552; GCN2-NEXT: v_mov_b32_e32 v2, s2 6553; GCN2-NEXT: flat_store_short v[0:1], v2 6554; GCN2-NEXT: s_endpgm 6555; 6556; GCN3-LABEL: atomic_store_i16_offset: 6557; GCN3: ; %bb.0: ; %entry 6558; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c 6559; GCN3-NEXT: s_load_dword s2, s[4:5], 0x24 6560; GCN3-NEXT: s_waitcnt lgkmcnt(0) 6561; GCN3-NEXT: v_mov_b32_e32 v0, s0 6562; GCN3-NEXT: v_mov_b32_e32 v1, s1 6563; GCN3-NEXT: v_mov_b32_e32 v2, s2 6564; GCN3-NEXT: flat_store_short v[0:1], v2 offset:16 6565; GCN3-NEXT: s_endpgm 6566entry: 6567 %gep = getelementptr i16, ptr %out, i64 8 6568 store atomic i16 %in, ptr %gep seq_cst, align 2 6569 ret void 6570} 6571 6572define amdgpu_kernel void @atomic_store_i16(i16 %in, ptr %out) { 6573; GCN1-LABEL: atomic_store_i16: 6574; GCN1: ; %bb.0: ; %entry 6575; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb 6576; GCN1-NEXT: s_load_dword s2, s[4:5], 0x9 6577; GCN1-NEXT: s_waitcnt lgkmcnt(0) 6578; GCN1-NEXT: v_mov_b32_e32 v0, s0 6579; GCN1-NEXT: v_mov_b32_e32 v1, s1 6580; GCN1-NEXT: v_mov_b32_e32 v2, s2 6581; GCN1-NEXT: flat_store_short v[0:1], v2 6582; GCN1-NEXT: s_endpgm 6583; 6584; GCN2-LABEL: atomic_store_i16: 6585; GCN2: ; %bb.0: ; %entry 6586; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c 6587; GCN2-NEXT: s_load_dword s2, s[4:5], 0x24 6588; GCN2-NEXT: s_waitcnt lgkmcnt(0) 6589; GCN2-NEXT: v_mov_b32_e32 v0, s0 6590; GCN2-NEXT: v_mov_b32_e32 v1, s1 6591; GCN2-NEXT: v_mov_b32_e32 v2, s2 6592; GCN2-NEXT: flat_store_short v[0:1], v2 6593; GCN2-NEXT: s_endpgm 6594; 6595; GCN3-LABEL: atomic_store_i16: 6596; GCN3: ; %bb.0: ; %entry 6597; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c 6598; GCN3-NEXT: s_load_dword s2, s[4:5], 0x24 6599; GCN3-NEXT: s_waitcnt lgkmcnt(0) 6600; GCN3-NEXT: v_mov_b32_e32 v0, s0 6601; GCN3-NEXT: v_mov_b32_e32 v1, s1 6602; GCN3-NEXT: v_mov_b32_e32 v2, s2 6603; GCN3-NEXT: flat_store_short v[0:1], v2 6604; GCN3-NEXT: s_endpgm 6605entry: 6606 store atomic i16 %in, ptr %out seq_cst, 
align 2 6607 ret void 6608} 6609 6610define amdgpu_kernel void @atomic_store_i16_addr64_offset(i16 %in, ptr %out, i64 %index) { 6611; GCN1-LABEL: atomic_store_i16_addr64_offset: 6612; GCN1: ; %bb.0: ; %entry 6613; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb 6614; GCN1-NEXT: s_load_dword s4, s[4:5], 0x9 6615; GCN1-NEXT: s_waitcnt lgkmcnt(0) 6616; GCN1-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 6617; GCN1-NEXT: s_add_u32 s0, s0, s2 6618; GCN1-NEXT: s_addc_u32 s1, s1, s3 6619; GCN1-NEXT: s_add_u32 s0, s0, 16 6620; GCN1-NEXT: s_addc_u32 s1, s1, 0 6621; GCN1-NEXT: v_mov_b32_e32 v0, s0 6622; GCN1-NEXT: v_mov_b32_e32 v1, s1 6623; GCN1-NEXT: v_mov_b32_e32 v2, s4 6624; GCN1-NEXT: flat_store_short v[0:1], v2 6625; GCN1-NEXT: s_endpgm 6626; 6627; GCN2-LABEL: atomic_store_i16_addr64_offset: 6628; GCN2: ; %bb.0: ; %entry 6629; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c 6630; GCN2-NEXT: s_load_dword s4, s[4:5], 0x24 6631; GCN2-NEXT: s_waitcnt lgkmcnt(0) 6632; GCN2-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 6633; GCN2-NEXT: s_add_u32 s0, s0, s2 6634; GCN2-NEXT: s_addc_u32 s1, s1, s3 6635; GCN2-NEXT: s_add_u32 s0, s0, 16 6636; GCN2-NEXT: s_addc_u32 s1, s1, 0 6637; GCN2-NEXT: v_mov_b32_e32 v0, s0 6638; GCN2-NEXT: v_mov_b32_e32 v1, s1 6639; GCN2-NEXT: v_mov_b32_e32 v2, s4 6640; GCN2-NEXT: flat_store_short v[0:1], v2 6641; GCN2-NEXT: s_endpgm 6642; 6643; GCN3-LABEL: atomic_store_i16_addr64_offset: 6644; GCN3: ; %bb.0: ; %entry 6645; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c 6646; GCN3-NEXT: s_load_dword s6, s[4:5], 0x24 6647; GCN3-NEXT: s_waitcnt lgkmcnt(0) 6648; GCN3-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 6649; GCN3-NEXT: s_add_u32 s0, s0, s2 6650; GCN3-NEXT: s_addc_u32 s1, s1, s3 6651; GCN3-NEXT: v_mov_b32_e32 v0, s0 6652; GCN3-NEXT: v_mov_b32_e32 v1, s1 6653; GCN3-NEXT: v_mov_b32_e32 v2, s6 6654; GCN3-NEXT: flat_store_short v[0:1], v2 offset:16 6655; GCN3-NEXT: s_endpgm 6656entry: 6657 %ptr = getelementptr i16, ptr %out, i64 %index 6658 %gep = getelementptr i16, ptr %ptr, i64 8 6659 store 
atomic i16 %in, ptr %gep seq_cst, align 2 6660 ret void 6661} 6662 6663define amdgpu_kernel void @atomic_store_f16_offset(half %in, ptr %out) { 6664; GCN1-LABEL: atomic_store_f16_offset: 6665; GCN1: ; %bb.0: ; %entry 6666; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb 6667; GCN1-NEXT: s_load_dword s2, s[4:5], 0x9 6668; GCN1-NEXT: s_waitcnt lgkmcnt(0) 6669; GCN1-NEXT: s_add_u32 s0, s0, 16 6670; GCN1-NEXT: s_addc_u32 s1, s1, 0 6671; GCN1-NEXT: v_mov_b32_e32 v0, s0 6672; GCN1-NEXT: v_mov_b32_e32 v1, s1 6673; GCN1-NEXT: v_mov_b32_e32 v2, s2 6674; GCN1-NEXT: flat_store_short v[0:1], v2 6675; GCN1-NEXT: s_endpgm 6676; 6677; GCN2-LABEL: atomic_store_f16_offset: 6678; GCN2: ; %bb.0: ; %entry 6679; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c 6680; GCN2-NEXT: s_load_dword s2, s[4:5], 0x24 6681; GCN2-NEXT: s_waitcnt lgkmcnt(0) 6682; GCN2-NEXT: s_add_u32 s0, s0, 16 6683; GCN2-NEXT: s_addc_u32 s1, s1, 0 6684; GCN2-NEXT: v_mov_b32_e32 v0, s0 6685; GCN2-NEXT: v_mov_b32_e32 v1, s1 6686; GCN2-NEXT: v_mov_b32_e32 v2, s2 6687; GCN2-NEXT: flat_store_short v[0:1], v2 6688; GCN2-NEXT: s_endpgm 6689; 6690; GCN3-LABEL: atomic_store_f16_offset: 6691; GCN3: ; %bb.0: ; %entry 6692; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c 6693; GCN3-NEXT: s_load_dword s2, s[4:5], 0x24 6694; GCN3-NEXT: s_waitcnt lgkmcnt(0) 6695; GCN3-NEXT: v_mov_b32_e32 v0, s0 6696; GCN3-NEXT: v_mov_b32_e32 v1, s1 6697; GCN3-NEXT: v_mov_b32_e32 v2, s2 6698; GCN3-NEXT: flat_store_short v[0:1], v2 offset:16 6699; GCN3-NEXT: s_endpgm 6700entry: 6701 %gep = getelementptr half, ptr %out, i64 8 6702 store atomic half %in, ptr %gep seq_cst, align 2 6703 ret void 6704} 6705 6706define amdgpu_kernel void @atomic_store_f16(half %in, ptr %out) { 6707; GCN1-LABEL: atomic_store_f16: 6708; GCN1: ; %bb.0: ; %entry 6709; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb 6710; GCN1-NEXT: s_load_dword s2, s[4:5], 0x9 6711; GCN1-NEXT: s_waitcnt lgkmcnt(0) 6712; GCN1-NEXT: v_mov_b32_e32 v0, s0 6713; GCN1-NEXT: v_mov_b32_e32 v1, s1 6714; 
GCN1-NEXT:    v_mov_b32_e32 v2, s2
; GCN1-NEXT:    flat_store_short v[0:1], v2
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_store_f16:
; GCN2:       ; %bb.0: ; %entry
; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2c
; GCN2-NEXT:    s_load_dword s2, s[4:5], 0x24
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
; GCN2-NEXT:    v_mov_b32_e32 v1, s1
; GCN2-NEXT:    v_mov_b32_e32 v2, s2
; GCN2-NEXT:    flat_store_short v[0:1], v2
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_store_f16:
; GCN3:       ; %bb.0: ; %entry
; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2c
; GCN3-NEXT:    s_load_dword s2, s[4:5], 0x24
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    v_mov_b32_e32 v0, s0
; GCN3-NEXT:    v_mov_b32_e32 v1, s1
; GCN3-NEXT:    v_mov_b32_e32 v2, s2
; GCN3-NEXT:    flat_store_short v[0:1], v2
; GCN3-NEXT:    s_endpgm
entry:
  store atomic half %in, ptr %out seq_cst, align 2
  ret void
}

; Seq_cst atomic bfloat store through a flat pointer at a constant offset of
; 8 bfloat elements (16 bytes) from %out. The store has to go through %gep so
; that the offset handling is really exercised: bonaire and tonga are expected
; to materialize the offset with scalar adds, while gfx900 is expected to fold
; it into the instruction's immediate offset, as in atomic_store_f16_offset.
define amdgpu_kernel void @atomic_store_bf16_offset(bfloat %in, ptr %out) {
; GCN1-LABEL: atomic_store_bf16_offset:
; GCN1:       ; %bb.0:
; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xb
; GCN1-NEXT:    s_load_dword s2, s[4:5], 0x9
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    s_add_u32 s0, s0, 16
; GCN1-NEXT:    s_addc_u32 s1, s1, 0
; GCN1-NEXT:    v_mov_b32_e32 v0, s0
; GCN1-NEXT:    v_mov_b32_e32 v1, s1
; GCN1-NEXT:    v_mov_b32_e32 v2, s2
; GCN1-NEXT:    flat_store_short v[0:1], v2
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_store_bf16_offset:
; GCN2:       ; %bb.0:
; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2c
; GCN2-NEXT:    s_load_dword s2, s[4:5], 0x24
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    s_add_u32 s0, s0, 16
; GCN2-NEXT:    s_addc_u32 s1, s1, 0
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
; GCN2-NEXT:    v_mov_b32_e32 v1, s1
; GCN2-NEXT:    v_mov_b32_e32 v2, s2
; GCN2-NEXT:    flat_store_short v[0:1], v2
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_store_bf16_offset:
; GCN3:       ; %bb.0:
; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2c
; GCN3-NEXT:    s_load_dword s2, s[4:5], 0x24
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    v_mov_b32_e32 v0, s0
; GCN3-NEXT:    v_mov_b32_e32 v1, s1
; GCN3-NEXT:    v_mov_b32_e32 v2, s2
; GCN3-NEXT:    flat_store_short v[0:1], v2 offset:16
; GCN3-NEXT:    s_endpgm
  %gep = getelementptr bfloat, ptr %out, i64 8
  store atomic bfloat %in, ptr %gep seq_cst, align 2 ; store via %gep, not %out
  ret void
}

; Seq_cst atomic bfloat store directly through %out (no offset).
define amdgpu_kernel void @atomic_store_bf16(bfloat %in, ptr %out) {
; GCN1-LABEL: atomic_store_bf16:
; GCN1:       ; %bb.0:
; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xb
; GCN1-NEXT:    s_load_dword s2, s[4:5], 0x9
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    v_mov_b32_e32 v0, s0
; GCN1-NEXT:    v_mov_b32_e32 v1, s1
; GCN1-NEXT:    v_mov_b32_e32 v2, s2
; GCN1-NEXT:    flat_store_short v[0:1], v2
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_store_bf16:
; GCN2:       ; %bb.0:
; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2c
; GCN2-NEXT:    s_load_dword s2, s[4:5], 0x24
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
; GCN2-NEXT:    v_mov_b32_e32 v1, s1
; GCN2-NEXT:    v_mov_b32_e32 v2, s2
; GCN2-NEXT:    flat_store_short v[0:1], v2
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_store_bf16:
; GCN3:       ; %bb.0:
; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2c
; GCN3-NEXT:    s_load_dword s2, s[4:5], 0x24
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    v_mov_b32_e32 v0, s0
; GCN3-NEXT:    v_mov_b32_e32 v1, s1
; GCN3-NEXT:    v_mov_b32_e32 v2, s2
; GCN3-NEXT:    flat_store_short v[0:1], v2
; GCN3-NEXT:    s_endpgm
  store atomic bfloat %in, ptr %out seq_cst, align 2
  ret void
}

; Volatile agent-scope seq_cst uinc_wrap atomicrmw on an i32 located 4 elements
; (16 bytes) past %out; the returned value is unused.
define amdgpu_kernel void @atomic_inc_i32_offset(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_inc_i32_offset:
; GCN1:       ; %bb.0: ; %entry
; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; GCN1-NEXT:    s_load_dword s2, s[4:5], 0xb
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    s_add_u32 s0, s0, 16
6826; GCN1-NEXT: s_addc_u32 s1, s1, 0 6827; GCN1-NEXT: v_mov_b32_e32 v0, s0 6828; GCN1-NEXT: v_mov_b32_e32 v1, s1 6829; GCN1-NEXT: v_mov_b32_e32 v2, s2 6830; GCN1-NEXT: flat_atomic_inc v[0:1], v2 6831; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6832; GCN1-NEXT: buffer_wbinvl1_vol 6833; GCN1-NEXT: s_endpgm 6834; 6835; GCN2-LABEL: atomic_inc_i32_offset: 6836; GCN2: ; %bb.0: ; %entry 6837; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 6838; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c 6839; GCN2-NEXT: s_waitcnt lgkmcnt(0) 6840; GCN2-NEXT: s_add_u32 s0, s0, 16 6841; GCN2-NEXT: s_addc_u32 s1, s1, 0 6842; GCN2-NEXT: v_mov_b32_e32 v0, s0 6843; GCN2-NEXT: v_mov_b32_e32 v1, s1 6844; GCN2-NEXT: v_mov_b32_e32 v2, s2 6845; GCN2-NEXT: flat_atomic_inc v[0:1], v2 6846; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6847; GCN2-NEXT: buffer_wbinvl1_vol 6848; GCN2-NEXT: s_endpgm 6849; 6850; GCN3-LABEL: atomic_inc_i32_offset: 6851; GCN3: ; %bb.0: ; %entry 6852; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 6853; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c 6854; GCN3-NEXT: s_waitcnt lgkmcnt(0) 6855; GCN3-NEXT: v_mov_b32_e32 v0, s0 6856; GCN3-NEXT: v_mov_b32_e32 v1, s1 6857; GCN3-NEXT: v_mov_b32_e32 v2, s2 6858; GCN3-NEXT: flat_atomic_inc v[0:1], v2 offset:16 6859; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6860; GCN3-NEXT: buffer_wbinvl1_vol 6861; GCN3-NEXT: s_endpgm 6862entry: 6863 %gep = getelementptr i32, ptr %out, i32 4 6864 %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst 6865 ret void 6866} 6867 6868define amdgpu_kernel void @atomic_inc_i32_max_offset(ptr %out, i32 %in) { 6869; GCN1-LABEL: atomic_inc_i32_max_offset: 6870; GCN1: ; %bb.0: ; %entry 6871; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 6872; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb 6873; GCN1-NEXT: s_waitcnt lgkmcnt(0) 6874; GCN1-NEXT: s_add_u32 s0, s0, 0xffc 6875; GCN1-NEXT: s_addc_u32 s1, s1, 0 6876; GCN1-NEXT: v_mov_b32_e32 v0, s0 6877; GCN1-NEXT: v_mov_b32_e32 v1, s1 6878; GCN1-NEXT: v_mov_b32_e32 
v2, s2 6879; GCN1-NEXT: flat_atomic_inc v[0:1], v2 6880; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6881; GCN1-NEXT: buffer_wbinvl1_vol 6882; GCN1-NEXT: s_endpgm 6883; 6884; GCN2-LABEL: atomic_inc_i32_max_offset: 6885; GCN2: ; %bb.0: ; %entry 6886; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 6887; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c 6888; GCN2-NEXT: s_waitcnt lgkmcnt(0) 6889; GCN2-NEXT: s_add_u32 s0, s0, 0xffc 6890; GCN2-NEXT: s_addc_u32 s1, s1, 0 6891; GCN2-NEXT: v_mov_b32_e32 v0, s0 6892; GCN2-NEXT: v_mov_b32_e32 v1, s1 6893; GCN2-NEXT: v_mov_b32_e32 v2, s2 6894; GCN2-NEXT: flat_atomic_inc v[0:1], v2 6895; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6896; GCN2-NEXT: buffer_wbinvl1_vol 6897; GCN2-NEXT: s_endpgm 6898; 6899; GCN3-LABEL: atomic_inc_i32_max_offset: 6900; GCN3: ; %bb.0: ; %entry 6901; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 6902; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c 6903; GCN3-NEXT: s_waitcnt lgkmcnt(0) 6904; GCN3-NEXT: v_mov_b32_e32 v0, s0 6905; GCN3-NEXT: v_mov_b32_e32 v1, s1 6906; GCN3-NEXT: v_mov_b32_e32 v2, s2 6907; GCN3-NEXT: flat_atomic_inc v[0:1], v2 offset:4092 6908; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6909; GCN3-NEXT: buffer_wbinvl1_vol 6910; GCN3-NEXT: s_endpgm 6911entry: 6912 %gep = getelementptr i32, ptr %out, i32 1023 6913 %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst 6914 ret void 6915} 6916 6917define amdgpu_kernel void @atomic_inc_i32_max_offset_p1(ptr %out, i32 %in) { 6918; GCN1-LABEL: atomic_inc_i32_max_offset_p1: 6919; GCN1: ; %bb.0: ; %entry 6920; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 6921; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb 6922; GCN1-NEXT: s_waitcnt lgkmcnt(0) 6923; GCN1-NEXT: s_add_u32 s0, s0, 0x1000 6924; GCN1-NEXT: s_addc_u32 s1, s1, 0 6925; GCN1-NEXT: v_mov_b32_e32 v0, s0 6926; GCN1-NEXT: v_mov_b32_e32 v1, s1 6927; GCN1-NEXT: v_mov_b32_e32 v2, s2 6928; GCN1-NEXT: flat_atomic_inc v[0:1], v2 6929; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6930; GCN1-NEXT: 
buffer_wbinvl1_vol 6931; GCN1-NEXT: s_endpgm 6932; 6933; GCN2-LABEL: atomic_inc_i32_max_offset_p1: 6934; GCN2: ; %bb.0: ; %entry 6935; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 6936; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c 6937; GCN2-NEXT: s_waitcnt lgkmcnt(0) 6938; GCN2-NEXT: s_add_u32 s0, s0, 0x1000 6939; GCN2-NEXT: s_addc_u32 s1, s1, 0 6940; GCN2-NEXT: v_mov_b32_e32 v0, s0 6941; GCN2-NEXT: v_mov_b32_e32 v1, s1 6942; GCN2-NEXT: v_mov_b32_e32 v2, s2 6943; GCN2-NEXT: flat_atomic_inc v[0:1], v2 6944; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6945; GCN2-NEXT: buffer_wbinvl1_vol 6946; GCN2-NEXT: s_endpgm 6947; 6948; GCN3-LABEL: atomic_inc_i32_max_offset_p1: 6949; GCN3: ; %bb.0: ; %entry 6950; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 6951; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c 6952; GCN3-NEXT: s_waitcnt lgkmcnt(0) 6953; GCN3-NEXT: v_mov_b32_e32 v0, s0 6954; GCN3-NEXT: v_mov_b32_e32 v1, s1 6955; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 6956; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 6957; GCN3-NEXT: v_mov_b32_e32 v2, s2 6958; GCN3-NEXT: flat_atomic_inc v[0:1], v2 6959; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6960; GCN3-NEXT: buffer_wbinvl1_vol 6961; GCN3-NEXT: s_endpgm 6962entry: 6963 %gep = getelementptr i32, ptr %out, i32 1024 6964 %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst 6965 ret void 6966} 6967 6968define amdgpu_kernel void @atomic_inc_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { 6969; GCN1-LABEL: atomic_inc_i32_ret_offset: 6970; GCN1: ; %bb.0: ; %entry 6971; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 6972; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd 6973; GCN1-NEXT: s_waitcnt lgkmcnt(0) 6974; GCN1-NEXT: s_add_u32 s0, s0, 16 6975; GCN1-NEXT: s_addc_u32 s1, s1, 0 6976; GCN1-NEXT: v_mov_b32_e32 v0, s0 6977; GCN1-NEXT: v_mov_b32_e32 v1, s1 6978; GCN1-NEXT: v_mov_b32_e32 v2, s4 6979; GCN1-NEXT: flat_atomic_inc v2, v[0:1], v2 glc 6980; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6981; GCN1-NEXT: 
buffer_wbinvl1_vol 6982; GCN1-NEXT: v_mov_b32_e32 v0, s2 6983; GCN1-NEXT: v_mov_b32_e32 v1, s3 6984; GCN1-NEXT: flat_store_dword v[0:1], v2 6985; GCN1-NEXT: s_endpgm 6986; 6987; GCN2-LABEL: atomic_inc_i32_ret_offset: 6988; GCN2: ; %bb.0: ; %entry 6989; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 6990; GCN2-NEXT: s_load_dword s4, s[4:5], 0x34 6991; GCN2-NEXT: s_waitcnt lgkmcnt(0) 6992; GCN2-NEXT: s_add_u32 s0, s0, 16 6993; GCN2-NEXT: s_addc_u32 s1, s1, 0 6994; GCN2-NEXT: v_mov_b32_e32 v0, s0 6995; GCN2-NEXT: v_mov_b32_e32 v1, s1 6996; GCN2-NEXT: v_mov_b32_e32 v2, s4 6997; GCN2-NEXT: flat_atomic_inc v2, v[0:1], v2 glc 6998; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6999; GCN2-NEXT: buffer_wbinvl1_vol 7000; GCN2-NEXT: v_mov_b32_e32 v0, s2 7001; GCN2-NEXT: v_mov_b32_e32 v1, s3 7002; GCN2-NEXT: flat_store_dword v[0:1], v2 7003; GCN2-NEXT: s_endpgm 7004; 7005; GCN3-LABEL: atomic_inc_i32_ret_offset: 7006; GCN3: ; %bb.0: ; %entry 7007; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 7008; GCN3-NEXT: s_load_dword s6, s[4:5], 0x34 7009; GCN3-NEXT: s_waitcnt lgkmcnt(0) 7010; GCN3-NEXT: v_mov_b32_e32 v0, s0 7011; GCN3-NEXT: v_mov_b32_e32 v1, s1 7012; GCN3-NEXT: v_mov_b32_e32 v2, s6 7013; GCN3-NEXT: flat_atomic_inc v2, v[0:1], v2 offset:16 glc 7014; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7015; GCN3-NEXT: buffer_wbinvl1_vol 7016; GCN3-NEXT: v_mov_b32_e32 v0, s2 7017; GCN3-NEXT: v_mov_b32_e32 v1, s3 7018; GCN3-NEXT: flat_store_dword v[0:1], v2 7019; GCN3-NEXT: s_endpgm 7020entry: 7021 %gep = getelementptr i32, ptr %out, i32 4 7022 %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst 7023 store i32 %val, ptr %out2 7024 ret void 7025} 7026 7027define amdgpu_kernel void @atomic_inc_i32_incr64_offset(ptr %out, i32 %in, i64 %index) { 7028; GCN1-LABEL: atomic_inc_i32_incr64_offset: 7029; GCN1: ; %bb.0: ; %entry 7030; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 7031; GCN1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9 7032; GCN1-NEXT: s_load_dword s4, s[4:5], 
0xb 7033; GCN1-NEXT: s_waitcnt lgkmcnt(0) 7034; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 7035; GCN1-NEXT: s_add_u32 s0, s2, s0 7036; GCN1-NEXT: s_addc_u32 s1, s3, s1 7037; GCN1-NEXT: s_add_u32 s0, s0, 16 7038; GCN1-NEXT: s_addc_u32 s1, s1, 0 7039; GCN1-NEXT: v_mov_b32_e32 v0, s0 7040; GCN1-NEXT: v_mov_b32_e32 v1, s1 7041; GCN1-NEXT: v_mov_b32_e32 v2, s4 7042; GCN1-NEXT: flat_atomic_inc v[0:1], v2 7043; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7044; GCN1-NEXT: buffer_wbinvl1_vol 7045; GCN1-NEXT: s_endpgm 7046; 7047; GCN2-LABEL: atomic_inc_i32_incr64_offset: 7048; GCN2: ; %bb.0: ; %entry 7049; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 7050; GCN2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 7051; GCN2-NEXT: s_load_dword s4, s[4:5], 0x2c 7052; GCN2-NEXT: s_waitcnt lgkmcnt(0) 7053; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 7054; GCN2-NEXT: s_add_u32 s0, s2, s0 7055; GCN2-NEXT: s_addc_u32 s1, s3, s1 7056; GCN2-NEXT: s_add_u32 s0, s0, 16 7057; GCN2-NEXT: s_addc_u32 s1, s1, 0 7058; GCN2-NEXT: v_mov_b32_e32 v0, s0 7059; GCN2-NEXT: v_mov_b32_e32 v1, s1 7060; GCN2-NEXT: v_mov_b32_e32 v2, s4 7061; GCN2-NEXT: flat_atomic_inc v[0:1], v2 7062; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7063; GCN2-NEXT: buffer_wbinvl1_vol 7064; GCN2-NEXT: s_endpgm 7065; 7066; GCN3-LABEL: atomic_inc_i32_incr64_offset: 7067; GCN3: ; %bb.0: ; %entry 7068; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 7069; GCN3-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 7070; GCN3-NEXT: s_load_dword s6, s[4:5], 0x2c 7071; GCN3-NEXT: s_waitcnt lgkmcnt(0) 7072; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 7073; GCN3-NEXT: s_add_u32 s0, s2, s0 7074; GCN3-NEXT: s_addc_u32 s1, s3, s1 7075; GCN3-NEXT: v_mov_b32_e32 v0, s0 7076; GCN3-NEXT: v_mov_b32_e32 v1, s1 7077; GCN3-NEXT: v_mov_b32_e32 v2, s6 7078; GCN3-NEXT: flat_atomic_inc v[0:1], v2 offset:16 7079; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7080; GCN3-NEXT: buffer_wbinvl1_vol 7081; GCN3-NEXT: s_endpgm 7082entry: 7083 %ptr = getelementptr i32, ptr %out, i64 %index 7084 %gep = 
getelementptr i32, ptr %ptr, i32 4 7085 %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst 7086 ret void 7087} 7088 7089define amdgpu_kernel void @atomic_inc_i32_ret_incr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { 7090; GCN1-LABEL: atomic_inc_i32_ret_incr64_offset: 7091; GCN1: ; %bb.0: ; %entry 7092; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xf 7093; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 7094; GCN1-NEXT: s_load_dword s8, s[4:5], 0xd 7095; GCN1-NEXT: s_waitcnt lgkmcnt(0) 7096; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 7097; GCN1-NEXT: s_add_u32 s0, s0, s4 7098; GCN1-NEXT: s_addc_u32 s1, s1, s5 7099; GCN1-NEXT: s_add_u32 s0, s0, 16 7100; GCN1-NEXT: s_addc_u32 s1, s1, 0 7101; GCN1-NEXT: v_mov_b32_e32 v0, s0 7102; GCN1-NEXT: v_mov_b32_e32 v1, s1 7103; GCN1-NEXT: v_mov_b32_e32 v2, s8 7104; GCN1-NEXT: flat_atomic_inc v2, v[0:1], v2 glc 7105; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7106; GCN1-NEXT: buffer_wbinvl1_vol 7107; GCN1-NEXT: v_mov_b32_e32 v0, s2 7108; GCN1-NEXT: v_mov_b32_e32 v1, s3 7109; GCN1-NEXT: flat_store_dword v[0:1], v2 7110; GCN1-NEXT: s_endpgm 7111; 7112; GCN2-LABEL: atomic_inc_i32_ret_incr64_offset: 7113; GCN2: ; %bb.0: ; %entry 7114; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c 7115; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 7116; GCN2-NEXT: s_load_dword s8, s[4:5], 0x34 7117; GCN2-NEXT: s_waitcnt lgkmcnt(0) 7118; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 7119; GCN2-NEXT: s_add_u32 s0, s0, s4 7120; GCN2-NEXT: s_addc_u32 s1, s1, s5 7121; GCN2-NEXT: s_add_u32 s0, s0, 16 7122; GCN2-NEXT: s_addc_u32 s1, s1, 0 7123; GCN2-NEXT: v_mov_b32_e32 v0, s0 7124; GCN2-NEXT: v_mov_b32_e32 v1, s1 7125; GCN2-NEXT: v_mov_b32_e32 v2, s8 7126; GCN2-NEXT: flat_atomic_inc v2, v[0:1], v2 glc 7127; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7128; GCN2-NEXT: buffer_wbinvl1_vol 7129; GCN2-NEXT: v_mov_b32_e32 v0, s2 7130; GCN2-NEXT: v_mov_b32_e32 v1, s3 7131; GCN2-NEXT: flat_store_dword v[0:1], v2 7132; GCN2-NEXT: s_endpgm 7133; 
7134; GCN3-LABEL: atomic_inc_i32_ret_incr64_offset: 7135; GCN3: ; %bb.0: ; %entry 7136; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c 7137; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 7138; GCN3-NEXT: s_load_dword s8, s[4:5], 0x34 7139; GCN3-NEXT: s_waitcnt lgkmcnt(0) 7140; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 7141; GCN3-NEXT: s_add_u32 s0, s0, s4 7142; GCN3-NEXT: s_addc_u32 s1, s1, s5 7143; GCN3-NEXT: v_mov_b32_e32 v0, s0 7144; GCN3-NEXT: v_mov_b32_e32 v1, s1 7145; GCN3-NEXT: v_mov_b32_e32 v2, s8 7146; GCN3-NEXT: flat_atomic_inc v2, v[0:1], v2 offset:16 glc 7147; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7148; GCN3-NEXT: buffer_wbinvl1_vol 7149; GCN3-NEXT: v_mov_b32_e32 v0, s2 7150; GCN3-NEXT: v_mov_b32_e32 v1, s3 7151; GCN3-NEXT: flat_store_dword v[0:1], v2 7152; GCN3-NEXT: s_endpgm 7153entry: 7154 %ptr = getelementptr i32, ptr %out, i64 %index 7155 %gep = getelementptr i32, ptr %ptr, i32 4 7156 %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst 7157 store i32 %val, ptr %out2 7158 ret void 7159} 7160 7161define amdgpu_kernel void @atomic_inc_i32(ptr %out, i32 %in) { 7162; GCN1-LABEL: atomic_inc_i32: 7163; GCN1: ; %bb.0: ; %entry 7164; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 7165; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb 7166; GCN1-NEXT: s_waitcnt lgkmcnt(0) 7167; GCN1-NEXT: v_mov_b32_e32 v0, s0 7168; GCN1-NEXT: v_mov_b32_e32 v1, s1 7169; GCN1-NEXT: v_mov_b32_e32 v2, s2 7170; GCN1-NEXT: flat_atomic_inc v[0:1], v2 7171; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7172; GCN1-NEXT: buffer_wbinvl1_vol 7173; GCN1-NEXT: s_endpgm 7174; 7175; GCN2-LABEL: atomic_inc_i32: 7176; GCN2: ; %bb.0: ; %entry 7177; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 7178; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c 7179; GCN2-NEXT: s_waitcnt lgkmcnt(0) 7180; GCN2-NEXT: v_mov_b32_e32 v0, s0 7181; GCN2-NEXT: v_mov_b32_e32 v1, s1 7182; GCN2-NEXT: v_mov_b32_e32 v2, s2 7183; GCN2-NEXT: flat_atomic_inc v[0:1], v2 7184; GCN2-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) 7185; GCN2-NEXT: buffer_wbinvl1_vol 7186; GCN2-NEXT: s_endpgm 7187; 7188; GCN3-LABEL: atomic_inc_i32: 7189; GCN3: ; %bb.0: ; %entry 7190; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 7191; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c 7192; GCN3-NEXT: s_waitcnt lgkmcnt(0) 7193; GCN3-NEXT: v_mov_b32_e32 v0, s0 7194; GCN3-NEXT: v_mov_b32_e32 v1, s1 7195; GCN3-NEXT: v_mov_b32_e32 v2, s2 7196; GCN3-NEXT: flat_atomic_inc v[0:1], v2 7197; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7198; GCN3-NEXT: buffer_wbinvl1_vol 7199; GCN3-NEXT: s_endpgm 7200entry: 7201 %val = atomicrmw volatile uinc_wrap ptr %out, i32 %in syncscope("agent") seq_cst 7202 ret void 7203} 7204 7205define amdgpu_kernel void @atomic_inc_i32_ret(ptr %out, ptr %out2, i32 %in) { 7206; GCN1-LABEL: atomic_inc_i32_ret: 7207; GCN1: ; %bb.0: ; %entry 7208; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 7209; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd 7210; GCN1-NEXT: s_waitcnt lgkmcnt(0) 7211; GCN1-NEXT: v_mov_b32_e32 v0, s0 7212; GCN1-NEXT: v_mov_b32_e32 v1, s1 7213; GCN1-NEXT: v_mov_b32_e32 v2, s4 7214; GCN1-NEXT: flat_atomic_inc v2, v[0:1], v2 glc 7215; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7216; GCN1-NEXT: buffer_wbinvl1_vol 7217; GCN1-NEXT: v_mov_b32_e32 v0, s2 7218; GCN1-NEXT: v_mov_b32_e32 v1, s3 7219; GCN1-NEXT: flat_store_dword v[0:1], v2 7220; GCN1-NEXT: s_endpgm 7221; 7222; GCN2-LABEL: atomic_inc_i32_ret: 7223; GCN2: ; %bb.0: ; %entry 7224; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 7225; GCN2-NEXT: s_load_dword s4, s[4:5], 0x34 7226; GCN2-NEXT: s_waitcnt lgkmcnt(0) 7227; GCN2-NEXT: v_mov_b32_e32 v0, s0 7228; GCN2-NEXT: v_mov_b32_e32 v1, s1 7229; GCN2-NEXT: v_mov_b32_e32 v2, s4 7230; GCN2-NEXT: flat_atomic_inc v2, v[0:1], v2 glc 7231; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7232; GCN2-NEXT: buffer_wbinvl1_vol 7233; GCN2-NEXT: v_mov_b32_e32 v0, s2 7234; GCN2-NEXT: v_mov_b32_e32 v1, s3 7235; GCN2-NEXT: flat_store_dword v[0:1], v2 7236; GCN2-NEXT: s_endpgm 7237; 7238; GCN3-LABEL: 
atomic_inc_i32_ret: 7239; GCN3: ; %bb.0: ; %entry 7240; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 7241; GCN3-NEXT: s_load_dword s6, s[4:5], 0x34 7242; GCN3-NEXT: s_waitcnt lgkmcnt(0) 7243; GCN3-NEXT: v_mov_b32_e32 v0, s0 7244; GCN3-NEXT: v_mov_b32_e32 v1, s1 7245; GCN3-NEXT: v_mov_b32_e32 v2, s6 7246; GCN3-NEXT: flat_atomic_inc v2, v[0:1], v2 glc 7247; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7248; GCN3-NEXT: buffer_wbinvl1_vol 7249; GCN3-NEXT: v_mov_b32_e32 v0, s2 7250; GCN3-NEXT: v_mov_b32_e32 v1, s3 7251; GCN3-NEXT: flat_store_dword v[0:1], v2 7252; GCN3-NEXT: s_endpgm 7253entry: 7254 %val = atomicrmw volatile uinc_wrap ptr %out, i32 %in syncscope("agent") seq_cst 7255 store i32 %val, ptr %out2 7256 ret void 7257} 7258 7259define amdgpu_kernel void @atomic_inc_i32_incr64(ptr %out, i32 %in, i64 %index) { 7260; GCN1-LABEL: atomic_inc_i32_incr64: 7261; GCN1: ; %bb.0: ; %entry 7262; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 7263; GCN1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9 7264; GCN1-NEXT: s_load_dword s4, s[4:5], 0xb 7265; GCN1-NEXT: s_waitcnt lgkmcnt(0) 7266; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 7267; GCN1-NEXT: s_add_u32 s0, s2, s0 7268; GCN1-NEXT: s_addc_u32 s1, s3, s1 7269; GCN1-NEXT: v_mov_b32_e32 v0, s0 7270; GCN1-NEXT: v_mov_b32_e32 v1, s1 7271; GCN1-NEXT: v_mov_b32_e32 v2, s4 7272; GCN1-NEXT: flat_atomic_inc v[0:1], v2 7273; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7274; GCN1-NEXT: buffer_wbinvl1_vol 7275; GCN1-NEXT: s_endpgm 7276; 7277; GCN2-LABEL: atomic_inc_i32_incr64: 7278; GCN2: ; %bb.0: ; %entry 7279; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 7280; GCN2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 7281; GCN2-NEXT: s_load_dword s4, s[4:5], 0x2c 7282; GCN2-NEXT: s_waitcnt lgkmcnt(0) 7283; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 7284; GCN2-NEXT: s_add_u32 s0, s2, s0 7285; GCN2-NEXT: s_addc_u32 s1, s3, s1 7286; GCN2-NEXT: v_mov_b32_e32 v0, s0 7287; GCN2-NEXT: v_mov_b32_e32 v1, s1 7288; GCN2-NEXT: v_mov_b32_e32 v2, s4 7289; GCN2-NEXT: 
flat_atomic_inc v[0:1], v2 7290; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7291; GCN2-NEXT: buffer_wbinvl1_vol 7292; GCN2-NEXT: s_endpgm 7293; 7294; GCN3-LABEL: atomic_inc_i32_incr64: 7295; GCN3: ; %bb.0: ; %entry 7296; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 7297; GCN3-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 7298; GCN3-NEXT: s_load_dword s6, s[4:5], 0x2c 7299; GCN3-NEXT: s_waitcnt lgkmcnt(0) 7300; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 7301; GCN3-NEXT: s_add_u32 s0, s2, s0 7302; GCN3-NEXT: s_addc_u32 s1, s3, s1 7303; GCN3-NEXT: v_mov_b32_e32 v0, s0 7304; GCN3-NEXT: v_mov_b32_e32 v1, s1 7305; GCN3-NEXT: v_mov_b32_e32 v2, s6 7306; GCN3-NEXT: flat_atomic_inc v[0:1], v2 7307; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7308; GCN3-NEXT: buffer_wbinvl1_vol 7309; GCN3-NEXT: s_endpgm 7310entry: 7311 %ptr = getelementptr i32, ptr %out, i64 %index 7312 %val = atomicrmw volatile uinc_wrap ptr %ptr, i32 %in syncscope("agent") seq_cst 7313 ret void 7314} 7315 7316define amdgpu_kernel void @atomic_inc_i32_ret_incr64(ptr %out, ptr %out2, i32 %in, i64 %index) { 7317; GCN1-LABEL: atomic_inc_i32_ret_incr64: 7318; GCN1: ; %bb.0: ; %entry 7319; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xf 7320; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 7321; GCN1-NEXT: s_load_dword s8, s[4:5], 0xd 7322; GCN1-NEXT: s_waitcnt lgkmcnt(0) 7323; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 7324; GCN1-NEXT: s_add_u32 s0, s0, s4 7325; GCN1-NEXT: s_addc_u32 s1, s1, s5 7326; GCN1-NEXT: v_mov_b32_e32 v0, s0 7327; GCN1-NEXT: v_mov_b32_e32 v1, s1 7328; GCN1-NEXT: v_mov_b32_e32 v2, s8 7329; GCN1-NEXT: flat_atomic_inc v2, v[0:1], v2 glc 7330; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7331; GCN1-NEXT: buffer_wbinvl1_vol 7332; GCN1-NEXT: v_mov_b32_e32 v0, s2 7333; GCN1-NEXT: v_mov_b32_e32 v1, s3 7334; GCN1-NEXT: flat_store_dword v[0:1], v2 7335; GCN1-NEXT: s_endpgm 7336; 7337; GCN2-LABEL: atomic_inc_i32_ret_incr64: 7338; GCN2: ; %bb.0: ; %entry 7339; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c 7340; 
GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 7341; GCN2-NEXT: s_load_dword s8, s[4:5], 0x34 7342; GCN2-NEXT: s_waitcnt lgkmcnt(0) 7343; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 7344; GCN2-NEXT: s_add_u32 s0, s0, s4 7345; GCN2-NEXT: s_addc_u32 s1, s1, s5 7346; GCN2-NEXT: v_mov_b32_e32 v0, s0 7347; GCN2-NEXT: v_mov_b32_e32 v1, s1 7348; GCN2-NEXT: v_mov_b32_e32 v2, s8 7349; GCN2-NEXT: flat_atomic_inc v2, v[0:1], v2 glc 7350; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7351; GCN2-NEXT: buffer_wbinvl1_vol 7352; GCN2-NEXT: v_mov_b32_e32 v0, s2 7353; GCN2-NEXT: v_mov_b32_e32 v1, s3 7354; GCN2-NEXT: flat_store_dword v[0:1], v2 7355; GCN2-NEXT: s_endpgm 7356; 7357; GCN3-LABEL: atomic_inc_i32_ret_incr64: 7358; GCN3: ; %bb.0: ; %entry 7359; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c 7360; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 7361; GCN3-NEXT: s_load_dword s8, s[4:5], 0x34 7362; GCN3-NEXT: s_waitcnt lgkmcnt(0) 7363; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 7364; GCN3-NEXT: s_add_u32 s0, s0, s4 7365; GCN3-NEXT: s_addc_u32 s1, s1, s5 7366; GCN3-NEXT: v_mov_b32_e32 v0, s0 7367; GCN3-NEXT: v_mov_b32_e32 v1, s1 7368; GCN3-NEXT: v_mov_b32_e32 v2, s8 7369; GCN3-NEXT: flat_atomic_inc v2, v[0:1], v2 glc 7370; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7371; GCN3-NEXT: buffer_wbinvl1_vol 7372; GCN3-NEXT: v_mov_b32_e32 v0, s2 7373; GCN3-NEXT: v_mov_b32_e32 v1, s3 7374; GCN3-NEXT: flat_store_dword v[0:1], v2 7375; GCN3-NEXT: s_endpgm 7376entry: 7377 %ptr = getelementptr i32, ptr %out, i64 %index 7378 %val = atomicrmw volatile uinc_wrap ptr %ptr, i32 %in syncscope("agent") seq_cst 7379 store i32 %val, ptr %out2 7380 ret void 7381} 7382 7383define amdgpu_kernel void @atomic_dec_i32_offset(ptr %out, i32 %in) { 7384; GCN1-LABEL: atomic_dec_i32_offset: 7385; GCN1: ; %bb.0: ; %entry 7386; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 7387; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb 7388; GCN1-NEXT: s_waitcnt lgkmcnt(0) 7389; GCN1-NEXT: s_add_u32 s0, s0, 16 7390; GCN1-NEXT: s_addc_u32 s1, 
s1, 0 7391; GCN1-NEXT: v_mov_b32_e32 v0, s0 7392; GCN1-NEXT: v_mov_b32_e32 v1, s1 7393; GCN1-NEXT: v_mov_b32_e32 v2, s2 7394; GCN1-NEXT: flat_atomic_dec v[0:1], v2 7395; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7396; GCN1-NEXT: buffer_wbinvl1_vol 7397; GCN1-NEXT: s_endpgm 7398; 7399; GCN2-LABEL: atomic_dec_i32_offset: 7400; GCN2: ; %bb.0: ; %entry 7401; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 7402; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c 7403; GCN2-NEXT: s_waitcnt lgkmcnt(0) 7404; GCN2-NEXT: s_add_u32 s0, s0, 16 7405; GCN2-NEXT: s_addc_u32 s1, s1, 0 7406; GCN2-NEXT: v_mov_b32_e32 v0, s0 7407; GCN2-NEXT: v_mov_b32_e32 v1, s1 7408; GCN2-NEXT: v_mov_b32_e32 v2, s2 7409; GCN2-NEXT: flat_atomic_dec v[0:1], v2 7410; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7411; GCN2-NEXT: buffer_wbinvl1_vol 7412; GCN2-NEXT: s_endpgm 7413; 7414; GCN3-LABEL: atomic_dec_i32_offset: 7415; GCN3: ; %bb.0: ; %entry 7416; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 7417; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c 7418; GCN3-NEXT: s_waitcnt lgkmcnt(0) 7419; GCN3-NEXT: v_mov_b32_e32 v0, s0 7420; GCN3-NEXT: v_mov_b32_e32 v1, s1 7421; GCN3-NEXT: v_mov_b32_e32 v2, s2 7422; GCN3-NEXT: flat_atomic_dec v[0:1], v2 offset:16 7423; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7424; GCN3-NEXT: buffer_wbinvl1_vol 7425; GCN3-NEXT: s_endpgm 7426entry: 7427 %gep = getelementptr i32, ptr %out, i32 4 7428 %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst 7429 ret void 7430} 7431 7432define amdgpu_kernel void @atomic_dec_i32_max_offset(ptr %out, i32 %in) { 7433; GCN1-LABEL: atomic_dec_i32_max_offset: 7434; GCN1: ; %bb.0: ; %entry 7435; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 7436; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb 7437; GCN1-NEXT: s_waitcnt lgkmcnt(0) 7438; GCN1-NEXT: s_add_u32 s0, s0, 0xffc 7439; GCN1-NEXT: s_addc_u32 s1, s1, 0 7440; GCN1-NEXT: v_mov_b32_e32 v0, s0 7441; GCN1-NEXT: v_mov_b32_e32 v1, s1 7442; GCN1-NEXT: v_mov_b32_e32 v2, s2 7443; GCN1-NEXT: 
flat_atomic_dec v[0:1], v2 7444; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7445; GCN1-NEXT: buffer_wbinvl1_vol 7446; GCN1-NEXT: s_endpgm 7447; 7448; GCN2-LABEL: atomic_dec_i32_max_offset: 7449; GCN2: ; %bb.0: ; %entry 7450; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 7451; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c 7452; GCN2-NEXT: s_waitcnt lgkmcnt(0) 7453; GCN2-NEXT: s_add_u32 s0, s0, 0xffc 7454; GCN2-NEXT: s_addc_u32 s1, s1, 0 7455; GCN2-NEXT: v_mov_b32_e32 v0, s0 7456; GCN2-NEXT: v_mov_b32_e32 v1, s1 7457; GCN2-NEXT: v_mov_b32_e32 v2, s2 7458; GCN2-NEXT: flat_atomic_dec v[0:1], v2 7459; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7460; GCN2-NEXT: buffer_wbinvl1_vol 7461; GCN2-NEXT: s_endpgm 7462; 7463; GCN3-LABEL: atomic_dec_i32_max_offset: 7464; GCN3: ; %bb.0: ; %entry 7465; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 7466; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c 7467; GCN3-NEXT: s_waitcnt lgkmcnt(0) 7468; GCN3-NEXT: v_mov_b32_e32 v0, s0 7469; GCN3-NEXT: v_mov_b32_e32 v1, s1 7470; GCN3-NEXT: v_mov_b32_e32 v2, s2 7471; GCN3-NEXT: flat_atomic_dec v[0:1], v2 offset:4092 7472; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7473; GCN3-NEXT: buffer_wbinvl1_vol 7474; GCN3-NEXT: s_endpgm 7475entry: 7476 %gep = getelementptr i32, ptr %out, i32 1023 7477 %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst 7478 ret void 7479} 7480 7481define amdgpu_kernel void @atomic_dec_i32_max_offset_p1(ptr %out, i32 %in) { 7482; GCN1-LABEL: atomic_dec_i32_max_offset_p1: 7483; GCN1: ; %bb.0: ; %entry 7484; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 7485; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb 7486; GCN1-NEXT: s_waitcnt lgkmcnt(0) 7487; GCN1-NEXT: s_add_u32 s0, s0, 0x1000 7488; GCN1-NEXT: s_addc_u32 s1, s1, 0 7489; GCN1-NEXT: v_mov_b32_e32 v0, s0 7490; GCN1-NEXT: v_mov_b32_e32 v1, s1 7491; GCN1-NEXT: v_mov_b32_e32 v2, s2 7492; GCN1-NEXT: flat_atomic_dec v[0:1], v2 7493; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7494; GCN1-NEXT: buffer_wbinvl1_vol 7495; 
GCN1-NEXT: s_endpgm 7496; 7497; GCN2-LABEL: atomic_dec_i32_max_offset_p1: 7498; GCN2: ; %bb.0: ; %entry 7499; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 7500; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c 7501; GCN2-NEXT: s_waitcnt lgkmcnt(0) 7502; GCN2-NEXT: s_add_u32 s0, s0, 0x1000 7503; GCN2-NEXT: s_addc_u32 s1, s1, 0 7504; GCN2-NEXT: v_mov_b32_e32 v0, s0 7505; GCN2-NEXT: v_mov_b32_e32 v1, s1 7506; GCN2-NEXT: v_mov_b32_e32 v2, s2 7507; GCN2-NEXT: flat_atomic_dec v[0:1], v2 7508; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7509; GCN2-NEXT: buffer_wbinvl1_vol 7510; GCN2-NEXT: s_endpgm 7511; 7512; GCN3-LABEL: atomic_dec_i32_max_offset_p1: 7513; GCN3: ; %bb.0: ; %entry 7514; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 7515; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c 7516; GCN3-NEXT: s_waitcnt lgkmcnt(0) 7517; GCN3-NEXT: v_mov_b32_e32 v0, s0 7518; GCN3-NEXT: v_mov_b32_e32 v1, s1 7519; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 7520; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 7521; GCN3-NEXT: v_mov_b32_e32 v2, s2 7522; GCN3-NEXT: flat_atomic_dec v[0:1], v2 7523; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7524; GCN3-NEXT: buffer_wbinvl1_vol 7525; GCN3-NEXT: s_endpgm 7526entry: 7527 %gep = getelementptr i32, ptr %out, i32 1024 7528 %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst 7529 ret void 7530} 7531 7532define amdgpu_kernel void @atomic_dec_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { 7533; GCN1-LABEL: atomic_dec_i32_ret_offset: 7534; GCN1: ; %bb.0: ; %entry 7535; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 7536; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd 7537; GCN1-NEXT: s_waitcnt lgkmcnt(0) 7538; GCN1-NEXT: s_add_u32 s0, s0, 16 7539; GCN1-NEXT: s_addc_u32 s1, s1, 0 7540; GCN1-NEXT: v_mov_b32_e32 v0, s0 7541; GCN1-NEXT: v_mov_b32_e32 v1, s1 7542; GCN1-NEXT: v_mov_b32_e32 v2, s4 7543; GCN1-NEXT: flat_atomic_dec v2, v[0:1], v2 glc 7544; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7545; GCN1-NEXT: buffer_wbinvl1_vol 7546; 
GCN1-NEXT: v_mov_b32_e32 v0, s2 7547; GCN1-NEXT: v_mov_b32_e32 v1, s3 7548; GCN1-NEXT: flat_store_dword v[0:1], v2 7549; GCN1-NEXT: s_endpgm 7550; 7551; GCN2-LABEL: atomic_dec_i32_ret_offset: 7552; GCN2: ; %bb.0: ; %entry 7553; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 7554; GCN2-NEXT: s_load_dword s4, s[4:5], 0x34 7555; GCN2-NEXT: s_waitcnt lgkmcnt(0) 7556; GCN2-NEXT: s_add_u32 s0, s0, 16 7557; GCN2-NEXT: s_addc_u32 s1, s1, 0 7558; GCN2-NEXT: v_mov_b32_e32 v0, s0 7559; GCN2-NEXT: v_mov_b32_e32 v1, s1 7560; GCN2-NEXT: v_mov_b32_e32 v2, s4 7561; GCN2-NEXT: flat_atomic_dec v2, v[0:1], v2 glc 7562; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7563; GCN2-NEXT: buffer_wbinvl1_vol 7564; GCN2-NEXT: v_mov_b32_e32 v0, s2 7565; GCN2-NEXT: v_mov_b32_e32 v1, s3 7566; GCN2-NEXT: flat_store_dword v[0:1], v2 7567; GCN2-NEXT: s_endpgm 7568; 7569; GCN3-LABEL: atomic_dec_i32_ret_offset: 7570; GCN3: ; %bb.0: ; %entry 7571; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 7572; GCN3-NEXT: s_load_dword s6, s[4:5], 0x34 7573; GCN3-NEXT: s_waitcnt lgkmcnt(0) 7574; GCN3-NEXT: v_mov_b32_e32 v0, s0 7575; GCN3-NEXT: v_mov_b32_e32 v1, s1 7576; GCN3-NEXT: v_mov_b32_e32 v2, s6 7577; GCN3-NEXT: flat_atomic_dec v2, v[0:1], v2 offset:16 glc 7578; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7579; GCN3-NEXT: buffer_wbinvl1_vol 7580; GCN3-NEXT: v_mov_b32_e32 v0, s2 7581; GCN3-NEXT: v_mov_b32_e32 v1, s3 7582; GCN3-NEXT: flat_store_dword v[0:1], v2 7583; GCN3-NEXT: s_endpgm 7584entry: 7585 %gep = getelementptr i32, ptr %out, i32 4 7586 %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst 7587 store i32 %val, ptr %out2 7588 ret void 7589} 7590 7591define amdgpu_kernel void @atomic_dec_i32_decr64_offset(ptr %out, i32 %in, i64 %index) { 7592; GCN1-LABEL: atomic_dec_i32_decr64_offset: 7593; GCN1: ; %bb.0: ; %entry 7594; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 7595; GCN1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9 7596; GCN1-NEXT: s_load_dword s4, s[4:5], 0xb 7597; GCN1-NEXT: 
s_waitcnt lgkmcnt(0) 7598; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 7599; GCN1-NEXT: s_add_u32 s0, s2, s0 7600; GCN1-NEXT: s_addc_u32 s1, s3, s1 7601; GCN1-NEXT: s_add_u32 s0, s0, 16 7602; GCN1-NEXT: s_addc_u32 s1, s1, 0 7603; GCN1-NEXT: v_mov_b32_e32 v0, s0 7604; GCN1-NEXT: v_mov_b32_e32 v1, s1 7605; GCN1-NEXT: v_mov_b32_e32 v2, s4 7606; GCN1-NEXT: flat_atomic_dec v[0:1], v2 7607; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7608; GCN1-NEXT: buffer_wbinvl1_vol 7609; GCN1-NEXT: s_endpgm 7610; 7611; GCN2-LABEL: atomic_dec_i32_decr64_offset: 7612; GCN2: ; %bb.0: ; %entry 7613; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 7614; GCN2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 7615; GCN2-NEXT: s_load_dword s4, s[4:5], 0x2c 7616; GCN2-NEXT: s_waitcnt lgkmcnt(0) 7617; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 7618; GCN2-NEXT: s_add_u32 s0, s2, s0 7619; GCN2-NEXT: s_addc_u32 s1, s3, s1 7620; GCN2-NEXT: s_add_u32 s0, s0, 16 7621; GCN2-NEXT: s_addc_u32 s1, s1, 0 7622; GCN2-NEXT: v_mov_b32_e32 v0, s0 7623; GCN2-NEXT: v_mov_b32_e32 v1, s1 7624; GCN2-NEXT: v_mov_b32_e32 v2, s4 7625; GCN2-NEXT: flat_atomic_dec v[0:1], v2 7626; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7627; GCN2-NEXT: buffer_wbinvl1_vol 7628; GCN2-NEXT: s_endpgm 7629; 7630; GCN3-LABEL: atomic_dec_i32_decr64_offset: 7631; GCN3: ; %bb.0: ; %entry 7632; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 7633; GCN3-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 7634; GCN3-NEXT: s_load_dword s6, s[4:5], 0x2c 7635; GCN3-NEXT: s_waitcnt lgkmcnt(0) 7636; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 7637; GCN3-NEXT: s_add_u32 s0, s2, s0 7638; GCN3-NEXT: s_addc_u32 s1, s3, s1 7639; GCN3-NEXT: v_mov_b32_e32 v0, s0 7640; GCN3-NEXT: v_mov_b32_e32 v1, s1 7641; GCN3-NEXT: v_mov_b32_e32 v2, s6 7642; GCN3-NEXT: flat_atomic_dec v[0:1], v2 offset:16 7643; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7644; GCN3-NEXT: buffer_wbinvl1_vol 7645; GCN3-NEXT: s_endpgm 7646entry: 7647 %ptr = getelementptr i32, ptr %out, i64 %index 7648 %gep = getelementptr i32, 
ptr %ptr, i32 4 7649 %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst 7650 ret void 7651} 7652 7653define amdgpu_kernel void @atomic_dec_i32_ret_decr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { 7654; GCN1-LABEL: atomic_dec_i32_ret_decr64_offset: 7655; GCN1: ; %bb.0: ; %entry 7656; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xf 7657; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 7658; GCN1-NEXT: s_load_dword s8, s[4:5], 0xd 7659; GCN1-NEXT: s_waitcnt lgkmcnt(0) 7660; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 7661; GCN1-NEXT: s_add_u32 s0, s0, s4 7662; GCN1-NEXT: s_addc_u32 s1, s1, s5 7663; GCN1-NEXT: s_add_u32 s0, s0, 16 7664; GCN1-NEXT: s_addc_u32 s1, s1, 0 7665; GCN1-NEXT: v_mov_b32_e32 v0, s0 7666; GCN1-NEXT: v_mov_b32_e32 v1, s1 7667; GCN1-NEXT: v_mov_b32_e32 v2, s8 7668; GCN1-NEXT: flat_atomic_dec v2, v[0:1], v2 glc 7669; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7670; GCN1-NEXT: buffer_wbinvl1_vol 7671; GCN1-NEXT: v_mov_b32_e32 v0, s2 7672; GCN1-NEXT: v_mov_b32_e32 v1, s3 7673; GCN1-NEXT: flat_store_dword v[0:1], v2 7674; GCN1-NEXT: s_endpgm 7675; 7676; GCN2-LABEL: atomic_dec_i32_ret_decr64_offset: 7677; GCN2: ; %bb.0: ; %entry 7678; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c 7679; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 7680; GCN2-NEXT: s_load_dword s8, s[4:5], 0x34 7681; GCN2-NEXT: s_waitcnt lgkmcnt(0) 7682; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 7683; GCN2-NEXT: s_add_u32 s0, s0, s4 7684; GCN2-NEXT: s_addc_u32 s1, s1, s5 7685; GCN2-NEXT: s_add_u32 s0, s0, 16 7686; GCN2-NEXT: s_addc_u32 s1, s1, 0 7687; GCN2-NEXT: v_mov_b32_e32 v0, s0 7688; GCN2-NEXT: v_mov_b32_e32 v1, s1 7689; GCN2-NEXT: v_mov_b32_e32 v2, s8 7690; GCN2-NEXT: flat_atomic_dec v2, v[0:1], v2 glc 7691; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7692; GCN2-NEXT: buffer_wbinvl1_vol 7693; GCN2-NEXT: v_mov_b32_e32 v0, s2 7694; GCN2-NEXT: v_mov_b32_e32 v1, s3 7695; GCN2-NEXT: flat_store_dword v[0:1], v2 7696; GCN2-NEXT: s_endpgm 7697; 7698; GCN3-LABEL: 
atomic_dec_i32_ret_decr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN3-NEXT: s_load_dword s8, s[4:5], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 2
; GCN3-NEXT: s_add_u32 s0, s0, s4
; GCN3-NEXT: s_addc_u32 s1, s1, s5
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s8
; GCN3-NEXT: flat_atomic_dec v2, v[0:1], v2 offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s2
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  %gep = getelementptr i32, ptr %ptr, i32 4
  %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
  store i32 %val, ptr %out2
  ret void
}

; Test: volatile agent-scope seq_cst `atomicrmw udec_wrap` applied directly to
; %out; the returned value is unused. For all three targets the checks expect
; the form of flat_atomic_dec without a destination register, offset or glc,
; followed by s_waitcnt and buffer_wbinvl1_vol.
define amdgpu_kernel void @atomic_dec_i32(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_dec_i32:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: flat_atomic_dec v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_dec_i32:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: flat_atomic_dec v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_dec_i32:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s2
; GCN3-NEXT: flat_atomic_dec v[0:1], v2
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
  %val = atomicrmw volatile udec_wrap ptr %out, i32 %in syncscope("agent") seq_cst
  ret void
}

; Test: same udec_wrap atomic on %out as @atomic_dec_i32, but the returned
; value is stored to %out2. The checks expect the glc form of flat_atomic_dec
; writing v2, then a flat_store_dword of v2 to the address loaded into s[2:3].
define amdgpu_kernel void @atomic_dec_i32_ret(ptr %out, ptr %out2, i32 %in) {
; GCN1-LABEL: atomic_dec_i32_ret:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_dec_i32_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN2-NEXT: s_load_dword s4, s[4:5], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_dec_i32_ret:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN3-NEXT: s_load_dword s6, s[4:5], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s2
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
  %val = atomicrmw volatile udec_wrap ptr %out, i32 %in syncscope("agent") seq_cst
  store i32 %val, ptr %out2
  ret void
}

; Test: udec_wrap atomic on the i32 element of %out selected by the 64-bit
; %index; the returned value is unused. The checks expect the address to be
; formed with s_lshl_b64 by 2 plus s_add_u32/s_addc_u32, then the
; flat_atomic_dec form without a destination register or glc.
define amdgpu_kernel void @atomic_dec_i32_decr64(ptr %out, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_dec_i32_decr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
; GCN1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9
; GCN1-NEXT: s_load_dword s4, s[4:5], 0xb
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN1-NEXT: s_add_u32 s0, s2, s0
; GCN1-NEXT: s_addc_u32 s1, s3, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: flat_atomic_dec v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_dec_i32_decr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
; GCN2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
; GCN2-NEXT: s_load_dword s4, s[4:5], 0x2c
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s2, s0
; GCN2-NEXT: s_addc_u32 s1, s3, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: flat_atomic_dec v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_dec_i32_decr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
; GCN3-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
; GCN3-NEXT: s_load_dword s6, s[4:5], 0x2c
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s2, s0
; GCN3-NEXT: s_addc_u32 s1, s3, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_atomic_dec v[0:1], v2
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  %val = atomicrmw volatile udec_wrap ptr %ptr, i32 %in syncscope("agent") seq_cst
  ret void
}

; Test: udec_wrap atomic on the i32 element of %out selected by the 64-bit
; %index, with the returned value stored to %out2. The checks expect the same
; s_lshl_b64/s_add_u32/s_addc_u32 address computation as the non-returning
; indexed test, the glc form of flat_atomic_dec, and a flat_store_dword of the
; result.
define amdgpu_kernel void @atomic_dec_i32_ret_decr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
; GCN1-LABEL: atomic_dec_i32_ret_decr64:
; GCN1: ; %bb.0: ; %entry
; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xf
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN1-NEXT: s_load_dword s8, s[4:5], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 2
; GCN1-NEXT: s_add_u32 s0, s0, s4
; GCN1-NEXT: s_addc_u32 s1, s1, s5
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s8
; GCN1-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_dec_i32_ret_decr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN2-NEXT: s_load_dword s8, s[4:5], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 2
; GCN2-NEXT: s_add_u32 s0, s0, s4
; GCN2-NEXT: s_addc_u32 s1, s1, s5
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s8
; GCN2-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_dec_i32_ret_decr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN3-NEXT: s_load_dword s8, s[4:5], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 2
; GCN3-NEXT: s_add_u32 s0, s0, s4
; GCN3-NEXT: s_addc_u32 s1, s1, s5
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s8
; GCN3-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s2
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
  %ptr = getelementptr i32, ptr %out, i64 %index
  %val = atomicrmw volatile udec_wrap ptr %ptr, i32 %in syncscope("agent") seq_cst
  store i32 %val, ptr %out2
  ret void
}

; Test: seq_cst atomic load of a half located 8 elements past %in, stored
; non-atomically to %out. The checks expect flat_load_ushort with glc and a
; flat_store_short. The bonaire and tonga runs add 16 to the address with
; s_add_u32/s_addc_u32, while the gfx900 run folds it into the load as
; offset:16.
define amdgpu_kernel void @atomic_load_f16_offset(ptr %in, ptr %out) {
; GCN1-LABEL: atomic_load_f16_offset:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_ushort v2, v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: flat_store_short v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_load_f16_offset:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_ushort v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: flat_store_short v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_f16_offset:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_ushort v2, v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s2
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_short v[0:1], v2
; GCN3-NEXT: s_endpgm
  %gep = getelementptr half, ptr %in, i64 8
  %val = load atomic half, ptr %gep seq_cst, align 2
  store half %val, ptr %out
  ret void
}

; Test: seq_cst atomic load of a half directly from %in, stored
; non-atomically to %out. All three targets are expected to emit
; flat_load_ushort with glc and no offset, followed by flat_store_short.
define amdgpu_kernel void @atomic_load_f16(ptr %in, ptr %out) {
; GCN1-LABEL: atomic_load_f16:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_ushort v2, v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: flat_store_short v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_load_f16:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_ushort v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: flat_store_short v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_f16:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_ushort v2, v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s2
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_short v[0:1], v2
; GCN3-NEXT: s_endpgm
  %val = load atomic half, ptr %in seq_cst, align 2
  store half %val, ptr %out
  ret void
}

; Test: bfloat counterpart of @atomic_load_f16_offset, a seq_cst atomic load
; of the bfloat 8 elements past %in, stored to %out. The expected instructions
; match the half test: flat_load_ushort with glc and flat_store_short, with the
; 16-byte offset added via s_add_u32/s_addc_u32 for bonaire and tonga and
; folded as offset:16 for gfx900.
define amdgpu_kernel void @atomic_load_bf16_offset(ptr %in, ptr %out) {
; GCN1-LABEL: atomic_load_bf16_offset:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_ushort v2, v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: flat_store_short v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_load_bf16_offset:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_ushort v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: flat_store_short v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_bf16_offset:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_ushort v2, v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s2
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_short v[0:1], v2
; GCN3-NEXT: s_endpgm
  %gep = getelementptr bfloat, ptr %in, i64 8
  %val = load atomic bfloat, ptr %gep seq_cst, align 2
  store bfloat %val, ptr %out
  ret void
}

; Test: bfloat counterpart of @atomic_load_f16, a seq_cst atomic load of a
; bfloat directly from %in, stored to %out. All three targets are expected to
; emit flat_load_ushort with glc and no offset, followed by flat_store_short.
define amdgpu_kernel void @atomic_load_bf16(ptr %in, ptr %out) {
; GCN1-LABEL: atomic_load_bf16:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
; GCN1-NEXT: flat_load_ushort v2, v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: flat_store_short v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_load_bf16:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_ushort v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: flat_store_short v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_bf16:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_ushort v2, v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s2
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_short v[0:1], v2
; GCN3-NEXT: s_endpgm
  %val = load atomic bfloat, ptr %in seq_cst, align 2
  store bfloat %val, ptr %out
  ret void
}