; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX9
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX90A
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX10
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+enable-flat-scratch,+precise-memory < %s | FileCheck --check-prefixes=GFX9-FLATSCR %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX11
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX12

; Every RUN line enables the precise-memory subtarget feature. In the expected
; output below, each memory instruction is directly followed by a wait on the
; counter that tracks it. Each function notes the upstream test it was taken
; from and the memory instructions it is meant to cover.

; from atomicrmw-expand.ll
; covers flat_load, flat_atomic (atomic with return)
;
define void @syncscope_workgroup_nortn(ptr %addr, float %val) {
; GFX9-LABEL: syncscope_workgroup_nortn:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    flat_load_dword v4, v[0:1]
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b64 s[4:5], 0
; GFX9-NEXT:  .LBB0_1: ; %atomicrmw.start
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    v_add_f32_e32 v3, v4, v2
; GFX9-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; GFX9-NEXT:    v_mov_b32_e32 v4, v3
; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; GFX9-NEXT:    s_cbranch_execnz .LBB0_1
; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: syncscope_workgroup_nortn:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    flat_load_dword v5, v[0:1]
; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
; GFX90A-NEXT:  .LBB0_1: ; %atomicrmw.start
; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT:    v_add_f32_e32 v4, v5, v2
; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT:    s_cbranch_execnz .LBB0_1
; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: syncscope_workgroup_nortn:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    flat_load_dword v4, v[0:1]
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s4, 0
; GFX10-NEXT:  .LBB0_1: ; %atomicrmw.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    v_add_f32_e32 v3, v4, v2
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX10-NEXT:    v_mov_b32_e32 v4, v3
; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT:    s_cbranch_execnz .LBB0_1
; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-FLATSCR-LABEL: syncscope_workgroup_nortn:
; GFX9-FLATSCR:       ; %bb.0:
; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-FLATSCR-NEXT:    flat_load_dword v4, v[0:1]
; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-FLATSCR-NEXT:    s_mov_b64 s[0:1], 0
; GFX9-FLATSCR-NEXT:  .LBB0_1: ; %atomicrmw.start
; GFX9-FLATSCR-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-FLATSCR-NEXT:    v_add_f32_e32 v3, v4, v2
; GFX9-FLATSCR-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-FLATSCR-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
; GFX9-FLATSCR-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v4, v3
; GFX9-FLATSCR-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX9-FLATSCR-NEXT:    s_cbranch_execnz .LBB0_1
; GFX9-FLATSCR-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX9-FLATSCR-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX9-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: syncscope_workgroup_nortn:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    flat_load_b32 v4, v[0:1]
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, 0
; GFX11-NEXT:  .LBB0_1: ; %atomicrmw.start
; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add_f32_e32 v3, v4, v2
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    buffer_gl0_inv
; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX11-NEXT:    v_mov_b32_e32 v4, v3
; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-NEXT:    s_cbranch_execnz .LBB0_1
; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-LABEL: syncscope_workgroup_nortn:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    s_wait_storecnt 0x0
; GFX12-NEXT:    flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_SE
; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
; GFX12-NEXT:    global_inv scope:SCOPE_SE
; GFX12-NEXT:    s_setpc_b64 s[30:31]
  %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst
  ret void
}

; from atomicrmw-nand.ll
; covers global_atomic (atomic with return), global_load
;
define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind {
; GFX9-LABEL: atomic_nand_i32_global:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v2, v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_mov_b64 s[4:5], 0
; GFX9-NEXT:  .LBB1_1: ; %atomicrmw.start
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    v_mov_b32_e32 v3, v2
; GFX9-NEXT:    v_not_b32_e32 v2, v3
; GFX9-NEXT:    v_or_b32_e32 v2, -5, v2
; GFX9-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1_vol
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; GFX9-NEXT:    s_cbranch_execnz .LBB1_1
; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT:    v_mov_b32_e32 v0, v2
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: atomic_nand_i32_global:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    global_load_dword v2, v[0:1], off
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
; GFX90A-NEXT:  .LBB1_1: ; %atomicrmw.start
; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
; GFX90A-NEXT:    v_not_b32_e32 v2, v3
; GFX90A-NEXT:    v_or_b32_e32 v2, -5, v2
; GFX90A-NEXT:    buffer_wbl2
; GFX90A-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    buffer_invl2
; GFX90A-NEXT:    buffer_wbinvl1_vol
; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT:    s_cbranch_execnz .LBB1_1
; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT:    v_mov_b32_e32 v0, v2
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: atomic_nand_i32_global:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v2, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_mov_b32 s4, 0
; GFX10-NEXT:  .LBB1_1: ; %atomicrmw.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    v_mov_b32_e32 v3, v2
; GFX10-NEXT:    v_not_b32_e32 v2, v3
; GFX10-NEXT:    v_or_b32_e32 v2, -5, v2
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT:    s_cbranch_execnz .LBB1_1
; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT:    v_mov_b32_e32 v0, v2
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-FLATSCR-LABEL: atomic_nand_i32_global:
; GFX9-FLATSCR:       ; %bb.0:
; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-FLATSCR-NEXT:    global_load_dword v2, v[0:1], off
; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT:    s_mov_b64 s[0:1], 0
; GFX9-FLATSCR-NEXT:  .LBB1_1: ; %atomicrmw.start
; GFX9-FLATSCR-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v3, v2
; GFX9-FLATSCR-NEXT:    v_not_b32_e32 v2, v3
; GFX9-FLATSCR-NEXT:    v_or_b32_e32 v2, -5, v2
; GFX9-FLATSCR-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT:    buffer_wbinvl1_vol
; GFX9-FLATSCR-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
; GFX9-FLATSCR-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-FLATSCR-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX9-FLATSCR-NEXT:    s_cbranch_execnz .LBB1_1
; GFX9-FLATSCR-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX9-FLATSCR-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v0, v2
; GFX9-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: atomic_nand_i32_global:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v2, v[0:1], off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, 0
; GFX11-NEXT:  .LBB1_1: ; %atomicrmw.start
; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT:    v_mov_b32_e32 v3, v2
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_not_b32_e32 v2, v3
; GFX11-NEXT:    v_or_b32_e32 v2, -5, v2
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_gl1_inv
; GFX11-NEXT:    buffer_gl0_inv
; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-NEXT:    s_cbranch_execnz .LBB1_1
; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT:    v_mov_b32_e32 v0, v2
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-LABEL: atomic_nand_i32_global:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    global_load_b32 v2, v[0:1], off
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    s_mov_b32 s0, 0
; GFX12-NEXT:  .LBB1_1: ; %atomicrmw.start
; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT:    v_mov_b32_e32 v3, v2
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_not_b32_e32 v2, v3
; GFX12-NEXT:    v_or_b32_e32 v2, -5, v2
; GFX12-NEXT:    global_wb scope:SCOPE_SYS
; GFX12-NEXT:    s_wait_storecnt 0x0
; GFX12-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    global_inv scope:SCOPE_SYS
; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT:    s_wait_alu 0xfffe
; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT:    s_wait_alu 0xfffe
; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT:    s_cbranch_execnz .LBB1_1
; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT:    v_mov_b32_e32 v0, v2
; GFX12-NEXT:    s_wait_alu 0xfffe
; GFX12-NEXT:    s_setpc_b64 s[30:31]
  %result = atomicrmw nand ptr addrspace(1) %ptr, i32 4 seq_cst
  ret i32 %result
}

; from call-argument-types.ll
; covers scratch_load, scratch_store, buffer_load, buffer_store
;
declare hidden void @byval_align16_f64_arg(<32 x i32>, ptr addrspace(5) byval(double) align 16)
define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) {
; GFX9-LABEL: tail_call_byval_align16:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:28
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_getpc_b64 s[16:17]
; GFX9-NEXT:    s_add_u32 s16, s16, byval_align16_f64_arg@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s17, s17, byval_align16_f64_arg@rel32@hi+12
; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:20
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:24
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:16
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[16:17]
;
; GFX90A-LABEL: tail_call_byval_align16:
; GFX90A:       ; %bb.0: ; %entry
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:28
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:24
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    buffer_load_dword v34, off, s[0:3], s32
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_getpc_b64 s[16:17]
; GFX90A-NEXT:    s_add_u32 s16, s16, byval_align16_f64_arg@rel32@lo+4
; GFX90A-NEXT:    s_addc_u32 s17, s17, byval_align16_f64_arg@rel32@hi+12
; GFX90A-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:20
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:16
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    buffer_store_dword v34, off, s[0:3], s32
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[16:17]
;
; GFX10-LABEL: tail_call_byval_align16:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:28
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:24
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_load_dword v34, off, s[0:3], s32
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_getpc_b64 s[16:17]
; GFX10-NEXT:    s_add_u32 s16, s16, byval_align16_f64_arg@rel32@lo+4
; GFX10-NEXT:    s_addc_u32 s17, s17, byval_align16_f64_arg@rel32@hi+12
; GFX10-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:20
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:16
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    buffer_store_dword v34, off, s[0:3], s32
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[16:17]
;
; GFX9-FLATSCR-LABEL: tail_call_byval_align16:
; GFX9-FLATSCR:       ; %bb.0: ; %entry
; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-FLATSCR-NEXT:    scratch_load_dword v32, off, s32
; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT:    s_getpc_b64 s[0:1]
; GFX9-FLATSCR-NEXT:    s_add_u32 s0, s0, byval_align16_f64_arg@rel32@lo+4
; GFX9-FLATSCR-NEXT:    s_addc_u32 s1, s1, byval_align16_f64_arg@rel32@hi+12
; GFX9-FLATSCR-NEXT:    scratch_store_dword off, v32, s32
; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT:    scratch_load_dwordx2 v[32:33], off, s32 offset:24
; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT:    scratch_store_dwordx2 off, v[32:33], s32 offset:16
; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT:    s_setpc_b64 s[0:1]
;
; GFX11-LABEL: tail_call_byval_align16:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    scratch_load_b32 v32, off, s32
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_getpc_b64 s[0:1]
; GFX11-NEXT:    s_add_u32 s0, s0, byval_align16_f64_arg@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s1, s1, byval_align16_f64_arg@rel32@hi+12
; GFX11-NEXT:    scratch_store_b32 off, v32, s32
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b64 v[32:33], off, s32 offset:24
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    scratch_store_b64 off, v[32:33], s32 offset:16
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    s_setpc_b64 s[0:1]
;
; GFX12-LABEL: tail_call_byval_align16:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    scratch_load_b32 v32, off, s32
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    s_getpc_b64 s[0:1]
; GFX12-NEXT:    s_wait_alu 0xfffe
; GFX12-NEXT:    s_sext_i32_i16 s1, s1
; GFX12-NEXT:    s_add_co_u32 s0, s0, byval_align16_f64_arg@rel32@lo+12
; GFX12-NEXT:    s_wait_alu 0xfffe
; GFX12-NEXT:    s_add_co_ci_u32 s1, s1, byval_align16_f64_arg@rel32@hi+24
; GFX12-NEXT:    scratch_store_b32 off, v32, s32
; GFX12-NEXT:    s_wait_storecnt 0x0
; GFX12-NEXT:    scratch_load_b64 v[32:33], off, s32 offset:24
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    scratch_store_b64 off, v[32:33], s32 offset:16
; GFX12-NEXT:    s_wait_storecnt 0x0
; GFX12-NEXT:    s_wait_alu 0xfffe
; GFX12-NEXT:    s_setpc_b64 s[0:1]
entry:
  %alloca = alloca double, align 8, addrspace(5)
  tail call void @byval_align16_f64_arg(<32 x i32> %val, ptr addrspace(5) byval(double) align 16 %alloca)
  ret void
}

; from udiv.ll
; covers s_load
;
define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
; GFX9-LABEL: udiv_i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
; GFX9-NEXT:    s_sub_i32 s4, 0, s3
; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT:    v_readfirstlane_b32 s5, v0
; GFX9-NEXT:    s_mul_i32 s4, s4, s5
; GFX9-NEXT:    s_mul_hi_u32 s4, s5, s4
; GFX9-NEXT:    s_add_i32 s5, s5, s4
; GFX9-NEXT:    s_mul_hi_u32 s4, s2, s5
; GFX9-NEXT:    s_mul_i32 s5, s4, s3
; GFX9-NEXT:    s_sub_i32 s2, s2, s5
; GFX9-NEXT:    s_add_i32 s6, s4, 1
; GFX9-NEXT:    s_sub_i32 s5, s2, s3
; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
; GFX9-NEXT:    s_cselect_b32 s4, s6, s4
; GFX9-NEXT:    s_cselect_b32 s2, s5, s2
; GFX9-NEXT:    s_add_i32 s5, s4, 1
; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
; GFX9-NEXT:    s_cselect_b32 s2, s5, s4
; GFX9-NEXT:    v_mov_b32_e32 v0, s2
; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX90A-LABEL: udiv_i32:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NEXT:    v_mov_b32_e32 v1, 0
; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s3
; GFX90A-NEXT:    s_sub_i32 s4, 0, s3
; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v0, v0
; GFX90A-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GFX90A-NEXT:    v_readfirstlane_b32 s5, v0
; GFX90A-NEXT:    s_mul_i32 s4, s4, s5
; GFX90A-NEXT:    s_mul_hi_u32 s4, s5, s4
; GFX90A-NEXT:    s_add_i32 s5, s5, s4
; GFX90A-NEXT:    s_mul_hi_u32 s4, s2, s5
; GFX90A-NEXT:    s_mul_i32 s5, s4, s3
; GFX90A-NEXT:    s_sub_i32 s2, s2, s5
; GFX90A-NEXT:    s_add_i32 s6, s4, 1
; GFX90A-NEXT:    s_sub_i32 s5, s2, s3
; GFX90A-NEXT:    s_cmp_ge_u32 s2, s3
; GFX90A-NEXT:    s_cselect_b32 s4, s6, s4
; GFX90A-NEXT:    s_cselect_b32 s2, s5, s2
; GFX90A-NEXT:    s_add_i32 s5, s4, 1
; GFX90A-NEXT:    s_cmp_ge_u32 s2, s3
; GFX90A-NEXT:    s_cselect_b32 s2, s5, s4
; GFX90A-NEXT:    v_mov_b32_e32 v0, s2
; GFX90A-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_endpgm
;
; GFX10-LABEL: udiv_i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s3
; GFX10-NEXT:    s_sub_i32 s5, 0, s3
; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GFX10-NEXT:    v_readfirstlane_b32 s4, v0
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_mul_i32 s5, s5, s4
; GFX10-NEXT:    s_mul_hi_u32 s5, s4, s5
; GFX10-NEXT:    s_add_i32 s4, s4, s5
; GFX10-NEXT:    s_mul_hi_u32 s4, s2, s4
; GFX10-NEXT:    s_mul_i32 s5, s4, s3
; GFX10-NEXT:    s_sub_i32 s2, s2, s5
; GFX10-NEXT:    s_add_i32 s5, s4, 1
; GFX10-NEXT:    s_sub_i32 s6, s2, s3
; GFX10-NEXT:    s_cmp_ge_u32 s2, s3
; GFX10-NEXT:    s_cselect_b32 s4, s5, s4
; GFX10-NEXT:    s_cselect_b32 s2, s6, s2
; GFX10-NEXT:    s_add_i32 s5, s4, 1
; GFX10-NEXT:    s_cmp_ge_u32 s2, s3
; GFX10-NEXT:    s_cselect_b32 s2, s5, s4
; GFX10-NEXT:    v_mov_b32_e32 v1, s2
; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_endpgm
;
; GFX9-FLATSCR-LABEL: udiv_i32:
; GFX9-FLATSCR:       ; %bb.0:
; GFX9-FLATSCR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-FLATSCR-NEXT:    v_cvt_f32_u32_e32 v0, s3
; GFX9-FLATSCR-NEXT:    s_sub_i32 s4, 0, s3
; GFX9-FLATSCR-NEXT:    v_rcp_iflag_f32_e32 v0, v0
; GFX9-FLATSCR-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-FLATSCR-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GFX9-FLATSCR-NEXT:    v_readfirstlane_b32 s5, v0
; GFX9-FLATSCR-NEXT:    s_mul_i32 s4, s4, s5
; GFX9-FLATSCR-NEXT:    s_mul_hi_u32 s4, s5, s4
; GFX9-FLATSCR-NEXT:    s_add_i32 s5, s5, s4
; GFX9-FLATSCR-NEXT:    s_mul_hi_u32 s4, s2, s5
; GFX9-FLATSCR-NEXT:    s_mul_i32 s5, s4, s3
; GFX9-FLATSCR-NEXT:    s_sub_i32 s2, s2, s5
; GFX9-FLATSCR-NEXT:    s_add_i32 s6, s4, 1
; GFX9-FLATSCR-NEXT:    s_sub_i32 s5, s2, s3
; GFX9-FLATSCR-NEXT:    s_cmp_ge_u32 s2, s3
; GFX9-FLATSCR-NEXT:    s_cselect_b32 s4, s6, s4
; GFX9-FLATSCR-NEXT:    s_cselect_b32 s2, s5, s2
; GFX9-FLATSCR-NEXT:    s_add_i32 s5, s4, 1
; GFX9-FLATSCR-NEXT:    s_cmp_ge_u32 s2, s3
; GFX9-FLATSCR-NEXT:    s_cselect_b32 s2, s5, s4
; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v0, s2
; GFX9-FLATSCR-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT:    s_endpgm
;
; GFX11-LABEL: udiv_i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, s3
; GFX11-NEXT:    s_sub_i32 s5, 0, s3
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_rcp_iflag_f32_e32 v0, v0
; GFX11-NEXT:    s_waitcnt_depctr 0xfff
; GFX11-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX11-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    s_mul_i32 s5, s5, s4
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT:    s_mul_hi_u32 s5, s4, s5
; GFX11-NEXT:    s_add_i32 s4, s4, s5
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT:    s_mul_hi_u32 s4, s2, s4
; GFX11-NEXT:    s_mul_i32 s5, s4, s3
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_sub_i32 s2, s2, s5
; GFX11-NEXT:    s_add_i32 s5, s4, 1
; GFX11-NEXT:    s_sub_i32 s6, s2, s3
; GFX11-NEXT:    s_cmp_ge_u32 s2, s3
; GFX11-NEXT:    s_cselect_b32 s4, s5, s4
; GFX11-NEXT:    s_cselect_b32 s2, s6, s2
; GFX11-NEXT:    s_add_i32 s5, s4, 1
; GFX11-NEXT:    s_cmp_ge_u32 s2, s3
; GFX11-NEXT:    s_cselect_b32 s2, s5, s4
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_mov_b32_e32 v1, s2
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: udiv_i32:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    s_cvt_f32_u32 s4, s3
; GFX12-NEXT:    s_sub_co_i32 s5, 0, s3
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(TRANS32_DEP_1)
; GFX12-NEXT:    v_rcp_iflag_f32_e32 v0, s4
; GFX12-NEXT:    v_readfirstlane_b32 s4, v0
; GFX12-NEXT:    s_wait_alu 0xfffe
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
; GFX12-NEXT:    s_mul_f32 s4, s4, 0x4f7ffffe
; GFX12-NEXT:    s_wait_alu 0xfffe
; GFX12-NEXT:    s_cvt_u32_f32 s4, s4
; GFX12-NEXT:    s_wait_alu 0xfffe
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_2)
; GFX12-NEXT:    s_mul_i32 s5, s5, s4
; GFX12-NEXT:    s_wait_alu 0xfffe
; GFX12-NEXT:    s_mul_hi_u32 s5, s4, s5
; GFX12-NEXT:    s_wait_alu 0xfffe
; GFX12-NEXT:    s_add_co_i32 s4, s4, s5
; GFX12-NEXT:    s_wait_alu 0xfffe
; GFX12-NEXT:    s_mul_hi_u32 s4, s2, s4
; GFX12-NEXT:    s_wait_alu 0xfffe
; GFX12-NEXT:    s_mul_i32 s5, s4, s3
; GFX12-NEXT:    s_wait_alu 0xfffe
; GFX12-NEXT:    s_sub_co_i32 s2, s2, s5
; GFX12-NEXT:    s_add_co_i32 s5, s4, 1
; GFX12-NEXT:    s_sub_co_i32 s6, s2, s3
; GFX12-NEXT:    s_cmp_ge_u32 s2, s3
; GFX12-NEXT:    s_wait_alu 0xfffe
; GFX12-NEXT:    s_cselect_b32 s4, s5, s4
; GFX12-NEXT:    s_cselect_b32 s2, s6, s2
; GFX12-NEXT:    s_wait_alu 0xfffe
; GFX12-NEXT:    s_add_co_i32 s5, s4, 1
; GFX12-NEXT:    s_cmp_ge_u32 s2, s3
; GFX12-NEXT:    s_wait_alu 0xfffe
; GFX12-NEXT:    s_cselect_b32 s2, s5, s4
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT:    s_wait_storecnt 0x0
; GFX12-NEXT:    s_endpgm
  %r = udiv i32 %x, %y
  store i32 %r, ptr addrspace(1) %out
  ret void
}

declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32)

; from smrd.ll
; covers s_buffer_load
;
define amdgpu_ps float @smrd_sgpr_offset(<4 x i32> inreg %desc, i32 inreg %offset) #0 {
; GFX9-LABEL: smrd_sgpr_offset:
; GFX9:       ; %bb.0: ; %main_body
; GFX9-NEXT:    s_buffer_load_dword s0, s[0:3], s4 offset:0x0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX90A-LABEL: smrd_sgpr_offset:
; GFX90A:       ; %bb.0: ; %main_body
; GFX90A-NEXT:    s_buffer_load_dword s0, s[0:3], s4 offset:0x0
; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: smrd_sgpr_offset:
; GFX10:       ; %bb.0: ; %main_body
; GFX10-NEXT:    s_buffer_load_dword s0, s[0:3], s4 offset:0x0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX9-FLATSCR-LABEL: smrd_sgpr_offset:
; GFX9-FLATSCR:       ; %bb.0: ; %main_body
; GFX9-FLATSCR-NEXT:    s_mov_b32 s11, s5
; GFX9-FLATSCR-NEXT:    s_mov_b32 s10, s4
; GFX9-FLATSCR-NEXT:    s_mov_b32 s9, s3
; GFX9-FLATSCR-NEXT:    s_mov_b32 s8, s2
; GFX9-FLATSCR-NEXT:    s_buffer_load_dword s0, s[8:11], s6 offset:0x0
; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-FLATSCR-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: smrd_sgpr_offset:
; GFX11:       ; %bb.0: ; %main_body
; GFX11-NEXT:    s_buffer_load_b32 s0, s[0:3], s4 offset:0x0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v0, s0
; GFX11-NEXT:    ; return to shader part epilog
;
; GFX12-LABEL: smrd_sgpr_offset:
; GFX12:       ; %bb.0: ; %main_body
; GFX12-NEXT:    s_buffer_load_b32 s0, s[0:3], s4 offset:0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_mov_b32_e32 v0, s0
; GFX12-NEXT:    ; return to shader part epilog
main_body:
  %r = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %offset, i32 0)
  ret float %r
}

; from atomic_load_add.ll
; covers s_load, ds_add (atomic without return)
;
define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) {
; GFX9-LABEL: atomic_add_local:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b64 s[0:1], exec
; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s1, v0
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT:    s_cbranch_execz .LBB5_2
; GFX9-NEXT:  ; %bb.1:
; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
; GFX9-NEXT:    s_mul_i32 s0, s0, 5
; GFX9-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-NEXT:    v_mov_b32_e32 v0, s2
; GFX9-NEXT:    ds_add_u32 v0, v1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:  .LBB5_2:
; GFX9-NEXT:    s_endpgm
;
; GFX90A-LABEL: atomic_add_local:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_mov_b64 s[0:1], exec
; GFX90A-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX90A-NEXT:    v_mbcnt_hi_u32_b32 v0, s1, v0
; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX90A-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX90A-NEXT:    s_cbranch_execz .LBB5_2
; GFX90A-NEXT:  ; %bb.1:
; GFX90A-NEXT:    s_load_dword s2, s[4:5], 0x24
; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
; GFX90A-NEXT:    s_mul_i32 s0, s0, 5
; GFX90A-NEXT:    v_mov_b32_e32 v1, s0
; GFX90A-NEXT:    v_mov_b32_e32 v0, s2
; GFX90A-NEXT:    ds_add_u32 v0, v1
; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NEXT:  .LBB5_2:
; GFX90A-NEXT:    s_endpgm
;
; GFX10-LABEL: atomic_add_local:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_mov_b32 s0, exec_lo
; GFX10-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10-NEXT:    s_and_saveexec_b32 s1, vcc_lo
; GFX10-NEXT:    s_cbranch_execz .LBB5_2
; GFX10-NEXT:  ; %bb.1:
; GFX10-NEXT:    s_load_dword s1, s[4:5], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_bcnt1_i32_b32 s0, s0
; GFX10-NEXT:    s_mul_i32 s0, s0, 5
; GFX10-NEXT:    v_mov_b32_e32 v1, s0
; GFX10-NEXT:    v_mov_b32_e32 v0, s1
; GFX10-NEXT:    ds_add_u32 v0, v1
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:  .LBB5_2:
; GFX10-NEXT:    s_endpgm
;
; GFX9-FLATSCR-LABEL: atomic_add_local:
; GFX9-FLATSCR:       ; %bb.0:
; GFX9-FLATSCR-NEXT:    s_mov_b64 s[0:1], exec
; GFX9-FLATSCR-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX9-FLATSCR-NEXT:    v_mbcnt_hi_u32_b32 v0, s1, v0
; GFX9-FLATSCR-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-FLATSCR-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX9-FLATSCR-NEXT:    s_cbranch_execz .LBB5_2
; GFX9-FLATSCR-NEXT:  ; %bb.1:
; GFX9-FLATSCR-NEXT:    s_load_dword s2, s[4:5], 0x24
; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-FLATSCR-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
; GFX9-FLATSCR-NEXT:    s_mul_i32 s0, s0, 5
; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v0, s2
; GFX9-FLATSCR-NEXT:    ds_add_u32 v0, v1
; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-FLATSCR-NEXT:  .LBB5_2:
; GFX9-FLATSCR-NEXT:    s_endpgm
;
; GFX11-LABEL: atomic_add_local:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_mov_b32 s0, exec_lo
; GFX11-NEXT:    s_mov_b32 s1, exec_lo
; GFX11-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT:    s_cbranch_execz .LBB5_2
; GFX11-NEXT:  ; %bb.1:
; GFX11-NEXT:    s_load_b32 s1, s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_bcnt1_i32_b32 s0, s0
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT:    s_mul_i32 s0, s0, 5
; GFX11-NEXT:    v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v0, s1
; GFX11-NEXT:    ds_add_u32 v0, v1
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    buffer_gl0_inv
; GFX11-NEXT:  .LBB5_2:
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: atomic_add_local:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_mov_b32 s0, exec_lo
; GFX12-NEXT:    s_mov_b32 s1, exec_lo
; GFX12-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT:    v_cmpx_eq_u32_e32 0, v0
; GFX12-NEXT:    s_cbranch_execz .LBB5_2
; GFX12-NEXT:  ; %bb.1:
; GFX12-NEXT:    s_load_b32 s1, s[4:5], 0x24
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    s_wait_alu 0xfffe
; GFX12-NEXT:    s_bcnt1_i32_b32 s0, s0
; GFX12-NEXT:    s_wait_alu 0xfffe
; GFX12-NEXT:    s_mul_i32 s0, s0, 5
; GFX12-NEXT:    s_wait_alu 0xfffe
; GFX12-NEXT:    v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v0, s1
; GFX12-NEXT:    ds_add_u32 v0, v1
; GFX12-NEXT:    s_wait_dscnt 0x0
; GFX12-NEXT:    global_inv scope:SCOPE_SE
; GFX12-NEXT:  .LBB5_2:
; GFX12-NEXT:    s_endpgm
  %unused = atomicrmw volatile add ptr addrspace(3) %local, i32 5 seq_cst
  ret void
}

; from flat_atomics_i32_system.ll
; covers flat_atomic_swap (atomic without return)
;
define void @flat_atomic_xchg_i32_noret(ptr %ptr, i32 %in) {
; GFX9-LABEL: flat_atomic_xchg_i32_noret:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    flat_atomic_swap v[0:1], v2
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1_vol
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_atomic_xchg_i32_noret:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    buffer_wbl2
; GFX90A-NEXT:    flat_atomic_swap v[0:1], v2
; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    buffer_invl2
; GFX90A-NEXT:    buffer_wbinvl1_vol
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_atomic_xchg_i32_noret:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    flat_atomic_swap v[0:1], v2
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-FLATSCR-LABEL: flat_atomic_xchg_i32_noret:
; GFX9-FLATSCR:       ; %bb.0:
; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-FLATSCR-NEXT:    flat_atomic_swap v[0:1], v2
; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-FLATSCR-NEXT:    buffer_wbinvl1_vol
; GFX9-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_atomic_xchg_i32_noret:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    flat_atomic_swap_b32 v[0:1], v2
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    buffer_gl1_inv
; GFX11-NEXT:    buffer_gl0_inv
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-LABEL: flat_atomic_xchg_i32_noret:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    global_wb scope:SCOPE_SYS
; GFX12-NEXT:    s_wait_storecnt 0x0
; GFX12-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
; GFX12-NEXT:    global_inv scope:SCOPE_SYS
; GFX12-NEXT:    s_setpc_b64 s[30:31]
  %tmp0 = atomicrmw xchg ptr %ptr, i32 %in seq_cst
  ret void
}

; from atomic_load_add.ll
; covers s_load, ds_add_rtn (atomic with return)
;
define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrspace(3) %local) {
; GFX9-LABEL: atomic_add_ret_local:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b64 s[2:3], exec
; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT:    ; implicit-def: $vgpr1
; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
; GFX9-NEXT:    s_cbranch_execz .LBB7_2
; GFX9-NEXT:  ; %bb.1:
; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
; GFX9-NEXT:    s_mul_i32 s2, s2, 5
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    v_mov_b32_e32 v1, s6
; GFX9-NEXT:    ds_add_rtn_u32 v1, v1, v2
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:  .LBB7_2:
; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_readfirstlane_b32 s2,
v1 916; GFX9-NEXT: v_mov_b32_e32 v2, 0 917; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2 918; GFX9-NEXT: global_store_dword v2, v0, s[0:1] 919; GFX9-NEXT: s_waitcnt vmcnt(0) 920; GFX9-NEXT: s_endpgm 921; 922; GFX90A-LABEL: atomic_add_ret_local: 923; GFX90A: ; %bb.0: 924; GFX90A-NEXT: s_mov_b64 s[2:3], exec 925; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 926; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 927; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 928; GFX90A-NEXT: ; implicit-def: $vgpr1 929; GFX90A-NEXT: s_and_saveexec_b64 s[0:1], vcc 930; GFX90A-NEXT: s_cbranch_execz .LBB7_2 931; GFX90A-NEXT: ; %bb.1: 932; GFX90A-NEXT: s_load_dword s6, s[4:5], 0x2c 933; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 934; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 935; GFX90A-NEXT: s_mul_i32 s2, s2, 5 936; GFX90A-NEXT: v_mov_b32_e32 v2, s2 937; GFX90A-NEXT: v_mov_b32_e32 v1, s6 938; GFX90A-NEXT: ds_add_rtn_u32 v1, v1, v2 939; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 940; GFX90A-NEXT: .LBB7_2: 941; GFX90A-NEXT: s_or_b64 exec, exec, s[0:1] 942; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 943; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 944; GFX90A-NEXT: v_readfirstlane_b32 s2, v1 945; GFX90A-NEXT: v_mov_b32_e32 v2, 0 946; GFX90A-NEXT: v_mad_u32_u24 v0, v0, 5, s2 947; GFX90A-NEXT: global_store_dword v2, v0, s[0:1] 948; GFX90A-NEXT: s_waitcnt vmcnt(0) 949; GFX90A-NEXT: s_endpgm 950; 951; GFX10-LABEL: atomic_add_ret_local: 952; GFX10: ; %bb.0: 953; GFX10-NEXT: s_mov_b32 s1, exec_lo 954; GFX10-NEXT: ; implicit-def: $vgpr1 955; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 956; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 957; GFX10-NEXT: s_and_saveexec_b32 s0, vcc_lo 958; GFX10-NEXT: s_cbranch_execz .LBB7_2 959; GFX10-NEXT: ; %bb.1: 960; GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c 961; GFX10-NEXT: s_waitcnt lgkmcnt(0) 962; GFX10-NEXT: s_bcnt1_i32_b32 s1, s1 963; GFX10-NEXT: s_mul_i32 s1, s1, 5 964; GFX10-NEXT: v_mov_b32_e32 v2, s1 965; GFX10-NEXT: v_mov_b32_e32 v1, s2 966; GFX10-NEXT: ds_add_rtn_u32 v1, v1, v2 967; GFX10-NEXT: 
s_waitcnt lgkmcnt(0) 968; GFX10-NEXT: buffer_gl0_inv 969; GFX10-NEXT: .LBB7_2: 970; GFX10-NEXT: s_waitcnt_depctr 0xffe3 971; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 972; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 973; GFX10-NEXT: s_waitcnt lgkmcnt(0) 974; GFX10-NEXT: v_readfirstlane_b32 s2, v1 975; GFX10-NEXT: v_mov_b32_e32 v1, 0 976; GFX10-NEXT: v_mad_u32_u24 v0, v0, 5, s2 977; GFX10-NEXT: global_store_dword v1, v0, s[0:1] 978; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 979; GFX10-NEXT: s_endpgm 980; 981; GFX9-FLATSCR-LABEL: atomic_add_ret_local: 982; GFX9-FLATSCR: ; %bb.0: 983; GFX9-FLATSCR-NEXT: s_mov_b64 s[2:3], exec 984; GFX9-FLATSCR-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 985; GFX9-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 986; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 987; GFX9-FLATSCR-NEXT: ; implicit-def: $vgpr1 988; GFX9-FLATSCR-NEXT: s_and_saveexec_b64 s[0:1], vcc 989; GFX9-FLATSCR-NEXT: s_cbranch_execz .LBB7_2 990; GFX9-FLATSCR-NEXT: ; %bb.1: 991; GFX9-FLATSCR-NEXT: s_load_dword s6, s[4:5], 0x2c 992; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) 993; GFX9-FLATSCR-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 994; GFX9-FLATSCR-NEXT: s_mul_i32 s2, s2, 5 995; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, s2 996; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, s6 997; GFX9-FLATSCR-NEXT: ds_add_rtn_u32 v1, v1, v2 998; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) 999; GFX9-FLATSCR-NEXT: .LBB7_2: 1000; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[0:1] 1001; GFX9-FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1002; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) 1003; GFX9-FLATSCR-NEXT: v_readfirstlane_b32 s2, v1 1004; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0 1005; GFX9-FLATSCR-NEXT: v_mad_u32_u24 v0, v0, 5, s2 1006; GFX9-FLATSCR-NEXT: global_store_dword v2, v0, s[0:1] 1007; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) 1008; GFX9-FLATSCR-NEXT: s_endpgm 1009; 1010; GFX11-LABEL: atomic_add_ret_local: 1011; GFX11: ; %bb.0: 1012; GFX11-NEXT: s_mov_b32 s1, exec_lo 1013; GFX11-NEXT: s_mov_b32 s0, exec_lo 1014; GFX11-NEXT: 
v_mbcnt_lo_u32_b32 v0, s1, 0 1015; GFX11-NEXT: ; implicit-def: $vgpr1 1016; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1017; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 1018; GFX11-NEXT: s_cbranch_execz .LBB7_2 1019; GFX11-NEXT: ; %bb.1: 1020; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c 1021; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1022; GFX11-NEXT: s_bcnt1_i32_b32 s1, s1 1023; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 1024; GFX11-NEXT: s_mul_i32 s1, s1, 5 1025; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s2 1026; GFX11-NEXT: ds_add_rtn_u32 v1, v1, v2 1027; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1028; GFX11-NEXT: buffer_gl0_inv 1029; GFX11-NEXT: .LBB7_2: 1030; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 1031; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1032; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1033; GFX11-NEXT: v_readfirstlane_b32 s2, v1 1034; GFX11-NEXT: v_mov_b32_e32 v1, 0 1035; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 1036; GFX11-NEXT: v_mad_u32_u24 v0, v0, 5, s2 1037; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] 1038; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1039; GFX11-NEXT: s_endpgm 1040; 1041; GFX12-LABEL: atomic_add_ret_local: 1042; GFX12: ; %bb.0: 1043; GFX12-NEXT: s_mov_b32 s1, exec_lo 1044; GFX12-NEXT: s_mov_b32 s0, exec_lo 1045; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 1046; GFX12-NEXT: ; implicit-def: $vgpr1 1047; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 1048; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v0 1049; GFX12-NEXT: s_cbranch_execz .LBB7_2 1050; GFX12-NEXT: ; %bb.1: 1051; GFX12-NEXT: s_load_b32 s2, s[4:5], 0x2c 1052; GFX12-NEXT: s_wait_kmcnt 0x0 1053; GFX12-NEXT: s_wait_alu 0xfffe 1054; GFX12-NEXT: s_bcnt1_i32_b32 s1, s1 1055; GFX12-NEXT: s_wait_alu 0xfffe 1056; GFX12-NEXT: s_mul_i32 s1, s1, 5 1057; GFX12-NEXT: s_wait_alu 0xfffe 1058; GFX12-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s2 1059; GFX12-NEXT: ds_add_rtn_u32 v1, v1, v2 1060; GFX12-NEXT: s_wait_dscnt 0x0 1061; GFX12-NEXT: global_inv scope:SCOPE_SE 1062; 
GFX12-NEXT: .LBB7_2: 1063; GFX12-NEXT: s_wait_alu 0xfffe 1064; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 1065; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1066; GFX12-NEXT: s_wait_kmcnt 0x0 1067; GFX12-NEXT: v_readfirstlane_b32 s2, v1 1068; GFX12-NEXT: v_mov_b32_e32 v1, 0 1069; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) 1070; GFX12-NEXT: v_mad_u32_u24 v0, v0, 5, s2 1071; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] 1072; GFX12-NEXT: s_wait_storecnt 0x0 1073; GFX12-NEXT: s_endpgm 1074 %val = atomicrmw volatile add ptr addrspace(3) %local, i32 5 seq_cst 1075 store i32 %val, ptr addrspace(1) %out 1076 ret void 1077} 1078 1079declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32, ptr addrspace(8), i32, i32, i32 immarg) 1080 1081; from atomic_optimizations_buffer.ll 1082; covers buffer_atomic (atomic with return) 1083; 1084define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace(8) %inout) { 1085; GFX9-LABEL: add_i32_constant: 1086; GFX9: ; %bb.0: ; %entry 1087; GFX9-NEXT: s_mov_b64 s[2:3], exec 1088; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1089; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1090; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1091; GFX9-NEXT: ; implicit-def: $vgpr1 1092; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 1093; GFX9-NEXT: s_cbranch_execz .LBB8_2 1094; GFX9-NEXT: ; %bb.1: 1095; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 1096; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1097; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1098; GFX9-NEXT: s_mul_i32 s2, s2, 5 1099; GFX9-NEXT: v_mov_b32_e32 v1, s2 1100; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 1101; GFX9-NEXT: s_waitcnt vmcnt(0) 1102; GFX9-NEXT: .LBB8_2: 1103; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 1104; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1105; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1106; GFX9-NEXT: v_readfirstlane_b32 s2, v1 1107; GFX9-NEXT: v_mov_b32_e32 v2, 0 1108; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2 1109; GFX9-NEXT: global_store_dword v2, v0, s[0:1] 1110; GFX9-NEXT: s_waitcnt 
vmcnt(0) 1111; GFX9-NEXT: s_endpgm 1112; 1113; GFX90A-LABEL: add_i32_constant: 1114; GFX90A: ; %bb.0: ; %entry 1115; GFX90A-NEXT: s_mov_b64 s[2:3], exec 1116; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1117; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1118; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1119; GFX90A-NEXT: ; implicit-def: $vgpr1 1120; GFX90A-NEXT: s_and_saveexec_b64 s[0:1], vcc 1121; GFX90A-NEXT: s_cbranch_execz .LBB8_2 1122; GFX90A-NEXT: ; %bb.1: 1123; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 1124; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 1125; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1126; GFX90A-NEXT: s_mul_i32 s2, s2, 5 1127; GFX90A-NEXT: v_mov_b32_e32 v1, s2 1128; GFX90A-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 1129; GFX90A-NEXT: s_waitcnt vmcnt(0) 1130; GFX90A-NEXT: .LBB8_2: 1131; GFX90A-NEXT: s_or_b64 exec, exec, s[0:1] 1132; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1133; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 1134; GFX90A-NEXT: v_readfirstlane_b32 s2, v1 1135; GFX90A-NEXT: v_mov_b32_e32 v2, 0 1136; GFX90A-NEXT: v_mad_u32_u24 v0, v0, 5, s2 1137; GFX90A-NEXT: global_store_dword v2, v0, s[0:1] 1138; GFX90A-NEXT: s_waitcnt vmcnt(0) 1139; GFX90A-NEXT: s_endpgm 1140; 1141; GFX10-LABEL: add_i32_constant: 1142; GFX10: ; %bb.0: ; %entry 1143; GFX10-NEXT: s_mov_b32 s1, exec_lo 1144; GFX10-NEXT: ; implicit-def: $vgpr1 1145; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 1146; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1147; GFX10-NEXT: s_and_saveexec_b32 s0, vcc_lo 1148; GFX10-NEXT: s_cbranch_execz .LBB8_2 1149; GFX10-NEXT: ; %bb.1: 1150; GFX10-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 1151; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1152; GFX10-NEXT: s_bcnt1_i32_b32 s1, s1 1153; GFX10-NEXT: s_mul_i32 s1, s1, 5 1154; GFX10-NEXT: v_mov_b32_e32 v1, s1 1155; GFX10-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 1156; GFX10-NEXT: s_waitcnt vmcnt(0) 1157; GFX10-NEXT: .LBB8_2: 1158; GFX10-NEXT: s_waitcnt_depctr 0xffe3 1159; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 
1160; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1161; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1162; GFX10-NEXT: v_readfirstlane_b32 s2, v1 1163; GFX10-NEXT: v_mov_b32_e32 v1, 0 1164; GFX10-NEXT: v_mad_u32_u24 v0, v0, 5, s2 1165; GFX10-NEXT: global_store_dword v1, v0, s[0:1] 1166; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1167; GFX10-NEXT: s_endpgm 1168; 1169; GFX9-FLATSCR-LABEL: add_i32_constant: 1170; GFX9-FLATSCR: ; %bb.0: ; %entry 1171; GFX9-FLATSCR-NEXT: s_mov_b64 s[2:3], exec 1172; GFX9-FLATSCR-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1173; GFX9-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1174; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1175; GFX9-FLATSCR-NEXT: ; implicit-def: $vgpr1 1176; GFX9-FLATSCR-NEXT: s_and_saveexec_b64 s[0:1], vcc 1177; GFX9-FLATSCR-NEXT: s_cbranch_execz .LBB8_2 1178; GFX9-FLATSCR-NEXT: ; %bb.1: 1179; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 1180; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) 1181; GFX9-FLATSCR-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1182; GFX9-FLATSCR-NEXT: s_mul_i32 s2, s2, 5 1183; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, s2 1184; GFX9-FLATSCR-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 1185; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) 1186; GFX9-FLATSCR-NEXT: .LBB8_2: 1187; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[0:1] 1188; GFX9-FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1189; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) 1190; GFX9-FLATSCR-NEXT: v_readfirstlane_b32 s2, v1 1191; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0 1192; GFX9-FLATSCR-NEXT: v_mad_u32_u24 v0, v0, 5, s2 1193; GFX9-FLATSCR-NEXT: global_store_dword v2, v0, s[0:1] 1194; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) 1195; GFX9-FLATSCR-NEXT: s_endpgm 1196; 1197; GFX11-LABEL: add_i32_constant: 1198; GFX11: ; %bb.0: ; %entry 1199; GFX11-NEXT: s_mov_b32 s1, exec_lo 1200; GFX11-NEXT: s_mov_b32 s0, exec_lo 1201; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 1202; GFX11-NEXT: ; implicit-def: $vgpr1 1203; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1204; GFX11-NEXT: 
v_cmpx_eq_u32_e32 0, v0 1205; GFX11-NEXT: s_cbranch_execz .LBB8_2 1206; GFX11-NEXT: ; %bb.1: 1207; GFX11-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 1208; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1209; GFX11-NEXT: s_bcnt1_i32_b32 s1, s1 1210; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 1211; GFX11-NEXT: s_mul_i32 s1, s1, 5 1212; GFX11-NEXT: v_mov_b32_e32 v1, s1 1213; GFX11-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc 1214; GFX11-NEXT: s_waitcnt vmcnt(0) 1215; GFX11-NEXT: .LBB8_2: 1216; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 1217; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1218; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1219; GFX11-NEXT: v_readfirstlane_b32 s2, v1 1220; GFX11-NEXT: v_mov_b32_e32 v1, 0 1221; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 1222; GFX11-NEXT: v_mad_u32_u24 v0, v0, 5, s2 1223; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] 1224; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1225; GFX11-NEXT: s_endpgm 1226; 1227; GFX12-LABEL: add_i32_constant: 1228; GFX12: ; %bb.0: ; %entry 1229; GFX12-NEXT: s_mov_b32 s1, exec_lo 1230; GFX12-NEXT: s_mov_b32 s0, exec_lo 1231; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 1232; GFX12-NEXT: ; implicit-def: $vgpr1 1233; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 1234; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v0 1235; GFX12-NEXT: s_cbranch_execz .LBB8_2 1236; GFX12-NEXT: ; %bb.1: 1237; GFX12-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 1238; GFX12-NEXT: s_wait_kmcnt 0x0 1239; GFX12-NEXT: s_wait_alu 0xfffe 1240; GFX12-NEXT: s_bcnt1_i32_b32 s1, s1 1241; GFX12-NEXT: s_wait_alu 0xfffe 1242; GFX12-NEXT: s_mul_i32 s1, s1, 5 1243; GFX12-NEXT: s_wait_alu 0xfffe 1244; GFX12-NEXT: v_mov_b32_e32 v1, s1 1245; GFX12-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN 1246; GFX12-NEXT: s_wait_loadcnt 0x0 1247; GFX12-NEXT: .LBB8_2: 1248; GFX12-NEXT: s_wait_alu 0xfffe 1249; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 1250; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1251; GFX12-NEXT: s_wait_kmcnt 0x0 1252; 
GFX12-NEXT: v_readfirstlane_b32 s2, v1 1253; GFX12-NEXT: v_mov_b32_e32 v1, 0 1254; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) 1255; GFX12-NEXT: v_mad_u32_u24 v0, v0, 5, s2 1256; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] 1257; GFX12-NEXT: s_wait_storecnt 0x0 1258; GFX12-NEXT: s_endpgm 1259entry: 1260 %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32 5, ptr addrspace(8) %inout, i32 0, i32 0, i32 0) 1261 store i32 %old, ptr addrspace(1) %out 1262 ret void 1263} 1264 1265declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32, i16, <8 x i32>, i32, i32) 1266 1267; from llvm.amdgcn.image.load.a16.ll 1268; covers image_load 1269; 1270define amdgpu_ps <4 x float> @load.f32.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) { 1271; GFX9-LABEL: load.f32.1d: 1272; GFX9: ; %bb.0: ; %main_body 1273; GFX9-NEXT: image_load v0, v0, s[0:7] dmask:0x1 unorm a16 1274; GFX9-NEXT: s_waitcnt vmcnt(0) 1275; GFX9-NEXT: ; return to shader part epilog 1276; 1277; GFX90A-LABEL: load.f32.1d: 1278; GFX90A: ; %bb.0: ; %main_body 1279; GFX90A-NEXT: image_load v0, v0, s[0:7] dmask:0x1 unorm a16 1280; GFX90A-NEXT: s_waitcnt vmcnt(0) 1281; GFX90A-NEXT: ; return to shader part epilog 1282; 1283; GFX10-LABEL: load.f32.1d: 1284; GFX10: ; %bb.0: ; %main_body 1285; GFX10-NEXT: image_load v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm a16 1286; GFX10-NEXT: s_waitcnt vmcnt(0) 1287; GFX10-NEXT: ; return to shader part epilog 1288; 1289; GFX9-FLATSCR-LABEL: load.f32.1d: 1290; GFX9-FLATSCR: ; %bb.0: ; %main_body 1291; GFX9-FLATSCR-NEXT: s_mov_b32 s11, s9 1292; GFX9-FLATSCR-NEXT: s_mov_b32 s10, s8 1293; GFX9-FLATSCR-NEXT: s_mov_b32 s9, s7 1294; GFX9-FLATSCR-NEXT: s_mov_b32 s8, s6 1295; GFX9-FLATSCR-NEXT: s_mov_b32 s7, s5 1296; GFX9-FLATSCR-NEXT: s_mov_b32 s6, s4 1297; GFX9-FLATSCR-NEXT: s_mov_b32 s5, s3 1298; GFX9-FLATSCR-NEXT: s_mov_b32 s4, s2 1299; GFX9-FLATSCR-NEXT: image_load v0, v0, s[4:11] dmask:0x1 unorm a16 1300; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) 1301; GFX9-FLATSCR-NEXT: ; return to 
shader part epilog 1302; 1303; GFX11-LABEL: load.f32.1d: 1304; GFX11: ; %bb.0: ; %main_body 1305; GFX11-NEXT: image_load v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm a16 1306; GFX11-NEXT: s_waitcnt vmcnt(0) 1307; GFX11-NEXT: ; return to shader part epilog 1308; 1309; GFX12-LABEL: load.f32.1d: 1310; GFX12: ; %bb.0: ; %main_body 1311; GFX12-NEXT: image_load v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D a16 1312; GFX12-NEXT: s_wait_loadcnt 0x0 1313; GFX12-NEXT: ; return to shader part epilog 1314main_body: 1315 %x = extractelement <2 x i16> %coords, i32 0 1316 %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32 1, i16 %x, <8 x i32> %rsrc, i32 0, i32 0) 1317 ret <4 x float> %v 1318} 1319 1320declare void @llvm.amdgcn.image.store.1d.v4f32.i16(<4 x float>, i32, i16, <8 x i32>, i32, i32) 1321 1322; from llvm.amdgcn.image.store.a16.ll 1323; covers image_store 1324; 1325define amdgpu_ps void @store_f32_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <4 x float> %val) { 1326; GFX9-LABEL: store_f32_1d: 1327; GFX9: ; %bb.0: ; %main_body 1328; GFX9-NEXT: image_store v[1:4], v0, s[0:7] dmask:0x1 unorm a16 1329; GFX9-NEXT: s_waitcnt vmcnt(0) 1330; GFX9-NEXT: s_endpgm 1331; 1332; GFX90A-LABEL: store_f32_1d: 1333; GFX90A: ; %bb.0: ; %main_body 1334; GFX90A-NEXT: v_mov_b32_e32 v5, v4 1335; GFX90A-NEXT: v_mov_b32_e32 v4, v3 1336; GFX90A-NEXT: v_mov_b32_e32 v3, v2 1337; GFX90A-NEXT: v_mov_b32_e32 v2, v1 1338; GFX90A-NEXT: image_store v[2:5], v0, s[0:7] dmask:0x1 unorm a16 1339; GFX90A-NEXT: s_waitcnt vmcnt(0) 1340; GFX90A-NEXT: s_endpgm 1341; 1342; GFX10-LABEL: store_f32_1d: 1343; GFX10: ; %bb.0: ; %main_body 1344; GFX10-NEXT: image_store v[1:4], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm a16 1345; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1346; GFX10-NEXT: s_endpgm 1347; 1348; GFX9-FLATSCR-LABEL: store_f32_1d: 1349; GFX9-FLATSCR: ; %bb.0: ; %main_body 1350; GFX9-FLATSCR-NEXT: s_mov_b32 s11, s9 1351; GFX9-FLATSCR-NEXT: s_mov_b32 s10, s8 1352; GFX9-FLATSCR-NEXT: s_mov_b32 s9, 
s7 1353; GFX9-FLATSCR-NEXT: s_mov_b32 s8, s6 1354; GFX9-FLATSCR-NEXT: s_mov_b32 s7, s5 1355; GFX9-FLATSCR-NEXT: s_mov_b32 s6, s4 1356; GFX9-FLATSCR-NEXT: s_mov_b32 s5, s3 1357; GFX9-FLATSCR-NEXT: s_mov_b32 s4, s2 1358; GFX9-FLATSCR-NEXT: image_store v[1:4], v0, s[4:11] dmask:0x1 unorm a16 1359; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) 1360; GFX9-FLATSCR-NEXT: s_endpgm 1361; 1362; GFX11-LABEL: store_f32_1d: 1363; GFX11: ; %bb.0: ; %main_body 1364; GFX11-NEXT: image_store v[1:4], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm a16 1365; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1366; GFX11-NEXT: s_endpgm 1367; 1368; GFX12-LABEL: store_f32_1d: 1369; GFX12: ; %bb.0: ; %main_body 1370; GFX12-NEXT: image_store v[1:4], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D a16 1371; GFX12-NEXT: s_wait_storecnt 0x0 1372; GFX12-NEXT: s_endpgm 1373 1374main_body: 1375 %x = extractelement <2 x i16> %coords, i32 0 1376 call void @llvm.amdgcn.image.store.1d.v4f32.i16(<4 x float> %val, i32 1, i16 %x, <8 x i32> %rsrc, i32 0, i32 0) 1377 ret void 1378} 1379 1380declare i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) 1381 1382; from llvm.amdgcn.image.atomic.dim.ll 1383; covers image_atomic (atomic with return) 1384; 1385define amdgpu_ps float @atomic_swap_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { 1386; GFX9-LABEL: atomic_swap_1d: 1387; GFX9: ; %bb.0: ; %main_body 1388; GFX9-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 unorm glc 1389; GFX9-NEXT: s_waitcnt vmcnt(0) 1390; GFX9-NEXT: ; return to shader part epilog 1391; 1392; GFX90A-LABEL: atomic_swap_1d: 1393; GFX90A: ; %bb.0: ; %main_body 1394; GFX90A-NEXT: v_mov_b32_e32 v2, v1 1395; GFX90A-NEXT: image_atomic_swap v0, v2, s[0:7] dmask:0x1 unorm glc 1396; GFX90A-NEXT: s_waitcnt vmcnt(0) 1397; GFX90A-NEXT: ; return to shader part epilog 1398; 1399; GFX10-LABEL: atomic_swap_1d: 1400; GFX10: ; %bb.0: ; %main_body 1401; GFX10-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc 1402; GFX10-NEXT: 
s_waitcnt vmcnt(0) 1403; GFX10-NEXT: ; return to shader part epilog 1404; 1405; GFX9-FLATSCR-LABEL: atomic_swap_1d: 1406; GFX9-FLATSCR: ; %bb.0: ; %main_body 1407; GFX9-FLATSCR-NEXT: s_mov_b32 s11, s9 1408; GFX9-FLATSCR-NEXT: s_mov_b32 s10, s8 1409; GFX9-FLATSCR-NEXT: s_mov_b32 s9, s7 1410; GFX9-FLATSCR-NEXT: s_mov_b32 s8, s6 1411; GFX9-FLATSCR-NEXT: s_mov_b32 s7, s5 1412; GFX9-FLATSCR-NEXT: s_mov_b32 s6, s4 1413; GFX9-FLATSCR-NEXT: s_mov_b32 s5, s3 1414; GFX9-FLATSCR-NEXT: s_mov_b32 s4, s2 1415; GFX9-FLATSCR-NEXT: image_atomic_swap v0, v1, s[4:11] dmask:0x1 unorm glc 1416; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) 1417; GFX9-FLATSCR-NEXT: ; return to shader part epilog 1418; 1419; GFX11-LABEL: atomic_swap_1d: 1420; GFX11: ; %bb.0: ; %main_body 1421; GFX11-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc 1422; GFX11-NEXT: s_waitcnt vmcnt(0) 1423; GFX11-NEXT: ; return to shader part epilog 1424; 1425; GFX12-LABEL: atomic_swap_1d: 1426; GFX12: ; %bb.0: ; %main_body 1427; GFX12-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN 1428; GFX12-NEXT: s_wait_loadcnt 0x0 1429; GFX12-NEXT: ; return to shader part epilog 1430main_body: 1431 %v = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) 1432 %out = bitcast i32 %v to float 1433 ret float %out 1434} 1435 1436; from lds-bounds.ll 1437; covers ds_write_b64 (non-atomic LDS store) 1438@compute_lds = external addrspace(3) global [512 x i32], align 16 1439; 1440define amdgpu_cs void @store_aligned(ptr addrspace(3) %ptr) #0 { 1441; GFX9-LABEL: store_aligned: 1442; GFX9: ; %bb.0: ; %entry 1443; GFX9-NEXT: v_mov_b32_e32 v1, 42 1444; GFX9-NEXT: v_mov_b32_e32 v2, 43 1445; GFX9-NEXT: ds_write_b64 v0, v[1:2] 1446; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1447; GFX9-NEXT: s_endpgm 1448; 1449; GFX90A-LABEL: store_aligned: 1450; GFX90A: ; %bb.0: ; %entry 1451; GFX90A-NEXT: v_mov_b32_e32 v2, 42 1452; GFX90A-NEXT: v_mov_b32_e32 v3, 43
1453; GFX90A-NEXT: ds_write_b64 v0, v[2:3] 1454; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 1455; GFX90A-NEXT: s_endpgm 1456; 1457; GFX10-LABEL: store_aligned: 1458; GFX10: ; %bb.0: ; %entry 1459; GFX10-NEXT: v_mov_b32_e32 v1, 42 1460; GFX10-NEXT: v_mov_b32_e32 v2, 43 1461; GFX10-NEXT: ds_write_b64 v0, v[1:2] 1462; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1463; GFX10-NEXT: s_endpgm 1464; 1465; GFX9-FLATSCR-LABEL: store_aligned: 1466; GFX9-FLATSCR: ; %bb.0: ; %entry 1467; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 42 1468; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 43 1469; GFX9-FLATSCR-NEXT: ds_write_b64 v0, v[1:2] 1470; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) 1471; GFX9-FLATSCR-NEXT: s_endpgm 1472; 1473; GFX11-LABEL: store_aligned: 1474; GFX11: ; %bb.0: ; %entry 1475; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v2, 43 1476; GFX11-NEXT: ds_store_b64 v0, v[1:2] 1477; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1478; GFX11-NEXT: s_endpgm 1479; 1480; GFX12-LABEL: store_aligned: 1481; GFX12: ; %bb.0: ; %entry 1482; GFX12-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v2, 43 1483; GFX12-NEXT: ds_store_b64 v0, v[1:2] 1484; GFX12-NEXT: s_wait_dscnt 0x0 1485; GFX12-NEXT: s_endpgm 1486entry: 1487 %ptr.gep.1 = getelementptr i32, ptr addrspace(3) %ptr, i32 1 1488 1489 store i32 42, ptr addrspace(3) %ptr, align 8 1490 store i32 43, ptr addrspace(3) %ptr.gep.1 1491 ret void 1492} 1493 1494 1495; from lds-bounds.ll 1496; covers ds_read_b64 1497; 1498define amdgpu_cs <2 x float> @load_aligned(ptr addrspace(3) %ptr) #0 { 1499; GFX9-LABEL: load_aligned: 1500; GFX9: ; %bb.0: ; %entry 1501; GFX9-NEXT: ds_read_b64 v[0:1], v0 1502; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1503; GFX9-NEXT: ; return to shader part epilog 1504; 1505; GFX90A-LABEL: load_aligned: 1506; GFX90A: ; %bb.0: ; %entry 1507; GFX90A-NEXT: ds_read_b64 v[0:1], v0 1508; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 1509; GFX90A-NEXT: ; return to shader part epilog 1510; 1511; GFX10-LABEL: load_aligned: 1512; GFX10: ; %bb.0: ; %entry 1513; GFX10-NEXT: ds_read_b64 v[0:1], 
v0 1514; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1515; GFX10-NEXT: ; return to shader part epilog 1516; 1517; GFX9-FLATSCR-LABEL: load_aligned: 1518; GFX9-FLATSCR: ; %bb.0: ; %entry 1519; GFX9-FLATSCR-NEXT: ds_read_b64 v[0:1], v0 1520; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) 1521; GFX9-FLATSCR-NEXT: ; return to shader part epilog 1522; 1523; GFX11-LABEL: load_aligned: 1524; GFX11: ; %bb.0: ; %entry 1525; GFX11-NEXT: ds_load_b64 v[0:1], v0 1526; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1527; GFX11-NEXT: ; return to shader part epilog 1528; 1529; GFX12-LABEL: load_aligned: 1530; GFX12: ; %bb.0: ; %entry 1531; GFX12-NEXT: ds_load_b64 v[0:1], v0 1532; GFX12-NEXT: s_wait_dscnt 0x0 1533; GFX12-NEXT: ; return to shader part epilog 1534entry: 1535 %ptr.gep.1 = getelementptr i32, ptr addrspace(3) %ptr, i32 1 1536 1537 %v.0 = load i32, ptr addrspace(3) %ptr, align 8 1538 %v.1 = load i32, ptr addrspace(3) %ptr.gep.1 1539 1540 %r.0 = insertelement <2 x i32> poison, i32 %v.0, i32 0 1541 %r.1 = insertelement <2 x i32> %r.0, i32 %v.1, i32 1 1542 %bc = bitcast <2 x i32> %r.1 to <2 x float> 1543 ret <2 x float> %bc 1544} 1545 1546; from lds-bounds.ll 1547; covers ds_write2_b32 1548; 1549define amdgpu_cs void @store_global_const_idx() #0 { 1550; GFX9-LABEL: store_global_const_idx: 1551; GFX9: ; %bb.0: ; %entry 1552; GFX9-NEXT: v_mov_b32_e32 v0, compute_lds@abs32@lo 1553; GFX9-NEXT: v_mov_b32_e32 v1, 42 1554; GFX9-NEXT: v_mov_b32_e32 v2, 43 1555; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset0:3 offset1:4 1556; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1557; GFX9-NEXT: s_endpgm 1558; 1559; GFX90A-LABEL: store_global_const_idx: 1560; GFX90A: ; %bb.0: ; %entry 1561; GFX90A-NEXT: v_mov_b32_e32 v0, compute_lds@abs32@lo 1562; GFX90A-NEXT: v_mov_b32_e32 v1, 42 1563; GFX90A-NEXT: v_mov_b32_e32 v2, 43 1564; GFX90A-NEXT: ds_write2_b32 v0, v1, v2 offset0:3 offset1:4 1565; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 1566; GFX90A-NEXT: s_endpgm 1567; 1568; GFX10-LABEL: store_global_const_idx: 1569; GFX10: ; %bb.0: ; %entry 1570; 
GFX10-NEXT: v_mov_b32_e32 v0, compute_lds@abs32@lo 1571; GFX10-NEXT: v_mov_b32_e32 v1, 42 1572; GFX10-NEXT: v_mov_b32_e32 v2, 43 1573; GFX10-NEXT: ds_write2_b32 v0, v1, v2 offset0:3 offset1:4 1574; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1575; GFX10-NEXT: s_endpgm 1576; 1577; GFX9-FLATSCR-LABEL: store_global_const_idx: 1578; GFX9-FLATSCR: ; %bb.0: ; %entry 1579; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, compute_lds@abs32@lo 1580; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 42 1581; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 43 1582; GFX9-FLATSCR-NEXT: ds_write2_b32 v0, v1, v2 offset0:3 offset1:4 1583; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) 1584; GFX9-FLATSCR-NEXT: s_endpgm 1585; 1586; GFX11-LABEL: store_global_const_idx: 1587; GFX11: ; %bb.0: ; %entry 1588; GFX11-NEXT: v_dual_mov_b32 v0, compute_lds@abs32@lo :: v_dual_mov_b32 v1, 42 1589; GFX11-NEXT: v_mov_b32_e32 v2, 43 1590; GFX11-NEXT: ds_store_2addr_b32 v0, v1, v2 offset0:3 offset1:4 1591; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1592; GFX11-NEXT: s_endpgm 1593; 1594; GFX12-LABEL: store_global_const_idx: 1595; GFX12: ; %bb.0: ; %entry 1596; GFX12-NEXT: v_dual_mov_b32 v0, compute_lds@abs32@lo :: v_dual_mov_b32 v1, 42 1597; GFX12-NEXT: v_mov_b32_e32 v2, 43 1598; GFX12-NEXT: ds_store_2addr_b32 v0, v1, v2 offset0:3 offset1:4 1599; GFX12-NEXT: s_wait_dscnt 0x0 1600; GFX12-NEXT: s_endpgm 1601entry: 1602 %ptr.a = getelementptr [512 x i32], ptr addrspace(3) @compute_lds, i32 0, i32 3 1603 %ptr.b = getelementptr [512 x i32], ptr addrspace(3) @compute_lds, i32 0, i32 4 1604 1605 store i32 42, ptr addrspace(3) %ptr.a 1606 store i32 43, ptr addrspace(3) %ptr.b 1607 ret void 1608} 1609 1610; from lds-bounds.ll 1611; covers ds_read2_b32 1612; 1613define amdgpu_cs <2 x float> @load_global_const_idx() #0 { 1614; GFX9-LABEL: load_global_const_idx: 1615; GFX9: ; %bb.0: ; %entry 1616; GFX9-NEXT: v_mov_b32_e32 v0, compute_lds@abs32@lo 1617; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset0:3 offset1:4 1618; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1619; GFX9-NEXT: ; return to 
shader part epilog 1620; 1621; GFX90A-LABEL: load_global_const_idx: 1622; GFX90A: ; %bb.0: ; %entry 1623; GFX90A-NEXT: v_mov_b32_e32 v0, compute_lds@abs32@lo 1624; GFX90A-NEXT: ds_read2_b32 v[0:1], v0 offset0:3 offset1:4 1625; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 1626; GFX90A-NEXT: ; return to shader part epilog 1627; 1628; GFX10-LABEL: load_global_const_idx: 1629; GFX10: ; %bb.0: ; %entry 1630; GFX10-NEXT: v_mov_b32_e32 v0, compute_lds@abs32@lo 1631; GFX10-NEXT: ds_read2_b32 v[0:1], v0 offset0:3 offset1:4 1632; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1633; GFX10-NEXT: ; return to shader part epilog 1634; 1635; GFX9-FLATSCR-LABEL: load_global_const_idx: 1636; GFX9-FLATSCR: ; %bb.0: ; %entry 1637; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, compute_lds@abs32@lo 1638; GFX9-FLATSCR-NEXT: ds_read2_b32 v[0:1], v0 offset0:3 offset1:4 1639; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) 1640; GFX9-FLATSCR-NEXT: ; return to shader part epilog 1641; 1642; GFX11-LABEL: load_global_const_idx: 1643; GFX11: ; %bb.0: ; %entry 1644; GFX11-NEXT: v_mov_b32_e32 v0, compute_lds@abs32@lo 1645; GFX11-NEXT: ds_load_2addr_b32 v[0:1], v0 offset0:3 offset1:4 1646; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1647; GFX11-NEXT: ; return to shader part epilog 1648; 1649; GFX12-LABEL: load_global_const_idx: 1650; GFX12: ; %bb.0: ; %entry 1651; GFX12-NEXT: v_mov_b32_e32 v0, compute_lds@abs32@lo 1652; GFX12-NEXT: ds_load_2addr_b32 v[0:1], v0 offset0:3 offset1:4 1653; GFX12-NEXT: s_wait_dscnt 0x0 1654; GFX12-NEXT: ; return to shader part epilog 1655entry: 1656 %ptr.a = getelementptr [512 x i32], ptr addrspace(3) @compute_lds, i32 0, i32 3 1657 %ptr.b = getelementptr [512 x i32], ptr addrspace(3) @compute_lds, i32 0, i32 4 1658 1659 %v.0 = load i32, ptr addrspace(3) %ptr.a 1660 %v.1 = load i32, ptr addrspace(3) %ptr.b 1661 1662 %r.0 = insertelement <2 x i32> poison, i32 %v.0, i32 0 1663 %r.1 = insertelement <2 x i32> %r.0, i32 %v.1, i32 1 1664 %bc = bitcast <2 x i32> %r.1 to <2 x float> 1665 ret <2 x float> %bc 1666} 1667