1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck --check-prefix=GFX9 %s 3; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck --check-prefix=GFX10 %s 4; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck --check-prefix=GFX11 %s 5; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck --check-prefix=GFX12 %s 6; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck --check-prefix=GFX9-PAL %s 7; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -mattr=-promote-alloca < %s | FileCheck -check-prefixes=GFX940 %s 8; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck --check-prefixes=GFX10-PAL,GFX1010-PAL %s 9; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck --check-prefixes=GFX10-PAL,GFX1030-PAL %s 10; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck --check-prefixes=GFX11-PAL %s 11; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck --check-prefixes=GFX12-PAL %s 12 13define amdgpu_kernel void @zero_init_kernel() { 14; GFX9-LABEL: zero_init_kernel: 15; GFX9: ; %bb.0: 16; GFX9-NEXT: s_mov_b32 s0, 0 17; GFX9-NEXT: s_add_u32 flat_scratch_lo, s8, s13 18; GFX9-NEXT: s_mov_b32 s1, s0 19; GFX9-NEXT: s_mov_b32 s2, s0 20; GFX9-NEXT: s_mov_b32 s3, s0 21; GFX9-NEXT: v_mov_b32_e32 v0, s0 22; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 23; GFX9-NEXT: v_mov_b32_e32 v1, s1 24; GFX9-NEXT: v_mov_b32_e32 v2, s2 25; GFX9-NEXT: v_mov_b32_e32 v3, s3 26; GFX9-NEXT: scratch_store_dwordx4 off, 
v[0:3], s0 offset:48 27; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:32 28; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16 29; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 30; GFX9-NEXT: s_endpgm 31; 32; GFX10-LABEL: zero_init_kernel: 33; GFX10: ; %bb.0: 34; GFX10-NEXT: s_add_u32 s8, s8, s13 35; GFX10-NEXT: s_addc_u32 s9, s9, 0 36; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 37; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 38; GFX10-NEXT: s_mov_b32 s0, 0 39; GFX10-NEXT: s_mov_b32 s1, s0 40; GFX10-NEXT: s_mov_b32 s2, s0 41; GFX10-NEXT: s_mov_b32 s3, s0 42; GFX10-NEXT: v_mov_b32_e32 v0, s0 43; GFX10-NEXT: v_mov_b32_e32 v1, s1 44; GFX10-NEXT: v_mov_b32_e32 v2, s2 45; GFX10-NEXT: v_mov_b32_e32 v3, s3 46; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:48 47; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:32 48; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:16 49; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off 50; GFX10-NEXT: s_endpgm 51; 52; GFX11-LABEL: zero_init_kernel: 53; GFX11: ; %bb.0: 54; GFX11-NEXT: s_mov_b32 s0, 0 55; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 56; GFX11-NEXT: s_mov_b32 s1, s0 57; GFX11-NEXT: s_mov_b32 s2, s0 58; GFX11-NEXT: s_mov_b32 s3, s0 59; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 60; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 61; GFX11-NEXT: s_clause 0x3 62; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:48 63; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:32 64; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:16 65; GFX11-NEXT: scratch_store_b128 off, v[0:3], off 66; GFX11-NEXT: s_endpgm 67; 68; GFX12-LABEL: zero_init_kernel: 69; GFX12: ; %bb.0: 70; GFX12-NEXT: s_mov_b32 s0, 0 71; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 72; GFX12-NEXT: s_mov_b32 s1, s0 73; GFX12-NEXT: s_mov_b32 s2, s0 74; GFX12-NEXT: s_mov_b32 s3, s0 75; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 76; 
GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 77; GFX12-NEXT: s_clause 0x3 78; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:48 79; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:32 80; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:16 81; GFX12-NEXT: scratch_store_b128 off, v[0:3], off 82; GFX12-NEXT: s_endpgm 83; 84; GFX9-PAL-LABEL: zero_init_kernel: 85; GFX9-PAL: ; %bb.0: 86; GFX9-PAL-NEXT: s_getpc_b64 s[12:13] 87; GFX9-PAL-NEXT: s_mov_b32 s12, s0 88; GFX9-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0 89; GFX9-PAL-NEXT: s_mov_b32 s0, 0 90; GFX9-PAL-NEXT: s_mov_b32 s1, s0 91; GFX9-PAL-NEXT: s_mov_b32 s2, s0 92; GFX9-PAL-NEXT: s_mov_b32 s3, s0 93; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 94; GFX9-PAL-NEXT: s_and_b32 s13, s13, 0xffff 95; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s12, s11 96; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 97; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 98; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 99; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 100; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 101; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:48 102; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:32 103; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16 104; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 105; GFX9-PAL-NEXT: s_endpgm 106; 107; GFX940-LABEL: zero_init_kernel: 108; GFX940: ; %bb.0: 109; GFX940-NEXT: s_mov_b32 s0, 0 110; GFX940-NEXT: s_mov_b32 s1, s0 111; GFX940-NEXT: s_mov_b32 s2, s0 112; GFX940-NEXT: s_mov_b32 s3, s0 113; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 114; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3] 115; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:48 sc0 sc1 116; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:32 sc0 sc1 117; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:16 sc0 sc1 118; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off sc0 sc1 119; GFX940-NEXT: s_endpgm 120; 121; GFX1010-PAL-LABEL: 
zero_init_kernel: 122; GFX1010-PAL: ; %bb.0: 123; GFX1010-PAL-NEXT: s_getpc_b64 s[12:13] 124; GFX1010-PAL-NEXT: s_mov_b32 s12, s0 125; GFX1010-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0 126; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 127; GFX1010-PAL-NEXT: s_and_b32 s13, s13, 0xffff 128; GFX1010-PAL-NEXT: s_add_u32 s12, s12, s11 129; GFX1010-PAL-NEXT: s_addc_u32 s13, s13, 0 130; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 131; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 132; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 133; GFX1010-PAL-NEXT: s_mov_b32 s1, s0 134; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 135; GFX1010-PAL-NEXT: s_mov_b32 s3, s0 136; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, s0 137; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, s1 138; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, s2 139; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, s3 140; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:48 141; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:32 142; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16 143; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 144; GFX1010-PAL-NEXT: s_endpgm 145; 146; GFX1030-PAL-LABEL: zero_init_kernel: 147; GFX1030-PAL: ; %bb.0: 148; GFX1030-PAL-NEXT: s_getpc_b64 s[12:13] 149; GFX1030-PAL-NEXT: s_mov_b32 s12, s0 150; GFX1030-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0 151; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 152; GFX1030-PAL-NEXT: s_and_b32 s13, s13, 0xffff 153; GFX1030-PAL-NEXT: s_add_u32 s12, s12, s11 154; GFX1030-PAL-NEXT: s_addc_u32 s13, s13, 0 155; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 156; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 157; GFX1030-PAL-NEXT: s_mov_b32 s0, 0 158; GFX1030-PAL-NEXT: s_mov_b32 s1, s0 159; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 160; GFX1030-PAL-NEXT: s_mov_b32 s3, s0 161; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, s0 162; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, s1 163; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, s2 164; 
GFX1030-PAL-NEXT: v_mov_b32_e32 v3, s3 165; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:48 166; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:32 167; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:16 168; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off 169; GFX1030-PAL-NEXT: s_endpgm 170; 171; GFX11-PAL-LABEL: zero_init_kernel: 172; GFX11-PAL: ; %bb.0: 173; GFX11-PAL-NEXT: s_mov_b32 s0, 0 174; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 175; GFX11-PAL-NEXT: s_mov_b32 s1, s0 176; GFX11-PAL-NEXT: s_mov_b32 s2, s0 177; GFX11-PAL-NEXT: s_mov_b32 s3, s0 178; GFX11-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 179; GFX11-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 180; GFX11-PAL-NEXT: s_clause 0x3 181; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:48 182; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:32 183; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:16 184; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off 185; GFX11-PAL-NEXT: s_endpgm 186; 187; GFX12-PAL-LABEL: zero_init_kernel: 188; GFX12-PAL: ; %bb.0: 189; GFX12-PAL-NEXT: s_mov_b32 s0, 0 190; GFX12-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 191; GFX12-PAL-NEXT: s_mov_b32 s1, s0 192; GFX12-PAL-NEXT: s_mov_b32 s2, s0 193; GFX12-PAL-NEXT: s_mov_b32 s3, s0 194; GFX12-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 195; GFX12-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 196; GFX12-PAL-NEXT: s_clause 0x3 197; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:48 198; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:32 199; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:16 200; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off 201; GFX12-PAL-NEXT: s_endpgm 202 %alloca = alloca [32 x i16], align 2, addrspace(5) 203 call void @llvm.memset.p5.i64(ptr addrspace(5) align 2 dereferenceable(64) %alloca, i8 0, i64 64, i1 false) 204 ret void 
205} 206 207define void @zero_init_foo() { 208; GFX9-LABEL: zero_init_foo: 209; GFX9: ; %bb.0: 210; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 211; GFX9-NEXT: s_mov_b32 s0, 0 212; GFX9-NEXT: s_mov_b32 s1, s0 213; GFX9-NEXT: s_mov_b32 s2, s0 214; GFX9-NEXT: s_mov_b32 s3, s0 215; GFX9-NEXT: v_mov_b32_e32 v0, s0 216; GFX9-NEXT: v_mov_b32_e32 v1, s1 217; GFX9-NEXT: v_mov_b32_e32 v2, s2 218; GFX9-NEXT: v_mov_b32_e32 v3, s3 219; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:48 220; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:32 221; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16 222; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 223; GFX9-NEXT: s_waitcnt vmcnt(0) 224; GFX9-NEXT: s_setpc_b64 s[30:31] 225; 226; GFX10-LABEL: zero_init_foo: 227; GFX10: ; %bb.0: 228; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 229; GFX10-NEXT: s_mov_b32 s0, 0 230; GFX10-NEXT: s_mov_b32 s1, s0 231; GFX10-NEXT: s_mov_b32 s2, s0 232; GFX10-NEXT: s_mov_b32 s3, s0 233; GFX10-NEXT: v_mov_b32_e32 v0, s0 234; GFX10-NEXT: v_mov_b32_e32 v1, s1 235; GFX10-NEXT: v_mov_b32_e32 v2, s2 236; GFX10-NEXT: v_mov_b32_e32 v3, s3 237; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:48 238; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:32 239; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16 240; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 241; GFX10-NEXT: s_setpc_b64 s[30:31] 242; 243; GFX11-LABEL: zero_init_foo: 244; GFX11: ; %bb.0: 245; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 246; GFX11-NEXT: s_mov_b32 s0, 0 247; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 248; GFX11-NEXT: s_mov_b32 s1, s0 249; GFX11-NEXT: s_mov_b32 s2, s0 250; GFX11-NEXT: s_mov_b32 s3, s0 251; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 252; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 253; GFX11-NEXT: s_clause 0x3 254; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:48 255; GFX11-NEXT: 
scratch_store_b128 off, v[0:3], s32 offset:32 256; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:16 257; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 258; GFX11-NEXT: s_setpc_b64 s[30:31] 259; 260; GFX12-LABEL: zero_init_foo: 261; GFX12: ; %bb.0: 262; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 263; GFX12-NEXT: s_wait_expcnt 0x0 264; GFX12-NEXT: s_wait_samplecnt 0x0 265; GFX12-NEXT: s_wait_bvhcnt 0x0 266; GFX12-NEXT: s_wait_kmcnt 0x0 267; GFX12-NEXT: s_mov_b32 s0, 0 268; GFX12-NEXT: s_wait_alu 0xfffe 269; GFX12-NEXT: s_mov_b32 s1, s0 270; GFX12-NEXT: s_mov_b32 s2, s0 271; GFX12-NEXT: s_mov_b32 s3, s0 272; GFX12-NEXT: s_wait_alu 0xfffe 273; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 274; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 275; GFX12-NEXT: s_clause 0x3 276; GFX12-NEXT: scratch_store_b128 off, v[0:3], s32 offset:48 277; GFX12-NEXT: scratch_store_b128 off, v[0:3], s32 offset:32 278; GFX12-NEXT: scratch_store_b128 off, v[0:3], s32 offset:16 279; GFX12-NEXT: scratch_store_b128 off, v[0:3], s32 280; GFX12-NEXT: s_setpc_b64 s[30:31] 281; 282; GFX9-PAL-LABEL: zero_init_foo: 283; GFX9-PAL: ; %bb.0: 284; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 285; GFX9-PAL-NEXT: s_mov_b32 s0, 0 286; GFX9-PAL-NEXT: s_mov_b32 s1, s0 287; GFX9-PAL-NEXT: s_mov_b32 s2, s0 288; GFX9-PAL-NEXT: s_mov_b32 s3, s0 289; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 290; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 291; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 292; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 293; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:48 294; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:32 295; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16 296; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 297; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 298; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 299; 300; GFX940-LABEL: zero_init_foo: 301; GFX940: ; %bb.0: 302; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 303; 
GFX940-NEXT: s_mov_b32 s0, 0 304; GFX940-NEXT: s_mov_b32 s1, s0 305; GFX940-NEXT: s_mov_b32 s2, s0 306; GFX940-NEXT: s_mov_b32 s3, s0 307; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 308; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3] 309; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:48 sc0 sc1 310; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:32 sc0 sc1 311; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16 sc0 sc1 312; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s32 sc0 sc1 313; GFX940-NEXT: s_waitcnt vmcnt(0) 314; GFX940-NEXT: s_setpc_b64 s[30:31] 315; 316; GFX10-PAL-LABEL: zero_init_foo: 317; GFX10-PAL: ; %bb.0: 318; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 319; GFX10-PAL-NEXT: s_mov_b32 s0, 0 320; GFX10-PAL-NEXT: s_mov_b32 s1, s0 321; GFX10-PAL-NEXT: s_mov_b32 s2, s0 322; GFX10-PAL-NEXT: s_mov_b32 s3, s0 323; GFX10-PAL-NEXT: v_mov_b32_e32 v0, s0 324; GFX10-PAL-NEXT: v_mov_b32_e32 v1, s1 325; GFX10-PAL-NEXT: v_mov_b32_e32 v2, s2 326; GFX10-PAL-NEXT: v_mov_b32_e32 v3, s3 327; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:48 328; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:32 329; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16 330; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 331; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] 332; 333; GFX11-PAL-LABEL: zero_init_foo: 334; GFX11-PAL: ; %bb.0: 335; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 336; GFX11-PAL-NEXT: s_mov_b32 s0, 0 337; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 338; GFX11-PAL-NEXT: s_mov_b32 s1, s0 339; GFX11-PAL-NEXT: s_mov_b32 s2, s0 340; GFX11-PAL-NEXT: s_mov_b32 s3, s0 341; GFX11-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 342; GFX11-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 343; GFX11-PAL-NEXT: s_clause 0x3 344; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:48 345; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:32 346; 
GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:16 347; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 348; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] 349; 350; GFX12-PAL-LABEL: zero_init_foo: 351; GFX12-PAL: ; %bb.0: 352; GFX12-PAL-NEXT: s_wait_loadcnt_dscnt 0x0 353; GFX12-PAL-NEXT: s_wait_expcnt 0x0 354; GFX12-PAL-NEXT: s_wait_samplecnt 0x0 355; GFX12-PAL-NEXT: s_wait_bvhcnt 0x0 356; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 357; GFX12-PAL-NEXT: s_mov_b32 s0, 0 358; GFX12-PAL-NEXT: s_wait_alu 0xfffe 359; GFX12-PAL-NEXT: s_mov_b32 s1, s0 360; GFX12-PAL-NEXT: s_mov_b32 s2, s0 361; GFX12-PAL-NEXT: s_mov_b32 s3, s0 362; GFX12-PAL-NEXT: s_wait_alu 0xfffe 363; GFX12-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 364; GFX12-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 365; GFX12-PAL-NEXT: s_clause 0x3 366; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:48 367; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:32 368; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:16 369; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 370; GFX12-PAL-NEXT: s_setpc_b64 s[30:31] 371 %alloca = alloca [32 x i16], align 2, addrspace(5) 372 call void @llvm.memset.p5.i64(ptr addrspace(5) align 2 dereferenceable(64) %alloca, i8 0, i64 64, i1 false) 373 ret void 374} 375 376define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { 377; GFX9-LABEL: store_load_sindex_kernel: 378; GFX9: ; %bb.0: ; %bb 379; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24 380; GFX9-NEXT: s_add_u32 flat_scratch_lo, s8, s13 381; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 382; GFX9-NEXT: v_mov_b32_e32 v0, 15 383; GFX9-NEXT: s_waitcnt lgkmcnt(0) 384; GFX9-NEXT: s_lshl_b32 s1, s0, 2 385; GFX9-NEXT: s_and_b32 s0, s0, 15 386; GFX9-NEXT: s_lshl_b32 s0, s0, 2 387; GFX9-NEXT: scratch_store_dword off, v0, s1 388; GFX9-NEXT: s_waitcnt vmcnt(0) 389; GFX9-NEXT: scratch_load_dword v0, off, s0 glc 390; GFX9-NEXT: s_waitcnt vmcnt(0) 391; GFX9-NEXT: s_endpgm 392; 393; 
GFX10-LABEL: store_load_sindex_kernel: 394; GFX10: ; %bb.0: ; %bb 395; GFX10-NEXT: s_add_u32 s8, s8, s13 396; GFX10-NEXT: s_addc_u32 s9, s9, 0 397; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 398; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 399; GFX10-NEXT: s_load_dword s0, s[4:5], 0x24 400; GFX10-NEXT: v_mov_b32_e32 v0, 15 401; GFX10-NEXT: s_waitcnt lgkmcnt(0) 402; GFX10-NEXT: s_and_b32 s1, s0, 15 403; GFX10-NEXT: s_lshl_b32 s0, s0, 2 404; GFX10-NEXT: s_lshl_b32 s1, s1, 2 405; GFX10-NEXT: scratch_store_dword off, v0, s0 406; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 407; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc 408; GFX10-NEXT: s_waitcnt vmcnt(0) 409; GFX10-NEXT: s_endpgm 410; 411; GFX11-LABEL: store_load_sindex_kernel: 412; GFX11: ; %bb.0: ; %bb 413; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 414; GFX11-NEXT: v_mov_b32_e32 v0, 15 415; GFX11-NEXT: s_waitcnt lgkmcnt(0) 416; GFX11-NEXT: s_and_b32 s1, s0, 15 417; GFX11-NEXT: s_lshl_b32 s0, s0, 2 418; GFX11-NEXT: s_lshl_b32 s1, s1, 2 419; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc 420; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 421; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc 422; GFX11-NEXT: s_waitcnt vmcnt(0) 423; GFX11-NEXT: s_endpgm 424; 425; GFX12-LABEL: store_load_sindex_kernel: 426; GFX12: ; %bb.0: ; %bb 427; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x24 428; GFX12-NEXT: v_mov_b32_e32 v0, 15 429; GFX12-NEXT: s_wait_kmcnt 0x0 430; GFX12-NEXT: s_and_b32 s1, s0, 15 431; GFX12-NEXT: s_lshl_b32 s0, s0, 2 432; GFX12-NEXT: s_lshl_b32 s1, s1, 2 433; GFX12-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS 434; GFX12-NEXT: s_wait_storecnt 0x0 435; GFX12-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS 436; GFX12-NEXT: s_wait_loadcnt 0x0 437; GFX12-NEXT: s_endpgm 438; 439; GFX9-PAL-LABEL: store_load_sindex_kernel: 440; GFX9-PAL: ; %bb.0: ; %bb 441; GFX9-PAL-NEXT: s_getpc_b64 s[12:13] 442; GFX9-PAL-NEXT: s_mov_b32 s12, s0 443; GFX9-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0 444; 
GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 445; GFX9-PAL-NEXT: s_load_dword s0, s[4:5], 0x0 446; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 447; GFX9-PAL-NEXT: s_and_b32 s13, s13, 0xffff 448; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s12, s11 449; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 450; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 451; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 452; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 453; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 454; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 455; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc 456; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 457; GFX9-PAL-NEXT: s_endpgm 458; 459; GFX940-LABEL: store_load_sindex_kernel: 460; GFX940: ; %bb.0: ; %bb 461; GFX940-NEXT: s_load_dword s0, s[4:5], 0x24 462; GFX940-NEXT: v_mov_b32_e32 v0, 15 463; GFX940-NEXT: s_waitcnt lgkmcnt(0) 464; GFX940-NEXT: s_lshl_b32 s1, s0, 2 465; GFX940-NEXT: s_and_b32 s0, s0, 15 466; GFX940-NEXT: s_lshl_b32 s0, s0, 2 467; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 468; GFX940-NEXT: s_waitcnt vmcnt(0) 469; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 470; GFX940-NEXT: s_waitcnt vmcnt(0) 471; GFX940-NEXT: s_endpgm 472; 473; GFX10-PAL-LABEL: store_load_sindex_kernel: 474; GFX10-PAL: ; %bb.0: ; %bb 475; GFX10-PAL-NEXT: s_getpc_b64 s[12:13] 476; GFX10-PAL-NEXT: s_mov_b32 s12, s0 477; GFX10-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0 478; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 479; GFX10-PAL-NEXT: s_and_b32 s13, s13, 0xffff 480; GFX10-PAL-NEXT: s_add_u32 s12, s12, s11 481; GFX10-PAL-NEXT: s_addc_u32 s13, s13, 0 482; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 483; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 484; GFX10-PAL-NEXT: s_load_dword s0, s[4:5], 0x0 485; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 15 486; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 487; GFX10-PAL-NEXT: s_and_b32 s1, s0, 15 488; GFX10-PAL-NEXT: s_lshl_b32 s0, s0, 2 489; GFX10-PAL-NEXT: s_lshl_b32 s1, s1, 2 490; GFX10-PAL-NEXT: scratch_store_dword off, 
v0, s0 491; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 492; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc 493; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 494; GFX10-PAL-NEXT: s_endpgm 495; 496; GFX11-PAL-LABEL: store_load_sindex_kernel: 497; GFX11-PAL: ; %bb.0: ; %bb 498; GFX11-PAL-NEXT: s_load_b32 s0, s[4:5], 0x0 499; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 15 500; GFX11-PAL-NEXT: s_waitcnt lgkmcnt(0) 501; GFX11-PAL-NEXT: s_and_b32 s1, s0, 15 502; GFX11-PAL-NEXT: s_lshl_b32 s0, s0, 2 503; GFX11-PAL-NEXT: s_lshl_b32 s1, s1, 2 504; GFX11-PAL-NEXT: scratch_store_b32 off, v0, s0 dlc 505; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 506; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc 507; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 508; GFX11-PAL-NEXT: s_endpgm 509; 510; GFX12-PAL-LABEL: store_load_sindex_kernel: 511; GFX12-PAL: ; %bb.0: ; %bb 512; GFX12-PAL-NEXT: s_load_b32 s0, s[4:5], 0x0 513; GFX12-PAL-NEXT: v_mov_b32_e32 v0, 15 514; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 515; GFX12-PAL-NEXT: s_and_b32 s1, s0, 15 516; GFX12-PAL-NEXT: s_lshl_b32 s0, s0, 2 517; GFX12-PAL-NEXT: s_lshl_b32 s1, s1, 2 518; GFX12-PAL-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS 519; GFX12-PAL-NEXT: s_wait_storecnt 0x0 520; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS 521; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 522; GFX12-PAL-NEXT: s_endpgm 523bb: 524 %i = alloca [32 x float], align 4, addrspace(5) 525 %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %idx 526 store volatile i32 15, ptr addrspace(5) %i7, align 4 527 %i9 = and i32 %idx, 15 528 %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9 529 %i12 = load volatile i32, ptr addrspace(5) %i10, align 4 530 ret void 531} 532 533define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) { 534; GFX9-LABEL: store_load_sindex_foo: 535; GFX9: ; %bb.0: ; %bb 536; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 537; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 538; GFX9-NEXT: s_lshl_b32 s0, s2, 
2 539; GFX9-NEXT: v_mov_b32_e32 v0, 15 540; GFX9-NEXT: scratch_store_dword off, v0, s0 541; GFX9-NEXT: s_waitcnt vmcnt(0) 542; GFX9-NEXT: s_and_b32 s0, s2, 15 543; GFX9-NEXT: s_lshl_b32 s0, s0, 2 544; GFX9-NEXT: scratch_load_dword v0, off, s0 glc 545; GFX9-NEXT: s_waitcnt vmcnt(0) 546; GFX9-NEXT: s_endpgm 547; 548; GFX10-LABEL: store_load_sindex_foo: 549; GFX10: ; %bb.0: ; %bb 550; GFX10-NEXT: s_add_u32 s0, s0, s3 551; GFX10-NEXT: s_addc_u32 s1, s1, 0 552; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 553; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 554; GFX10-NEXT: v_mov_b32_e32 v0, 15 555; GFX10-NEXT: s_and_b32 s0, s2, 15 556; GFX10-NEXT: s_lshl_b32 s1, s2, 2 557; GFX10-NEXT: s_lshl_b32 s0, s0, 2 558; GFX10-NEXT: scratch_store_dword off, v0, s1 559; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 560; GFX10-NEXT: scratch_load_dword v0, off, s0 glc dlc 561; GFX10-NEXT: s_waitcnt vmcnt(0) 562; GFX10-NEXT: s_endpgm 563; 564; GFX11-LABEL: store_load_sindex_foo: 565; GFX11: ; %bb.0: ; %bb 566; GFX11-NEXT: v_mov_b32_e32 v0, 15 567; GFX11-NEXT: s_and_b32 s1, s0, 15 568; GFX11-NEXT: s_lshl_b32 s0, s0, 2 569; GFX11-NEXT: s_lshl_b32 s1, s1, 2 570; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc 571; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 572; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc 573; GFX11-NEXT: s_waitcnt vmcnt(0) 574; GFX11-NEXT: s_endpgm 575; 576; GFX12-LABEL: store_load_sindex_foo: 577; GFX12: ; %bb.0: ; %bb 578; GFX12-NEXT: v_mov_b32_e32 v0, 15 579; GFX12-NEXT: s_and_b32 s1, s0, 15 580; GFX12-NEXT: s_lshl_b32 s0, s0, 2 581; GFX12-NEXT: s_lshl_b32 s1, s1, 2 582; GFX12-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS 583; GFX12-NEXT: s_wait_storecnt 0x0 584; GFX12-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS 585; GFX12-NEXT: s_wait_loadcnt 0x0 586; GFX12-NEXT: s_endpgm 587; 588; GFX9-PAL-LABEL: store_load_sindex_foo: 589; GFX9-PAL: ; %bb.0: ; %bb 590; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 591; GFX9-PAL-NEXT: s_mov_b32 s2, s0 592; GFX9-PAL-NEXT: 
s_load_dwordx2 s[2:3], s[2:3], 0x0 593; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 594; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 595; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 596; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 597; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 598; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 599; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 600; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 601; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 602; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 603; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc 604; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 605; GFX9-PAL-NEXT: s_endpgm 606; 607; GFX940-LABEL: store_load_sindex_foo: 608; GFX940: ; %bb.0: ; %bb 609; GFX940-NEXT: s_lshl_b32 s1, s0, 2 610; GFX940-NEXT: v_mov_b32_e32 v0, 15 611; GFX940-NEXT: s_and_b32 s0, s0, 15 612; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 613; GFX940-NEXT: s_waitcnt vmcnt(0) 614; GFX940-NEXT: s_lshl_b32 s0, s0, 2 615; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 616; GFX940-NEXT: s_waitcnt vmcnt(0) 617; GFX940-NEXT: s_endpgm 618; 619; GFX10-PAL-LABEL: store_load_sindex_foo: 620; GFX10-PAL: ; %bb.0: ; %bb 621; GFX10-PAL-NEXT: s_getpc_b64 s[2:3] 622; GFX10-PAL-NEXT: s_mov_b32 s2, s0 623; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 624; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 625; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff 626; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1 627; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 628; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 629; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 630; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 15 631; GFX10-PAL-NEXT: s_and_b32 s1, s0, 15 632; GFX10-PAL-NEXT: s_lshl_b32 s0, s0, 2 633; GFX10-PAL-NEXT: s_lshl_b32 s1, s1, 2 634; GFX10-PAL-NEXT: scratch_store_dword off, v0, s0 635; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 636; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc 637; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 638; GFX10-PAL-NEXT: s_endpgm 639; 640; GFX11-PAL-LABEL: 
store_load_sindex_foo: 641; GFX11-PAL: ; %bb.0: ; %bb 642; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 15 643; GFX11-PAL-NEXT: s_and_b32 s1, s0, 15 644; GFX11-PAL-NEXT: s_lshl_b32 s0, s0, 2 645; GFX11-PAL-NEXT: s_lshl_b32 s1, s1, 2 646; GFX11-PAL-NEXT: scratch_store_b32 off, v0, s0 dlc 647; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 648; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc 649; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 650; GFX11-PAL-NEXT: s_endpgm 651; 652; GFX12-PAL-LABEL: store_load_sindex_foo: 653; GFX12-PAL: ; %bb.0: ; %bb 654; GFX12-PAL-NEXT: v_mov_b32_e32 v0, 15 655; GFX12-PAL-NEXT: s_and_b32 s1, s0, 15 656; GFX12-PAL-NEXT: s_lshl_b32 s0, s0, 2 657; GFX12-PAL-NEXT: s_lshl_b32 s1, s1, 2 658; GFX12-PAL-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS 659; GFX12-PAL-NEXT: s_wait_storecnt 0x0 660; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS 661; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 662; GFX12-PAL-NEXT: s_endpgm 663bb: 664 %i = alloca [32 x float], align 4, addrspace(5) 665 %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %idx 666 store volatile i32 15, ptr addrspace(5) %i7, align 4 667 %i9 = and i32 %idx, 15 668 %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9 669 %i12 = load volatile i32, ptr addrspace(5) %i10, align 4 670 ret void 671} 672 673define amdgpu_kernel void @store_load_vindex_kernel() { 674; GFX9-LABEL: store_load_vindex_kernel: 675; GFX9: ; %bb.0: ; %bb 676; GFX9-NEXT: s_add_u32 flat_scratch_lo, s8, s13 677; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 678; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 679; GFX9-NEXT: v_mov_b32_e32 v1, v0 680; GFX9-NEXT: v_mov_b32_e32 v2, 15 681; GFX9-NEXT: scratch_store_dword v1, v2, off 682; GFX9-NEXT: s_waitcnt vmcnt(0) 683; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0 684; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc 685; GFX9-NEXT: s_waitcnt vmcnt(0) 686; GFX9-NEXT: s_endpgm 687; 688; GFX10-LABEL: store_load_vindex_kernel: 689; GFX10: ; 
%bb.0: ; %bb 690; GFX10-NEXT: s_add_u32 s8, s8, s13 691; GFX10-NEXT: s_addc_u32 s9, s9, 0 692; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 693; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 694; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 695; GFX10-NEXT: v_mov_b32_e32 v2, 15 696; GFX10-NEXT: v_mov_b32_e32 v1, v0 697; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0, v0 698; GFX10-NEXT: scratch_store_dword v1, v2, off 699; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 700; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc 701; GFX10-NEXT: s_waitcnt vmcnt(0) 702; GFX10-NEXT: s_endpgm 703; 704; GFX11-LABEL: store_load_vindex_kernel: 705; GFX11: ; %bb.0: ; %bb 706; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 707; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 708; GFX11-NEXT: v_and_b32_e32 v0, 0xffc, v0 709; GFX11-NEXT: v_sub_nc_u32_e32 v2, 0, v0 710; GFX11-NEXT: scratch_store_b32 v0, v1, off dlc 711; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 712; GFX11-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc 713; GFX11-NEXT: s_waitcnt vmcnt(0) 714; GFX11-NEXT: s_endpgm 715; 716; GFX12-LABEL: store_load_vindex_kernel: 717; GFX12: ; %bb.0: ; %bb 718; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 719; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 720; GFX12-NEXT: v_and_b32_e32 v0, 0xffc, v0 721; GFX12-NEXT: v_sub_nc_u32_e32 v2, 0, v0 722; GFX12-NEXT: scratch_store_b32 v0, v1, off scope:SCOPE_SYS 723; GFX12-NEXT: s_wait_storecnt 0x0 724; GFX12-NEXT: scratch_load_b32 v0, v2, off offset:124 scope:SCOPE_SYS 725; GFX12-NEXT: s_wait_loadcnt 0x0 726; GFX12-NEXT: s_endpgm 727; 728; GFX9-PAL-LABEL: store_load_vindex_kernel: 729; GFX9-PAL: ; %bb.0: ; %bb 730; GFX9-PAL-NEXT: s_getpc_b64 s[12:13] 731; GFX9-PAL-NEXT: s_mov_b32 s12, s0 732; GFX9-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0 733; GFX9-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 734; GFX9-PAL-NEXT: 
v_mov_b32_e32 v1, v0 735; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 15 736; GFX9-PAL-NEXT: v_sub_u32_e32 v0, 0, v0 737; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 738; GFX9-PAL-NEXT: s_and_b32 s13, s13, 0xffff 739; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s12, s11 740; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 741; GFX9-PAL-NEXT: scratch_store_dword v1, v2, off 742; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 743; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc 744; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 745; GFX9-PAL-NEXT: s_endpgm 746; 747; GFX940-LABEL: store_load_vindex_kernel: 748; GFX940: ; %bb.0: ; %bb 749; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 750; GFX940-NEXT: v_and_b32_e32 v0, 0xffc, v0 751; GFX940-NEXT: v_mov_b32_e32 v1, 15 752; GFX940-NEXT: scratch_store_dword v0, v1, off sc0 sc1 753; GFX940-NEXT: s_waitcnt vmcnt(0) 754; GFX940-NEXT: v_sub_u32_e32 v0, 0, v0 755; GFX940-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1 756; GFX940-NEXT: s_waitcnt vmcnt(0) 757; GFX940-NEXT: s_endpgm 758; 759; GFX10-PAL-LABEL: store_load_vindex_kernel: 760; GFX10-PAL: ; %bb.0: ; %bb 761; GFX10-PAL-NEXT: s_getpc_b64 s[12:13] 762; GFX10-PAL-NEXT: s_mov_b32 s12, s0 763; GFX10-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0 764; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 765; GFX10-PAL-NEXT: s_and_b32 s13, s13, 0xffff 766; GFX10-PAL-NEXT: s_add_u32 s12, s12, s11 767; GFX10-PAL-NEXT: s_addc_u32 s13, s13, 0 768; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 769; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 770; GFX10-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 771; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 15 772; GFX10-PAL-NEXT: v_mov_b32_e32 v1, v0 773; GFX10-PAL-NEXT: v_sub_nc_u32_e32 v0, 0, v0 774; GFX10-PAL-NEXT: scratch_store_dword v1, v2, off 775; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 776; GFX10-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc 777; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 778; GFX10-PAL-NEXT: s_endpgm 779; 780; GFX11-PAL-LABEL: 
store_load_vindex_kernel: 781; GFX11-PAL: ; %bb.0: ; %bb 782; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 783; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 784; GFX11-PAL-NEXT: v_and_b32_e32 v0, 0xffc, v0 785; GFX11-PAL-NEXT: v_sub_nc_u32_e32 v2, 0, v0 786; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off dlc 787; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 788; GFX11-PAL-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc 789; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 790; GFX11-PAL-NEXT: s_endpgm 791; 792; GFX12-PAL-LABEL: store_load_vindex_kernel: 793; GFX12-PAL: ; %bb.0: ; %bb 794; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 795; GFX12-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 796; GFX12-PAL-NEXT: v_and_b32_e32 v0, 0xffc, v0 797; GFX12-PAL-NEXT: v_sub_nc_u32_e32 v2, 0, v0 798; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off scope:SCOPE_SYS 799; GFX12-PAL-NEXT: s_wait_storecnt 0x0 800; GFX12-PAL-NEXT: scratch_load_b32 v0, v2, off offset:124 scope:SCOPE_SYS 801; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 802; GFX12-PAL-NEXT: s_endpgm 803bb: 804 %i = alloca [32 x float], align 4, addrspace(5) 805 %i2 = tail call i32 @llvm.amdgcn.workitem.id.x() 806 %i3 = zext i32 %i2 to i64 807 %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i2 808 store volatile i32 15, ptr addrspace(5) %i7, align 4 809 %i9 = sub nsw i32 31, %i2 810 %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9 811 %i12 = load volatile i32, ptr addrspace(5) %i10, align 4 812 ret void 813} 814 815define void @store_load_vindex_foo(i32 %idx) { 816; GFX9-LABEL: store_load_vindex_foo: 817; GFX9: ; %bb.0: ; %bb 818; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 819; GFX9-NEXT: v_mov_b32_e32 v1, s32 820; GFX9-NEXT: v_lshl_add_u32 v2, v0, 2, v1 821; GFX9-NEXT: v_mov_b32_e32 v3, 15 822; GFX9-NEXT: v_and_b32_e32 v0, 15, v0 823; GFX9-NEXT: 
scratch_store_dword v2, v3, off 824; GFX9-NEXT: s_waitcnt vmcnt(0) 825; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 826; GFX9-NEXT: scratch_load_dword v0, v0, off glc 827; GFX9-NEXT: s_waitcnt vmcnt(0) 828; GFX9-NEXT: s_setpc_b64 s[30:31] 829; 830; GFX10-LABEL: store_load_vindex_foo: 831; GFX10: ; %bb.0: ; %bb 832; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 833; GFX10-NEXT: v_and_b32_e32 v1, 15, v0 834; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, s32 835; GFX10-NEXT: v_mov_b32_e32 v2, 15 836; GFX10-NEXT: v_lshl_add_u32 v1, v1, 2, s32 837; GFX10-NEXT: scratch_store_dword v0, v2, off 838; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 839; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc 840; GFX10-NEXT: s_waitcnt vmcnt(0) 841; GFX10-NEXT: s_setpc_b64 s[30:31] 842; 843; GFX11-LABEL: store_load_vindex_foo: 844; GFX11: ; %bb.0: ; %bb 845; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 846; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 847; GFX11-NEXT: v_lshl_add_u32 v0, v0, 2, s32 848; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 849; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1 850; GFX11-NEXT: scratch_store_b32 v0, v2, off dlc 851; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 852; GFX11-NEXT: scratch_load_b32 v0, v1, s32 glc dlc 853; GFX11-NEXT: s_waitcnt vmcnt(0) 854; GFX11-NEXT: s_setpc_b64 s[30:31] 855; 856; GFX12-LABEL: store_load_vindex_foo: 857; GFX12: ; %bb.0: ; %bb 858; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 859; GFX12-NEXT: s_wait_expcnt 0x0 860; GFX12-NEXT: s_wait_samplecnt 0x0 861; GFX12-NEXT: s_wait_bvhcnt 0x0 862; GFX12-NEXT: s_wait_kmcnt 0x0 863; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 864; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 865; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) 866; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1 867; GFX12-NEXT: s_wait_storecnt 0x0 868; GFX12-NEXT: scratch_store_b32 v0, v2, s32 scope:SCOPE_SYS 869; GFX12-NEXT: s_wait_storecnt 0x0 870; GFX12-NEXT: scratch_load_b32 v0, v1, s32 scope:SCOPE_SYS 871; 
GFX12-NEXT: s_wait_loadcnt 0x0 872; GFX12-NEXT: s_setpc_b64 s[30:31] 873; 874; GFX9-PAL-LABEL: store_load_vindex_foo: 875; GFX9-PAL: ; %bb.0: ; %bb 876; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 877; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s32 878; GFX9-PAL-NEXT: v_lshl_add_u32 v2, v0, 2, v1 879; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15 880; GFX9-PAL-NEXT: v_and_b32_e32 v0, 15, v0 881; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off 882; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 883; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 884; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off glc 885; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 886; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 887; 888; GFX940-LABEL: store_load_vindex_foo: 889; GFX940: ; %bb.0: ; %bb 890; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 891; GFX940-NEXT: v_mov_b32_e32 v1, s32 892; GFX940-NEXT: v_lshl_add_u32 v1, v0, 2, v1 893; GFX940-NEXT: v_mov_b32_e32 v2, 15 894; GFX940-NEXT: v_and_b32_e32 v0, 15, v0 895; GFX940-NEXT: scratch_store_dword v1, v2, off sc0 sc1 896; GFX940-NEXT: s_waitcnt vmcnt(0) 897; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 898; GFX940-NEXT: scratch_load_dword v0, v0, s32 sc0 sc1 899; GFX940-NEXT: s_waitcnt vmcnt(0) 900; GFX940-NEXT: s_setpc_b64 s[30:31] 901; 902; GFX10-PAL-LABEL: store_load_vindex_foo: 903; GFX10-PAL: ; %bb.0: ; %bb 904; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 905; GFX10-PAL-NEXT: v_and_b32_e32 v1, 15, v0 906; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, s32 907; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 15 908; GFX10-PAL-NEXT: v_lshl_add_u32 v1, v1, 2, s32 909; GFX10-PAL-NEXT: scratch_store_dword v0, v2, off 910; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 911; GFX10-PAL-NEXT: scratch_load_dword v0, v1, off glc dlc 912; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 913; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] 914; 915; GFX11-PAL-LABEL: store_load_vindex_foo: 916; GFX11-PAL: ; %bb.0: ; %bb 917; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 918; GFX11-PAL-NEXT: v_dual_mov_b32 v2, 15 :: 
v_dual_and_b32 v1, 15, v0 919; GFX11-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, s32 920; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_2) 921; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v1, 2, v1 922; GFX11-PAL-NEXT: scratch_store_b32 v0, v2, off dlc 923; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 924; GFX11-PAL-NEXT: scratch_load_b32 v0, v1, s32 glc dlc 925; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 926; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] 927; 928; GFX12-PAL-LABEL: store_load_vindex_foo: 929; GFX12-PAL: ; %bb.0: ; %bb 930; GFX12-PAL-NEXT: s_wait_loadcnt_dscnt 0x0 931; GFX12-PAL-NEXT: s_wait_expcnt 0x0 932; GFX12-PAL-NEXT: s_wait_samplecnt 0x0 933; GFX12-PAL-NEXT: s_wait_bvhcnt 0x0 934; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 935; GFX12-PAL-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 936; GFX12-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 937; GFX12-PAL-NEXT: s_delay_alu instid0(VALU_DEP_2) 938; GFX12-PAL-NEXT: v_lshlrev_b32_e32 v1, 2, v1 939; GFX12-PAL-NEXT: s_wait_storecnt 0x0 940; GFX12-PAL-NEXT: scratch_store_b32 v0, v2, s32 scope:SCOPE_SYS 941; GFX12-PAL-NEXT: s_wait_storecnt 0x0 942; GFX12-PAL-NEXT: scratch_load_b32 v0, v1, s32 scope:SCOPE_SYS 943; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 944; GFX12-PAL-NEXT: s_setpc_b64 s[30:31] 945bb: 946 %i = alloca [32 x float], align 4, addrspace(5) 947 %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %idx 948 store volatile i32 15, ptr addrspace(5) %i7, align 4 949 %i9 = and i32 %idx, 15 950 %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9 951 %i12 = load volatile i32, ptr addrspace(5) %i10, align 4 952 ret void 953} 954 955define void @private_ptr_foo(ptr addrspace(5) nocapture %arg) { 956; GFX9-LABEL: private_ptr_foo: 957; GFX9: ; %bb.0: 958; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 959; GFX9-NEXT: v_mov_b32_e32 v1, 0x41200000 960; GFX9-NEXT: scratch_store_dword v0, v1, off offset:4 961; GFX9-NEXT: s_waitcnt vmcnt(0) 962; GFX9-NEXT: s_setpc_b64 s[30:31] 963; 964; GFX10-LABEL: 
private_ptr_foo: 965; GFX10: ; %bb.0: 966; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 967; GFX10-NEXT: v_mov_b32_e32 v1, 0x41200000 968; GFX10-NEXT: scratch_store_dword v0, v1, off offset:4 969; GFX10-NEXT: s_setpc_b64 s[30:31] 970; 971; GFX11-LABEL: private_ptr_foo: 972; GFX11: ; %bb.0: 973; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 974; GFX11-NEXT: v_mov_b32_e32 v1, 0x41200000 975; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:4 976; GFX11-NEXT: s_setpc_b64 s[30:31] 977; 978; GFX12-LABEL: private_ptr_foo: 979; GFX12: ; %bb.0: 980; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 981; GFX12-NEXT: s_wait_expcnt 0x0 982; GFX12-NEXT: s_wait_samplecnt 0x0 983; GFX12-NEXT: s_wait_bvhcnt 0x0 984; GFX12-NEXT: s_wait_kmcnt 0x0 985; GFX12-NEXT: v_mov_b32_e32 v1, 0x41200000 986; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:4 987; GFX12-NEXT: s_setpc_b64 s[30:31] 988; 989; GFX9-PAL-LABEL: private_ptr_foo: 990; GFX9-PAL: ; %bb.0: 991; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 992; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 0x41200000 993; GFX9-PAL-NEXT: scratch_store_dword v0, v1, off offset:4 994; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 995; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 996; 997; GFX940-LABEL: private_ptr_foo: 998; GFX940: ; %bb.0: 999; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1000; GFX940-NEXT: v_mov_b32_e32 v1, 0x41200000 1001; GFX940-NEXT: scratch_store_dword v0, v1, off offset:4 sc0 sc1 1002; GFX940-NEXT: s_waitcnt vmcnt(0) 1003; GFX940-NEXT: s_setpc_b64 s[30:31] 1004; 1005; GFX10-PAL-LABEL: private_ptr_foo: 1006; GFX10-PAL: ; %bb.0: 1007; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1008; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 0x41200000 1009; GFX10-PAL-NEXT: scratch_store_dword v0, v1, off offset:4 1010; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] 1011; 1012; GFX11-PAL-LABEL: private_ptr_foo: 1013; GFX11-PAL: ; %bb.0: 1014; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1015; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 0x41200000 1016; 
GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off offset:4 1017; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] 1018; 1019; GFX12-PAL-LABEL: private_ptr_foo: 1020; GFX12-PAL: ; %bb.0: 1021; GFX12-PAL-NEXT: s_wait_loadcnt_dscnt 0x0 1022; GFX12-PAL-NEXT: s_wait_expcnt 0x0 1023; GFX12-PAL-NEXT: s_wait_samplecnt 0x0 1024; GFX12-PAL-NEXT: s_wait_bvhcnt 0x0 1025; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 1026; GFX12-PAL-NEXT: v_mov_b32_e32 v1, 0x41200000 1027; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off offset:4 1028; GFX12-PAL-NEXT: s_setpc_b64 s[30:31] 1029 %gep = getelementptr inbounds float, ptr addrspace(5) %arg, i32 1 1030 store float 1.000000e+01, ptr addrspace(5) %gep, align 4 1031 ret void 1032} 1033 1034define amdgpu_kernel void @zero_init_small_offset_kernel() { 1035; GFX9-LABEL: zero_init_small_offset_kernel: 1036; GFX9: ; %bb.0: 1037; GFX9-NEXT: s_add_u32 flat_scratch_lo, s8, s13 1038; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 1039; GFX9-NEXT: s_mov_b32 s0, 0 1040; GFX9-NEXT: scratch_load_dword v0, off, s0 glc 1041; GFX9-NEXT: s_waitcnt vmcnt(0) 1042; GFX9-NEXT: s_mov_b32 s1, s0 1043; GFX9-NEXT: s_mov_b32 s2, s0 1044; GFX9-NEXT: s_mov_b32 s3, s0 1045; GFX9-NEXT: v_mov_b32_e32 v0, s0 1046; GFX9-NEXT: v_mov_b32_e32 v1, s1 1047; GFX9-NEXT: v_mov_b32_e32 v2, s2 1048; GFX9-NEXT: v_mov_b32_e32 v3, s3 1049; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:256 1050; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272 1051; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288 1052; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:304 1053; GFX9-NEXT: s_endpgm 1054; 1055; GFX10-LABEL: zero_init_small_offset_kernel: 1056; GFX10: ; %bb.0: 1057; GFX10-NEXT: s_add_u32 s8, s8, s13 1058; GFX10-NEXT: s_addc_u32 s9, s9, 0 1059; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 1060; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 1061; GFX10-NEXT: scratch_load_dword v0, off, off glc dlc 1062; GFX10-NEXT: s_waitcnt vmcnt(0) 1063; GFX10-NEXT: s_mov_b32 
s0, 0 1064; GFX10-NEXT: s_mov_b32 s1, s0 1065; GFX10-NEXT: s_mov_b32 s2, s0 1066; GFX10-NEXT: s_mov_b32 s3, s0 1067; GFX10-NEXT: v_mov_b32_e32 v0, s0 1068; GFX10-NEXT: v_mov_b32_e32 v1, s1 1069; GFX10-NEXT: v_mov_b32_e32 v2, s2 1070; GFX10-NEXT: v_mov_b32_e32 v3, s3 1071; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:256 1072; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:272 1073; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:288 1074; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:304 1075; GFX10-NEXT: s_endpgm 1076; 1077; GFX11-LABEL: zero_init_small_offset_kernel: 1078; GFX11: ; %bb.0: 1079; GFX11-NEXT: scratch_load_b32 v0, off, off glc dlc 1080; GFX11-NEXT: s_waitcnt vmcnt(0) 1081; GFX11-NEXT: s_mov_b32 s0, 0 1082; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1083; GFX11-NEXT: s_mov_b32 s1, s0 1084; GFX11-NEXT: s_mov_b32 s2, s0 1085; GFX11-NEXT: s_mov_b32 s3, s0 1086; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 1087; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 1088; GFX11-NEXT: s_clause 0x3 1089; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:256 1090; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:272 1091; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:288 1092; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:304 1093; GFX11-NEXT: s_endpgm 1094; 1095; GFX12-LABEL: zero_init_small_offset_kernel: 1096; GFX12: ; %bb.0: 1097; GFX12-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS 1098; GFX12-NEXT: s_wait_loadcnt 0x0 1099; GFX12-NEXT: s_mov_b32 s0, 0 1100; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1101; GFX12-NEXT: s_mov_b32 s1, s0 1102; GFX12-NEXT: s_mov_b32 s2, s0 1103; GFX12-NEXT: s_mov_b32 s3, s0 1104; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 1105; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 1106; GFX12-NEXT: s_clause 0x3 1107; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:256 1108; GFX12-NEXT: 
scratch_store_b128 off, v[0:3], off offset:272 1109; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:288 1110; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:304 1111; GFX12-NEXT: s_endpgm 1112; 1113; GFX9-PAL-LABEL: zero_init_small_offset_kernel: 1114; GFX9-PAL: ; %bb.0: 1115; GFX9-PAL-NEXT: s_getpc_b64 s[12:13] 1116; GFX9-PAL-NEXT: s_mov_b32 s12, s0 1117; GFX9-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0 1118; GFX9-PAL-NEXT: s_mov_b32 s0, 0 1119; GFX9-PAL-NEXT: s_mov_b32 s1, s0 1120; GFX9-PAL-NEXT: s_mov_b32 s2, s0 1121; GFX9-PAL-NEXT: s_mov_b32 s3, s0 1122; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 1123; GFX9-PAL-NEXT: s_and_b32 s13, s13, 0xffff 1124; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s12, s11 1125; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 1126; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc 1127; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1128; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 1129; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 1130; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 1131; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 1132; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:256 1133; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272 1134; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288 1135; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:304 1136; GFX9-PAL-NEXT: s_endpgm 1137; 1138; GFX940-LABEL: zero_init_small_offset_kernel: 1139; GFX940: ; %bb.0: 1140; GFX940-NEXT: scratch_load_dword v0, off, off sc0 sc1 1141; GFX940-NEXT: s_waitcnt vmcnt(0) 1142; GFX940-NEXT: s_mov_b32 s0, 0 1143; GFX940-NEXT: s_mov_b32 s1, s0 1144; GFX940-NEXT: s_mov_b32 s2, s0 1145; GFX940-NEXT: s_mov_b32 s3, s0 1146; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 1147; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3] 1148; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:256 sc0 sc1 1149; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:272 sc0 sc1 1150; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:288 sc0 
sc1 1151; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:304 sc0 sc1 1152; GFX940-NEXT: s_endpgm 1153; 1154; GFX1010-PAL-LABEL: zero_init_small_offset_kernel: 1155; GFX1010-PAL: ; %bb.0: 1156; GFX1010-PAL-NEXT: s_getpc_b64 s[12:13] 1157; GFX1010-PAL-NEXT: s_mov_b32 s12, s0 1158; GFX1010-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0 1159; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 1160; GFX1010-PAL-NEXT: s_and_b32 s13, s13, 0xffff 1161; GFX1010-PAL-NEXT: s_add_u32 s12, s12, s11 1162; GFX1010-PAL-NEXT: s_addc_u32 s13, s13, 0 1163; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 1164; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 1165; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 1166; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s0 glc dlc 1167; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 1168; GFX1010-PAL-NEXT: s_mov_b32 s1, s0 1169; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 1170; GFX1010-PAL-NEXT: s_mov_b32 s3, s0 1171; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, s0 1172; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, s1 1173; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, s2 1174; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, s3 1175; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:256 1176; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272 1177; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288 1178; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:304 1179; GFX1010-PAL-NEXT: s_endpgm 1180; 1181; GFX1030-PAL-LABEL: zero_init_small_offset_kernel: 1182; GFX1030-PAL: ; %bb.0: 1183; GFX1030-PAL-NEXT: s_getpc_b64 s[12:13] 1184; GFX1030-PAL-NEXT: s_mov_b32 s12, s0 1185; GFX1030-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0 1186; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 1187; GFX1030-PAL-NEXT: s_and_b32 s13, s13, 0xffff 1188; GFX1030-PAL-NEXT: s_add_u32 s12, s12, s11 1189; GFX1030-PAL-NEXT: s_addc_u32 s13, s13, 0 1190; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 1191; GFX1030-PAL-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_HI), s13 1192; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off glc dlc 1193; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 1194; GFX1030-PAL-NEXT: s_mov_b32 s0, 0 1195; GFX1030-PAL-NEXT: s_mov_b32 s1, s0 1196; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 1197; GFX1030-PAL-NEXT: s_mov_b32 s3, s0 1198; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, s0 1199; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, s1 1200; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, s2 1201; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, s3 1202; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:256 1203; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:272 1204; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:288 1205; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:304 1206; GFX1030-PAL-NEXT: s_endpgm 1207; 1208; GFX11-PAL-LABEL: zero_init_small_offset_kernel: 1209; GFX11-PAL: ; %bb.0: 1210; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off glc dlc 1211; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 1212; GFX11-PAL-NEXT: s_mov_b32 s0, 0 1213; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1214; GFX11-PAL-NEXT: s_mov_b32 s1, s0 1215; GFX11-PAL-NEXT: s_mov_b32 s2, s0 1216; GFX11-PAL-NEXT: s_mov_b32 s3, s0 1217; GFX11-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 1218; GFX11-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 1219; GFX11-PAL-NEXT: s_clause 0x3 1220; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:256 1221; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:272 1222; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:288 1223; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:304 1224; GFX11-PAL-NEXT: s_endpgm 1225; 1226; GFX12-PAL-LABEL: zero_init_small_offset_kernel: 1227; GFX12-PAL: ; %bb.0: 1228; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS 1229; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 1230; GFX12-PAL-NEXT: s_mov_b32 s0, 0 1231; GFX12-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1232; GFX12-PAL-NEXT: 
s_mov_b32 s1, s0 1233; GFX12-PAL-NEXT: s_mov_b32 s2, s0 1234; GFX12-PAL-NEXT: s_mov_b32 s3, s0 1235; GFX12-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 1236; GFX12-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 1237; GFX12-PAL-NEXT: s_clause 0x3 1238; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:256 1239; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:272 1240; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:288 1241; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:304 1242; GFX12-PAL-NEXT: s_endpgm 1243 %padding = alloca [64 x i32], align 4, addrspace(5) 1244 %alloca = alloca [32 x i16], align 2, addrspace(5) 1245 %pad_gep = getelementptr inbounds [64 x i32], ptr addrspace(5) %padding, i32 0, i32 undef 1246 %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4 1247 call void @llvm.memset.p5.i64(ptr addrspace(5) align 2 dereferenceable(64) %alloca, i8 0, i64 64, i1 false) 1248 ret void 1249} 1250 1251define void @zero_init_small_offset_foo() { 1252; GFX9-LABEL: zero_init_small_offset_foo: 1253; GFX9: ; %bb.0: 1254; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1255; GFX9-NEXT: scratch_load_dword v0, off, s32 glc 1256; GFX9-NEXT: s_waitcnt vmcnt(0) 1257; GFX9-NEXT: s_mov_b32 s0, 0 1258; GFX9-NEXT: s_mov_b32 s1, s0 1259; GFX9-NEXT: s_mov_b32 s2, s0 1260; GFX9-NEXT: s_mov_b32 s3, s0 1261; GFX9-NEXT: v_mov_b32_e32 v0, s0 1262; GFX9-NEXT: v_mov_b32_e32 v1, s1 1263; GFX9-NEXT: v_mov_b32_e32 v2, s2 1264; GFX9-NEXT: v_mov_b32_e32 v3, s3 1265; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:256 1266; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272 1267; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288 1268; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304 1269; GFX9-NEXT: s_waitcnt vmcnt(0) 1270; GFX9-NEXT: s_setpc_b64 s[30:31] 1271; 1272; GFX10-LABEL: zero_init_small_offset_foo: 1273; GFX10: ; %bb.0: 1274; GFX10-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) 1275; GFX10-NEXT: scratch_load_dword v0, off, s32 glc dlc 1276; GFX10-NEXT: s_waitcnt vmcnt(0) 1277; GFX10-NEXT: s_mov_b32 s0, 0 1278; GFX10-NEXT: s_mov_b32 s1, s0 1279; GFX10-NEXT: s_mov_b32 s2, s0 1280; GFX10-NEXT: s_mov_b32 s3, s0 1281; GFX10-NEXT: v_mov_b32_e32 v0, s0 1282; GFX10-NEXT: v_mov_b32_e32 v1, s1 1283; GFX10-NEXT: v_mov_b32_e32 v2, s2 1284; GFX10-NEXT: v_mov_b32_e32 v3, s3 1285; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:256 1286; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272 1287; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288 1288; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304 1289; GFX10-NEXT: s_setpc_b64 s[30:31] 1290; 1291; GFX11-LABEL: zero_init_small_offset_foo: 1292; GFX11: ; %bb.0: 1293; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1294; GFX11-NEXT: scratch_load_b32 v0, off, s32 glc dlc 1295; GFX11-NEXT: s_waitcnt vmcnt(0) 1296; GFX11-NEXT: s_mov_b32 s0, 0 1297; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1298; GFX11-NEXT: s_mov_b32 s1, s0 1299; GFX11-NEXT: s_mov_b32 s2, s0 1300; GFX11-NEXT: s_mov_b32 s3, s0 1301; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 1302; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 1303; GFX11-NEXT: s_clause 0x3 1304; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:256 1305; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:272 1306; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:288 1307; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:304 1308; GFX11-NEXT: s_setpc_b64 s[30:31] 1309; 1310; GFX12-LABEL: zero_init_small_offset_foo: 1311; GFX12: ; %bb.0: 1312; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 1313; GFX12-NEXT: s_wait_expcnt 0x0 1314; GFX12-NEXT: s_wait_samplecnt 0x0 1315; GFX12-NEXT: s_wait_bvhcnt 0x0 1316; GFX12-NEXT: s_wait_kmcnt 0x0 1317; GFX12-NEXT: scratch_load_b32 v0, off, s32 scope:SCOPE_SYS 1318; GFX12-NEXT: s_wait_loadcnt 0x0 1319; GFX12-NEXT: s_mov_b32 s0, 0 
1320; GFX12-NEXT: s_wait_alu 0xfffe 1321; GFX12-NEXT: s_mov_b32 s1, s0 1322; GFX12-NEXT: s_mov_b32 s2, s0 1323; GFX12-NEXT: s_mov_b32 s3, s0 1324; GFX12-NEXT: s_wait_alu 0xfffe 1325; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 1326; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 1327; GFX12-NEXT: s_clause 0x3 1328; GFX12-NEXT: scratch_store_b128 off, v[0:3], s32 offset:256 1329; GFX12-NEXT: scratch_store_b128 off, v[0:3], s32 offset:272 1330; GFX12-NEXT: scratch_store_b128 off, v[0:3], s32 offset:288 1331; GFX12-NEXT: scratch_store_b128 off, v[0:3], s32 offset:304 1332; GFX12-NEXT: s_setpc_b64 s[30:31] 1333; 1334; GFX9-PAL-LABEL: zero_init_small_offset_foo: 1335; GFX9-PAL: ; %bb.0: 1336; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1337; GFX9-PAL-NEXT: scratch_load_dword v0, off, s32 glc 1338; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1339; GFX9-PAL-NEXT: s_mov_b32 s0, 0 1340; GFX9-PAL-NEXT: s_mov_b32 s1, s0 1341; GFX9-PAL-NEXT: s_mov_b32 s2, s0 1342; GFX9-PAL-NEXT: s_mov_b32 s3, s0 1343; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 1344; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 1345; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 1346; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 1347; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:256 1348; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272 1349; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288 1350; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304 1351; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1352; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 1353; 1354; GFX940-LABEL: zero_init_small_offset_foo: 1355; GFX940: ; %bb.0: 1356; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1357; GFX940-NEXT: scratch_load_dword v0, off, s32 sc0 sc1 1358; GFX940-NEXT: s_waitcnt vmcnt(0) 1359; GFX940-NEXT: s_mov_b32 s0, 0 1360; GFX940-NEXT: s_mov_b32 s1, s0 1361; GFX940-NEXT: s_mov_b32 s2, s0 1362; GFX940-NEXT: s_mov_b32 s3, s0 1363; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 1364; 
GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3] 1365; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:256 sc0 sc1 1366; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272 sc0 sc1 1367; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288 sc0 sc1 1368; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304 sc0 sc1 1369; GFX940-NEXT: s_waitcnt vmcnt(0) 1370; GFX940-NEXT: s_setpc_b64 s[30:31] 1371; 1372; GFX10-PAL-LABEL: zero_init_small_offset_foo: 1373; GFX10-PAL: ; %bb.0: 1374; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1375; GFX10-PAL-NEXT: scratch_load_dword v0, off, s32 glc dlc 1376; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 1377; GFX10-PAL-NEXT: s_mov_b32 s0, 0 1378; GFX10-PAL-NEXT: s_mov_b32 s1, s0 1379; GFX10-PAL-NEXT: s_mov_b32 s2, s0 1380; GFX10-PAL-NEXT: s_mov_b32 s3, s0 1381; GFX10-PAL-NEXT: v_mov_b32_e32 v0, s0 1382; GFX10-PAL-NEXT: v_mov_b32_e32 v1, s1 1383; GFX10-PAL-NEXT: v_mov_b32_e32 v2, s2 1384; GFX10-PAL-NEXT: v_mov_b32_e32 v3, s3 1385; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:256 1386; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272 1387; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288 1388; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304 1389; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] 1390; 1391; GFX11-PAL-LABEL: zero_init_small_offset_foo: 1392; GFX11-PAL: ; %bb.0: 1393; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1394; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s32 glc dlc 1395; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 1396; GFX11-PAL-NEXT: s_mov_b32 s0, 0 1397; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1398; GFX11-PAL-NEXT: s_mov_b32 s1, s0 1399; GFX11-PAL-NEXT: s_mov_b32 s2, s0 1400; GFX11-PAL-NEXT: s_mov_b32 s3, s0 1401; GFX11-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 1402; GFX11-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 1403; GFX11-PAL-NEXT: s_clause 0x3 1404; GFX11-PAL-NEXT: 
scratch_store_b128 off, v[0:3], s32 offset:256 1405; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:272 1406; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:288 1407; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:304 1408; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] 1409; 1410; GFX12-PAL-LABEL: zero_init_small_offset_foo: 1411; GFX12-PAL: ; %bb.0: 1412; GFX12-PAL-NEXT: s_wait_loadcnt_dscnt 0x0 1413; GFX12-PAL-NEXT: s_wait_expcnt 0x0 1414; GFX12-PAL-NEXT: s_wait_samplecnt 0x0 1415; GFX12-PAL-NEXT: s_wait_bvhcnt 0x0 1416; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 1417; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s32 scope:SCOPE_SYS 1418; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 1419; GFX12-PAL-NEXT: s_mov_b32 s0, 0 1420; GFX12-PAL-NEXT: s_wait_alu 0xfffe 1421; GFX12-PAL-NEXT: s_mov_b32 s1, s0 1422; GFX12-PAL-NEXT: s_mov_b32 s2, s0 1423; GFX12-PAL-NEXT: s_mov_b32 s3, s0 1424; GFX12-PAL-NEXT: s_wait_alu 0xfffe 1425; GFX12-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 1426; GFX12-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 1427; GFX12-PAL-NEXT: s_clause 0x3 1428; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:256 1429; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:272 1430; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:288 1431; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:304 1432; GFX12-PAL-NEXT: s_setpc_b64 s[30:31] 1433 %padding = alloca [64 x i32], align 4, addrspace(5) 1434 %alloca = alloca [32 x i16], align 2, addrspace(5) 1435 %pad_gep = getelementptr inbounds [64 x i32], ptr addrspace(5) %padding, i32 0, i32 undef 1436 %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4 1437 call void @llvm.memset.p5.i64(ptr addrspace(5) align 2 dereferenceable(64) %alloca, i8 0, i64 64, i1 false) 1438 ret void 1439} 1440 1441define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { 1442; GFX9-LABEL: store_load_sindex_small_offset_kernel: 1443; GFX9: ; 
%bb.0: ; %bb 1444; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24 1445; GFX9-NEXT: s_add_u32 flat_scratch_lo, s8, s13 1446; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 1447; GFX9-NEXT: s_mov_b32 s1, 0 1448; GFX9-NEXT: scratch_load_dword v0, off, s1 glc 1449; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1450; GFX9-NEXT: s_lshl_b32 s1, s0, 2 1451; GFX9-NEXT: s_and_b32 s0, s0, 15 1452; GFX9-NEXT: v_mov_b32_e32 v0, 15 1453; GFX9-NEXT: s_addk_i32 s1, 0x100 1454; GFX9-NEXT: s_lshl_b32 s0, s0, 2 1455; GFX9-NEXT: scratch_store_dword off, v0, s1 1456; GFX9-NEXT: s_waitcnt vmcnt(0) 1457; GFX9-NEXT: s_addk_i32 s0, 0x100 1458; GFX9-NEXT: scratch_load_dword v0, off, s0 glc 1459; GFX9-NEXT: s_waitcnt vmcnt(0) 1460; GFX9-NEXT: s_endpgm 1461; 1462; GFX10-LABEL: store_load_sindex_small_offset_kernel: 1463; GFX10: ; %bb.0: ; %bb 1464; GFX10-NEXT: s_add_u32 s8, s8, s13 1465; GFX10-NEXT: s_addc_u32 s9, s9, 0 1466; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 1467; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 1468; GFX10-NEXT: s_load_dword s0, s[4:5], 0x24 1469; GFX10-NEXT: scratch_load_dword v0, off, off glc dlc 1470; GFX10-NEXT: s_waitcnt vmcnt(0) 1471; GFX10-NEXT: v_mov_b32_e32 v0, 15 1472; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1473; GFX10-NEXT: s_and_b32 s1, s0, 15 1474; GFX10-NEXT: s_lshl_b32 s0, s0, 2 1475; GFX10-NEXT: s_lshl_b32 s1, s1, 2 1476; GFX10-NEXT: s_addk_i32 s0, 0x100 1477; GFX10-NEXT: s_addk_i32 s1, 0x100 1478; GFX10-NEXT: scratch_store_dword off, v0, s0 1479; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1480; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc 1481; GFX10-NEXT: s_waitcnt vmcnt(0) 1482; GFX10-NEXT: s_endpgm 1483; 1484; GFX11-LABEL: store_load_sindex_small_offset_kernel: 1485; GFX11: ; %bb.0: ; %bb 1486; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 1487; GFX11-NEXT: scratch_load_b32 v0, off, off glc dlc 1488; GFX11-NEXT: s_waitcnt vmcnt(0) 1489; GFX11-NEXT: v_mov_b32_e32 v0, 15 1490; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1491; GFX11-NEXT: s_and_b32 s1, s0, 15 1492; 
GFX11-NEXT: s_lshl_b32 s0, s0, 2 1493; GFX11-NEXT: s_lshl_b32 s1, s1, 2 1494; GFX11-NEXT: s_addk_i32 s0, 0x100 1495; GFX11-NEXT: s_addk_i32 s1, 0x100 1496; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc 1497; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1498; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc 1499; GFX11-NEXT: s_waitcnt vmcnt(0) 1500; GFX11-NEXT: s_endpgm 1501; 1502; GFX12-LABEL: store_load_sindex_small_offset_kernel: 1503; GFX12: ; %bb.0: ; %bb 1504; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x24 1505; GFX12-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS 1506; GFX12-NEXT: s_wait_loadcnt 0x0 1507; GFX12-NEXT: v_mov_b32_e32 v0, 15 1508; GFX12-NEXT: s_wait_kmcnt 0x0 1509; GFX12-NEXT: s_and_b32 s1, s0, 15 1510; GFX12-NEXT: s_lshl_b32 s0, s0, 2 1511; GFX12-NEXT: s_lshl_b32 s1, s1, 2 1512; GFX12-NEXT: s_addk_co_i32 s0, 0x100 1513; GFX12-NEXT: s_addk_co_i32 s1, 0x100 1514; GFX12-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS 1515; GFX12-NEXT: s_wait_storecnt 0x0 1516; GFX12-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS 1517; GFX12-NEXT: s_wait_loadcnt 0x0 1518; GFX12-NEXT: s_endpgm 1519; 1520; GFX9-PAL-LABEL: store_load_sindex_small_offset_kernel: 1521; GFX9-PAL: ; %bb.0: ; %bb 1522; GFX9-PAL-NEXT: s_getpc_b64 s[12:13] 1523; GFX9-PAL-NEXT: s_mov_b32 s12, s0 1524; GFX9-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0 1525; GFX9-PAL-NEXT: s_mov_b32 s1, 0 1526; GFX9-PAL-NEXT: s_load_dword s0, s[4:5], 0x0 1527; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 1528; GFX9-PAL-NEXT: s_and_b32 s13, s13, 0xffff 1529; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s12, s11 1530; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 1531; GFX9-PAL-NEXT: scratch_load_dword v0, off, s1 glc 1532; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1533; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 1534; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 1535; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 1536; GFX9-PAL-NEXT: s_addk_i32 s1, 0x100 1537; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 1538; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 
1539; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1540; GFX9-PAL-NEXT: s_addk_i32 s0, 0x100 1541; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc 1542; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1543; GFX9-PAL-NEXT: s_endpgm 1544; 1545; GFX940-LABEL: store_load_sindex_small_offset_kernel: 1546; GFX940: ; %bb.0: ; %bb 1547; GFX940-NEXT: s_load_dword s0, s[4:5], 0x24 1548; GFX940-NEXT: scratch_load_dword v0, off, off sc0 sc1 1549; GFX940-NEXT: s_waitcnt vmcnt(0) 1550; GFX940-NEXT: v_mov_b32_e32 v0, 15 1551; GFX940-NEXT: s_waitcnt lgkmcnt(0) 1552; GFX940-NEXT: s_lshl_b32 s1, s0, 2 1553; GFX940-NEXT: s_and_b32 s0, s0, 15 1554; GFX940-NEXT: s_addk_i32 s1, 0x100 1555; GFX940-NEXT: s_lshl_b32 s0, s0, 2 1556; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 1557; GFX940-NEXT: s_waitcnt vmcnt(0) 1558; GFX940-NEXT: s_addk_i32 s0, 0x100 1559; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 1560; GFX940-NEXT: s_waitcnt vmcnt(0) 1561; GFX940-NEXT: s_endpgm 1562; 1563; GFX1010-PAL-LABEL: store_load_sindex_small_offset_kernel: 1564; GFX1010-PAL: ; %bb.0: ; %bb 1565; GFX1010-PAL-NEXT: s_getpc_b64 s[12:13] 1566; GFX1010-PAL-NEXT: s_mov_b32 s12, s0 1567; GFX1010-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0 1568; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 1569; GFX1010-PAL-NEXT: s_and_b32 s13, s13, 0xffff 1570; GFX1010-PAL-NEXT: s_add_u32 s12, s12, s11 1571; GFX1010-PAL-NEXT: s_addc_u32 s13, s13, 0 1572; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 1573; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 1574; GFX1010-PAL-NEXT: s_load_dword s0, s[4:5], 0x0 1575; GFX1010-PAL-NEXT: s_mov_b32 s1, 0 1576; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc 1577; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 1578; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 15 1579; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 1580; GFX1010-PAL-NEXT: s_and_b32 s1, s0, 15 1581; GFX1010-PAL-NEXT: s_lshl_b32 s0, s0, 2 1582; GFX1010-PAL-NEXT: s_lshl_b32 s1, s1, 2 1583; GFX1010-PAL-NEXT: s_addk_i32 s0, 0x100 
1584; GFX1010-PAL-NEXT: s_addk_i32 s1, 0x100 1585; GFX1010-PAL-NEXT: scratch_store_dword off, v0, s0 1586; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1587; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc 1588; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 1589; GFX1010-PAL-NEXT: s_endpgm 1590; 1591; GFX1030-PAL-LABEL: store_load_sindex_small_offset_kernel: 1592; GFX1030-PAL: ; %bb.0: ; %bb 1593; GFX1030-PAL-NEXT: s_getpc_b64 s[12:13] 1594; GFX1030-PAL-NEXT: s_mov_b32 s12, s0 1595; GFX1030-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0 1596; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 1597; GFX1030-PAL-NEXT: s_and_b32 s13, s13, 0xffff 1598; GFX1030-PAL-NEXT: s_add_u32 s12, s12, s11 1599; GFX1030-PAL-NEXT: s_addc_u32 s13, s13, 0 1600; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 1601; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 1602; GFX1030-PAL-NEXT: s_load_dword s0, s[4:5], 0x0 1603; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off glc dlc 1604; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 1605; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 15 1606; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 1607; GFX1030-PAL-NEXT: s_and_b32 s1, s0, 15 1608; GFX1030-PAL-NEXT: s_lshl_b32 s0, s0, 2 1609; GFX1030-PAL-NEXT: s_lshl_b32 s1, s1, 2 1610; GFX1030-PAL-NEXT: s_addk_i32 s0, 0x100 1611; GFX1030-PAL-NEXT: s_addk_i32 s1, 0x100 1612; GFX1030-PAL-NEXT: scratch_store_dword off, v0, s0 1613; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1614; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc 1615; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 1616; GFX1030-PAL-NEXT: s_endpgm 1617; 1618; GFX11-PAL-LABEL: store_load_sindex_small_offset_kernel: 1619; GFX11-PAL: ; %bb.0: ; %bb 1620; GFX11-PAL-NEXT: s_load_b32 s0, s[4:5], 0x0 1621; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off glc dlc 1622; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 1623; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 15 1624; GFX11-PAL-NEXT: s_waitcnt lgkmcnt(0) 1625; GFX11-PAL-NEXT: s_and_b32 s1, s0, 15 1626; GFX11-PAL-NEXT: 
s_lshl_b32 s0, s0, 2 1627; GFX11-PAL-NEXT: s_lshl_b32 s1, s1, 2 1628; GFX11-PAL-NEXT: s_addk_i32 s0, 0x100 1629; GFX11-PAL-NEXT: s_addk_i32 s1, 0x100 1630; GFX11-PAL-NEXT: scratch_store_b32 off, v0, s0 dlc 1631; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1632; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc 1633; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 1634; GFX11-PAL-NEXT: s_endpgm 1635; 1636; GFX12-PAL-LABEL: store_load_sindex_small_offset_kernel: 1637; GFX12-PAL: ; %bb.0: ; %bb 1638; GFX12-PAL-NEXT: s_load_b32 s0, s[4:5], 0x0 1639; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS 1640; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 1641; GFX12-PAL-NEXT: v_mov_b32_e32 v0, 15 1642; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 1643; GFX12-PAL-NEXT: s_and_b32 s1, s0, 15 1644; GFX12-PAL-NEXT: s_lshl_b32 s0, s0, 2 1645; GFX12-PAL-NEXT: s_lshl_b32 s1, s1, 2 1646; GFX12-PAL-NEXT: s_addk_co_i32 s0, 0x100 1647; GFX12-PAL-NEXT: s_addk_co_i32 s1, 0x100 1648; GFX12-PAL-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS 1649; GFX12-PAL-NEXT: s_wait_storecnt 0x0 1650; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS 1651; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 1652; GFX12-PAL-NEXT: s_endpgm 1653bb: 1654 %padding = alloca [64 x i32], align 4, addrspace(5) 1655 %i = alloca [32 x float], align 4, addrspace(5) 1656 %pad_gep = getelementptr inbounds [64 x i32], ptr addrspace(5) %padding, i32 0, i32 undef 1657 %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4 1658 %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %idx 1659 store volatile i32 15, ptr addrspace(5) %i7, align 4 1660 %i9 = and i32 %idx, 15 1661 %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9 1662 %i12 = load volatile i32, ptr addrspace(5) %i10, align 4 1663 ret void 1664} 1665

; Scalar index, small frame offset, shader entry point (amdgpu_ps).
; %idx is an inreg argument. The body first does a volatile load through a
; [64 x i32] padding alloca (256 bytes), then a volatile store of 15 to
; %i[%idx] and a volatile load from %i[%idx & 15]. The expected code addresses
; %i at 0x100, i.e. just past the padding array.
define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) {
; GFX9-LABEL: store_load_sindex_small_offset_foo:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    s_mov_b32 s0, 0
; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_lshl_b32 s0, s2, 2
; GFX9-NEXT:    s_addk_i32 s0, 0x100
; GFX9-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-NEXT:    scratch_store_dword off, v0, s0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_and_b32 s0, s2, 15
; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-NEXT:    s_addk_i32 s0, 0x100
; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_sindex_small_offset_foo:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    scratch_load_dword v0, off, off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, 15
; GFX10-NEXT:    s_and_b32 s0, s2, 15
; GFX10-NEXT:    s_lshl_b32 s1, s2, 2
; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
; GFX10-NEXT:    s_addk_i32 s1, 0x100
; GFX10-NEXT:    s_addk_i32 s0, 0x100
; GFX10-NEXT:    scratch_store_dword off, v0, s1
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, off, s0 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: store_load_sindex_small_offset_foo:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    scratch_load_b32 v0, off, off glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v0, 15
; GFX11-NEXT:    s_and_b32 s1, s0, 15
; GFX11-NEXT:    s_lshl_b32 s0, s0, 2
; GFX11-NEXT:    s_lshl_b32 s1, s1, 2
; GFX11-NEXT:    s_addk_i32 s0, 0x100
; GFX11-NEXT:    s_addk_i32 s1, 0x100
; GFX11-NEXT:    scratch_store_b32 off, v0, s0 dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: store_load_sindex_small_offset_foo:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    scratch_load_b32 v0, off, off scope:SCOPE_SYS
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    v_mov_b32_e32 v0, 15
; GFX12-NEXT:    s_and_b32 s1, s0, 15
; GFX12-NEXT:    s_lshl_b32 s0, s0, 2
; GFX12-NEXT:    s_lshl_b32 s1, s1, 2
; GFX12-NEXT:    s_addk_co_i32 s0, 0x100
; GFX12-NEXT:    s_addk_co_i32 s1, 0x100
; GFX12-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
; GFX12-NEXT:    s_wait_storecnt 0x0
; GFX12-NEXT:    scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: store_load_sindex_small_offset_foo:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-PAL-NEXT:    s_mov_b32 s1, 0
; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s1 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
; GFX9-PAL-NEXT:    s_addk_i32 s1, 0x100
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_addk_i32 s0, 0x100
; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX940-LABEL: store_load_sindex_small_offset_foo:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    scratch_load_dword v0, off, off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_lshl_b32 s1, s0, 2
; GFX940-NEXT:    s_and_b32 s0, s0, 15
; GFX940-NEXT:    s_addk_i32 s1, 0x100
; GFX940-NEXT:    v_mov_b32_e32 v0, 15
; GFX940-NEXT:    s_lshl_b32 s0, s0, 2
; GFX940-NEXT:    scratch_store_dword off, v0, s1 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_addk_i32 s0, 0x100
; GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_endpgm
;
; GFX1010-PAL-LABEL: store_load_sindex_small_offset_foo:
; GFX1010-PAL:       ; %bb.0: ; %bb
; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1010-PAL-NEXT:    s_mov_b32 s1, 0
; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX1010-PAL-NEXT:    s_and_b32 s1, s0, 15
; GFX1010-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX1010-PAL-NEXT:    s_lshl_b32 s1, s1, 2
; GFX1010-PAL-NEXT:    s_addk_i32 s0, 0x100
; GFX1010-PAL-NEXT:    s_addk_i32 s1, 0x100
; GFX1010-PAL-NEXT:    scratch_store_dword off, v0, s0
; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT:    s_endpgm
;
; GFX1030-PAL-LABEL: store_load_sindex_small_offset_foo:
; GFX1030-PAL:       ; %bb.0: ; %bb
; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, off glc dlc
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX1030-PAL-NEXT:    s_and_b32 s1, s0, 15
; GFX1030-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX1030-PAL-NEXT:    s_lshl_b32 s1, s1, 2
; GFX1030-PAL-NEXT:    s_addk_i32 s0, 0x100
; GFX1030-PAL-NEXT:    s_addk_i32 s1, 0x100
; GFX1030-PAL-NEXT:    scratch_store_dword off, v0, s0
; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT:    s_endpgm
;
; GFX11-PAL-LABEL: store_load_sindex_small_offset_foo:
; GFX11-PAL:       ; %bb.0: ; %bb
; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, off glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX11-PAL-NEXT:    s_and_b32 s1, s0, 15
; GFX11-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX11-PAL-NEXT:    s_lshl_b32 s1, s1, 2
; GFX11-PAL-NEXT:    s_addk_i32 s0, 0x100
; GFX11-PAL-NEXT:    s_addk_i32 s1, 0x100
; GFX11-PAL-NEXT:    scratch_store_b32 off, v0, s0 dlc
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    s_endpgm
;
; GFX12-PAL-LABEL: store_load_sindex_small_offset_foo:
; GFX12-PAL:       ; %bb.0: ; %bb
; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, off scope:SCOPE_SYS
; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
; GFX12-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX12-PAL-NEXT:    s_and_b32 s1, s0, 15
; GFX12-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX12-PAL-NEXT:    s_lshl_b32 s1, s1, 2
; GFX12-PAL-NEXT:    s_addk_co_i32 s0, 0x100
; GFX12-PAL-NEXT:    s_addk_co_i32 s1, 0x100
; GFX12-PAL-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
; GFX12-PAL-NEXT:    s_endpgm
bb:
  %padding = alloca [64 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  %pad_gep = getelementptr inbounds [64 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
  %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4
  %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %idx
  store volatile i32 15, ptr addrspace(5) %i7, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9
  %i12 = load volatile i32, ptr addrspace(5) %i10, align 4
  ret void
}

; Vector index, small frame offset, kernel entry point.
; The index is the workitem id (llvm.amdgcn.workitem.id.x). After the volatile
; padding load, 15 is stored (volatile) to %i[tid] and a volatile load reads
; %i[31 - tid]. In the expected code the load address is 0x100 minus the
; scaled workitem id, with the constant 31 * 4 carried as an immediate offset
; of 124.
define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
; GFX9-LABEL: store_load_vindex_small_offset_kernel:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s8, s13
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s9, 0
; GFX9-NEXT:    s_mov_b32 s0, 0
; GFX9-NEXT:    scratch_load_dword v1, off, s0 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_add_u32_e32 v1, 0x100, v0
; GFX9-NEXT:    v_mov_b32_e32 v2, 15
; GFX9-NEXT:    scratch_store_dword v1, v2, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_sub_u32_e32 v0, 0x100, v0
; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_vindex_small_offset_kernel:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s8, s8, s13
; GFX10-NEXT:    s_addc_u32 s9, s9, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v2, 15
; GFX10-NEXT:    scratch_load_dword v3, off, off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x100, v0
; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 0x100, v0
; GFX10-NEXT:    scratch_store_dword v1, v2, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: store_load_vindex_small_offset_kernel:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT:    scratch_load_b32 v3, off, off glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_and_b32_e32 v0, 0xffc, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 0x100, v0
; GFX11-NEXT:    scratch_store_b32 v0, v1, off offset:256 dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b32 v0, v2, off offset:124 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: store_load_vindex_small_offset_kernel:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT:    scratch_load_b32 v3, off, off scope:SCOPE_SYS
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    v_and_b32_e32 v0, 0xffc, v0
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT:    v_sub_nc_u32_e32 v2, 0x100, v0
; GFX12-NEXT:    scratch_store_b32 v0, v1, off offset:256 scope:SCOPE_SYS
; GFX12-NEXT:    s_wait_storecnt 0x0
; GFX12-NEXT:    scratch_load_b32 v0, v2, off offset:124 scope:SCOPE_SYS
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: store_load_vindex_small_offset_kernel:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_getpc_b64 s[12:13]
; GFX9-PAL-NEXT:    s_mov_b32 s12, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[12:13], s[12:13], 0x0
; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
; GFX9-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 15
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s13, s13, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s12, s11
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-PAL-NEXT:    scratch_load_dword v1, off, s0 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    v_add_u32_e32 v1, 0x100, v0
; GFX9-PAL-NEXT:    scratch_store_dword v1, v2, off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    v_sub_u32_e32 v0, 0x100, v0
; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX940-LABEL: store_load_vindex_small_offset_kernel:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    scratch_load_dword v1, off, off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX940-NEXT:    v_and_b32_e32 v0, 0xffc, v0
; GFX940-NEXT:    v_mov_b32_e32 v1, 15
; GFX940-NEXT:    scratch_store_dword v0, v1, off offset:256 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_sub_u32_e32 v0, 0x100, v0
; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:124 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_endpgm
;
; GFX1010-PAL-LABEL: store_load_vindex_small_offset_kernel:
; GFX1010-PAL:       ; %bb.0: ; %bb
; GFX1010-PAL-NEXT:    s_getpc_b64 s[12:13]
; GFX1010-PAL-NEXT:    s_mov_b32 s12, s0
; GFX1010-PAL-NEXT:    s_load_dwordx2 s[12:13], s[12:13], 0x0
; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1010-PAL-NEXT:    s_and_b32 s13, s13, 0xffff
; GFX1010-PAL-NEXT:    s_add_u32 s12, s12, s11
; GFX1010-PAL-NEXT:    s_addc_u32 s13, s13, 0
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX1010-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v2, 15
; GFX1010-PAL-NEXT:    s_mov_b32 s0, 0
; GFX1010-PAL-NEXT:    scratch_load_dword v3, off, s0 glc dlc
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v1, 0x100, v0
; GFX1010-PAL-NEXT:    v_sub_nc_u32_e32 v0, 0x100, v0
; GFX1010-PAL-NEXT:    scratch_store_dword v1, v2, off
; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT:    s_endpgm
;
; GFX1030-PAL-LABEL: store_load_vindex_small_offset_kernel:
; GFX1030-PAL:       ; %bb.0: ; %bb
; GFX1030-PAL-NEXT:    s_getpc_b64 s[12:13]
; GFX1030-PAL-NEXT:    s_mov_b32 s12, s0
; GFX1030-PAL-NEXT:    s_load_dwordx2 s[12:13], s[12:13], 0x0
; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-PAL-NEXT:    s_and_b32 s13, s13, 0xffff
; GFX1030-PAL-NEXT:    s_add_u32 s12, s12, s11
; GFX1030-PAL-NEXT:    s_addc_u32 s13, s13, 0
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX1030-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, 15
; GFX1030-PAL-NEXT:    scratch_load_dword v3, off, off glc dlc
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT:    v_add_nc_u32_e32 v1, 0x100, v0
; GFX1030-PAL-NEXT:    v_sub_nc_u32_e32 v0, 0x100, v0
; GFX1030-PAL-NEXT:    scratch_store_dword v1, v2, off
; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT:    s_endpgm
;
; GFX11-PAL-LABEL: store_load_vindex_small_offset_kernel:
; GFX11-PAL:       ; %bb.0: ; %bb
; GFX11-PAL-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-PAL-NEXT:    scratch_load_b32 v3, off, off glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    v_and_b32_e32 v0, 0xffc, v0
; GFX11-PAL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-PAL-NEXT:    v_sub_nc_u32_e32 v2, 0x100, v0
; GFX11-PAL-NEXT:    scratch_store_b32 v0, v1, off offset:256 dlc
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    scratch_load_b32 v0, v2, off offset:124 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    s_endpgm
;
; GFX12-PAL-LABEL: store_load_vindex_small_offset_kernel:
; GFX12-PAL:       ; %bb.0: ; %bb
; GFX12-PAL-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-PAL-NEXT:    scratch_load_b32 v3, off, off scope:SCOPE_SYS
; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
; GFX12-PAL-NEXT:    v_and_b32_e32 v0, 0xffc, v0
; GFX12-PAL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-PAL-NEXT:    v_sub_nc_u32_e32 v2, 0x100, v0
; GFX12-PAL-NEXT:    scratch_store_b32 v0, v1, off offset:256 scope:SCOPE_SYS
; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
; GFX12-PAL-NEXT:    scratch_load_b32 v0, v2, off offset:124 scope:SCOPE_SYS
; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
; GFX12-PAL-NEXT:    s_endpgm
bb:
  %padding = alloca [64 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  %pad_gep = getelementptr inbounds [64 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
  %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4
  %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
  %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i2
  store volatile i32 15, ptr addrspace(5) %i7, align 4
  %i9 = sub nsw i32 31, %i2
  %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9
  %i12 = load volatile i32, ptr addrspace(5) %i10, align 4
  ret void
}

; Vector index, small frame offset, ordinary (non-entry) function.
; %idx is a plain argument (v0 in the expected code) and the function returns
; with s_setpc_b64. The volatile padding load is addressed from s32; the
; volatile store to %i[%idx] and the volatile load from %i[%idx & 15] are
; addressed 0x100 bytes above s32, past the 256-byte padding array.
define void @store_load_vindex_small_offset_foo(i32 %idx) {
; GFX9-LABEL: store_load_vindex_small_offset_foo:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    scratch_load_dword v1, off, s32 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_add_i32 s0, s32, 0x100
; GFX9-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
; GFX9-NEXT:    v_mov_b32_e32 v3, 15
; GFX9-NEXT:    v_and_b32_e32 v0, 15, v0
; GFX9-NEXT:    scratch_store_dword v2, v3, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_vindex_small_offset_foo:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_and_b32_e32 v1, 15, v0
; GFX10-NEXT:    s_add_i32 s0, s32, 0x100
; GFX10-NEXT:    v_mov_b32_e32 v2, 15
; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
; GFX10-NEXT:    s_add_i32 s0, s32, 0x100
; GFX10-NEXT:    scratch_load_dword v3, off, s32 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_lshl_add_u32 v1, v1, 2, s0
; GFX10-NEXT:    scratch_store_dword v0, v2, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: store_load_vindex_small_offset_foo:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
; GFX11-NEXT:    s_add_i32 s0, s32, 0x100
; GFX11-NEXT:    scratch_load_b32 v3, off, s32 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
; GFX11-NEXT:    scratch_store_b32 v0, v2, off dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b32 v0, v1, s32 offset:256 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-LABEL: store_load_vindex_small_offset_foo:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT:    scratch_load_b32 v3, off, s32 scope:SCOPE_SYS
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
; GFX12-NEXT:    s_wait_storecnt 0x0
; GFX12-NEXT:    scratch_store_b32 v0, v2, s32 offset:256 scope:SCOPE_SYS
; GFX12-NEXT:    s_wait_storecnt 0x0
; GFX12-NEXT:    scratch_load_b32 v0, v1, s32 offset:256 scope:SCOPE_SYS
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_vindex_small_offset_foo:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_dword v1, off, s32 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_add_i32 s0, s32, 0x100
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-PAL-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
; GFX9-PAL-NEXT:    v_and_b32_e32 v0, 15, v0
; GFX9-PAL-NEXT:    scratch_store_dword v2, v3, off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_vindex_small_offset_foo:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    scratch_load_dword v1, off, s32 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_add_i32 s0, s32, 0x100
; GFX940-NEXT:    v_mov_b32_e32 v1, s0
; GFX940-NEXT:    v_lshl_add_u32 v1, v0, 2, v1
; GFX940-NEXT:    v_mov_b32_e32 v2, 15
; GFX940-NEXT:    v_and_b32_e32 v0, 15, v0
; GFX940-NEXT:    scratch_store_dword v1, v2, off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX940-NEXT:    scratch_load_dword v0, v0, s32 offset:256 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_vindex_small_offset_foo:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    v_and_b32_e32 v1, 15, v0
; GFX10-PAL-NEXT:    s_add_i32 s0, s32, 0x100
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 15
; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
; GFX10-PAL-NEXT:    s_add_i32 s0, s32, 0x100
; GFX10-PAL-NEXT:    scratch_load_dword v3, off, s32 glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    v_lshl_add_u32 v1, v1, 2, s0
; GFX10-PAL-NEXT:    scratch_store_dword v0, v2, off
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dword v0, v1, off glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-PAL-LABEL: store_load_vindex_small_offset_foo:
; GFX11-PAL:       ; %bb.0: ; %bb
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-PAL-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
; GFX11-PAL-NEXT:    s_add_i32 s0, s32, 0x100
; GFX11-PAL-NEXT:    scratch_load_b32 v3, off, s32 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
; GFX11-PAL-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
; GFX11-PAL-NEXT:    scratch_store_b32 v0, v2, off dlc
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    scratch_load_b32 v0, v1, s32 offset:256 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-PAL-LABEL: store_load_vindex_small_offset_foo:
; GFX12-PAL:       ; %bb.0: ; %bb
; GFX12-PAL-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-PAL-NEXT:    s_wait_expcnt 0x0
; GFX12-PAL-NEXT:    s_wait_samplecnt 0x0
; GFX12-PAL-NEXT:    s_wait_bvhcnt 0x0
; GFX12-PAL-NEXT:    s_wait_kmcnt 0x0
; GFX12-PAL-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
; GFX12-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX12-PAL-NEXT:    scratch_load_b32 v3, off, s32 scope:SCOPE_SYS
; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
; GFX12-PAL-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
; GFX12-PAL-NEXT:    scratch_store_b32 v0, v2, s32 offset:256 scope:SCOPE_SYS
; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
; GFX12-PAL-NEXT:    scratch_load_b32 v0, v1, s32 offset:256 scope:SCOPE_SYS
; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
; GFX12-PAL-NEXT:    s_setpc_b64 s[30:31]
bb:
  %padding = alloca [64 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  %pad_gep = getelementptr inbounds [64 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
  %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4
  %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %idx
  store volatile i32 15, ptr addrspace(5) %i7, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9
  %i12 = load volatile i32, ptr addrspace(5) %i10, align 4
  ret void
}
2227 2228define amdgpu_kernel void @zero_init_large_offset_kernel() { 2229; GFX9-LABEL: zero_init_large_offset_kernel: 2230; GFX9: ; %bb.0: 2231; GFX9-NEXT: s_add_u32 flat_scratch_lo, s8, s13 2232; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 2233; GFX9-NEXT: s_mov_b32 s0, 0 2234; GFX9-NEXT: scratch_load_dword v0, off, s0 offset:4 glc 2235; GFX9-NEXT: s_waitcnt vmcnt(0) 2236; GFX9-NEXT: s_mov_b32 s1, s0 2237; GFX9-NEXT: s_mov_b32 s2, s0 2238; GFX9-NEXT: s_mov_b32 s3, s0 2239; GFX9-NEXT: v_mov_b32_e32 v0, s0 2240; GFX9-NEXT: v_mov_b32_e32 v1, s1 2241; 
GFX9-NEXT: v_mov_b32_e32 v2, s2 2242; GFX9-NEXT: v_mov_b32_e32 v3, s3 2243; GFX9-NEXT: s_movk_i32 s0, 0x4004 2244; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 2245; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16 2246; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:32 2247; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:48 2248; GFX9-NEXT: s_endpgm 2249; 2250; GFX10-LABEL: zero_init_large_offset_kernel: 2251; GFX10: ; %bb.0: 2252; GFX10-NEXT: s_add_u32 s8, s8, s13 2253; GFX10-NEXT: s_addc_u32 s9, s9, 0 2254; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 2255; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 2256; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc 2257; GFX10-NEXT: s_waitcnt vmcnt(0) 2258; GFX10-NEXT: s_mov_b32 s0, 0 2259; GFX10-NEXT: s_mov_b32 s1, s0 2260; GFX10-NEXT: s_mov_b32 s2, s0 2261; GFX10-NEXT: s_mov_b32 s3, s0 2262; GFX10-NEXT: v_mov_b32_e32 v0, s0 2263; GFX10-NEXT: v_mov_b32_e32 v1, s1 2264; GFX10-NEXT: v_mov_b32_e32 v2, s2 2265; GFX10-NEXT: v_mov_b32_e32 v3, s3 2266; GFX10-NEXT: s_movk_i32 s0, 0x4004 2267; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s0 2268; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16 2269; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:32 2270; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:48 2271; GFX10-NEXT: s_endpgm 2272; 2273; GFX11-LABEL: zero_init_large_offset_kernel: 2274; GFX11: ; %bb.0: 2275; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc 2276; GFX11-NEXT: s_waitcnt vmcnt(0) 2277; GFX11-NEXT: s_mov_b32 s0, 0 2278; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2279; GFX11-NEXT: s_mov_b32 s1, s0 2280; GFX11-NEXT: s_mov_b32 s2, s0 2281; GFX11-NEXT: s_mov_b32 s3, s0 2282; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 2283; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 2284; GFX11-NEXT: s_movk_i32 s0, 0x4004 2285; GFX11-NEXT: s_clause 0x3 2286; GFX11-NEXT: scratch_store_b128 off, 
v[0:3], s0 2287; GFX11-NEXT: scratch_store_b128 off, v[0:3], s0 offset:16 2288; GFX11-NEXT: scratch_store_b128 off, v[0:3], s0 offset:32 2289; GFX11-NEXT: scratch_store_b128 off, v[0:3], s0 offset:48 2290; GFX11-NEXT: s_endpgm 2291; 2292; GFX12-LABEL: zero_init_large_offset_kernel: 2293; GFX12: ; %bb.0: 2294; GFX12-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS 2295; GFX12-NEXT: s_wait_loadcnt 0x0 2296; GFX12-NEXT: s_mov_b32 s0, 0 2297; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2298; GFX12-NEXT: s_mov_b32 s1, s0 2299; GFX12-NEXT: s_mov_b32 s2, s0 2300; GFX12-NEXT: s_mov_b32 s3, s0 2301; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 2302; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 2303; GFX12-NEXT: s_clause 0x3 2304; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:16384 2305; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:16400 2306; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:16416 2307; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:16432 2308; GFX12-NEXT: s_endpgm 2309; 2310; GFX9-PAL-LABEL: zero_init_large_offset_kernel: 2311; GFX9-PAL: ; %bb.0: 2312; GFX9-PAL-NEXT: s_getpc_b64 s[12:13] 2313; GFX9-PAL-NEXT: s_mov_b32 s12, s0 2314; GFX9-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0 2315; GFX9-PAL-NEXT: s_mov_b32 s0, 0 2316; GFX9-PAL-NEXT: s_mov_b32 s1, s0 2317; GFX9-PAL-NEXT: s_mov_b32 s2, s0 2318; GFX9-PAL-NEXT: s_mov_b32 s3, s0 2319; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 2320; GFX9-PAL-NEXT: s_and_b32 s13, s13, 0xffff 2321; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s12, s11 2322; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 2323; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 offset:4 glc 2324; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2325; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 2326; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 2327; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 2328; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 2329; GFX9-PAL-NEXT: s_movk_i32 s0, 0x4004 2330; GFX9-PAL-NEXT: scratch_store_dwordx4 off, 
v[0:3], s0 2331; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16 2332; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:32 2333; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:48 2334; GFX9-PAL-NEXT: s_endpgm 2335; 2336; GFX940-LABEL: zero_init_large_offset_kernel: 2337; GFX940: ; %bb.0: 2338; GFX940-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1 2339; GFX940-NEXT: s_waitcnt vmcnt(0) 2340; GFX940-NEXT: s_mov_b32 s0, 0 2341; GFX940-NEXT: s_mov_b32 s1, s0 2342; GFX940-NEXT: s_mov_b32 s2, s0 2343; GFX940-NEXT: s_mov_b32 s3, s0 2344; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 2345; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3] 2346; GFX940-NEXT: s_movk_i32 s0, 0x4004 2347; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s0 sc0 sc1 2348; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16 sc0 sc1 2349; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:32 sc0 sc1 2350; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:48 sc0 sc1 2351; GFX940-NEXT: s_endpgm 2352; 2353; GFX1010-PAL-LABEL: zero_init_large_offset_kernel: 2354; GFX1010-PAL: ; %bb.0: 2355; GFX1010-PAL-NEXT: s_getpc_b64 s[12:13] 2356; GFX1010-PAL-NEXT: s_mov_b32 s12, s0 2357; GFX1010-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0 2358; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 2359; GFX1010-PAL-NEXT: s_and_b32 s13, s13, 0xffff 2360; GFX1010-PAL-NEXT: s_add_u32 s12, s12, s11 2361; GFX1010-PAL-NEXT: s_addc_u32 s13, s13, 0 2362; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 2363; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 2364; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 2365; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s0 offset:4 glc dlc 2366; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 2367; GFX1010-PAL-NEXT: s_mov_b32 s1, s0 2368; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 2369; GFX1010-PAL-NEXT: s_mov_b32 s3, s0 2370; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, s0 2371; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, s1 2372; GFX1010-PAL-NEXT: 
v_mov_b32_e32 v2, s2 2373; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, s3 2374; GFX1010-PAL-NEXT: s_movk_i32 s0, 0x4004 2375; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 2376; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16 2377; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:32 2378; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:48 2379; GFX1010-PAL-NEXT: s_endpgm 2380; 2381; GFX1030-PAL-LABEL: zero_init_large_offset_kernel: 2382; GFX1030-PAL: ; %bb.0: 2383; GFX1030-PAL-NEXT: s_getpc_b64 s[12:13] 2384; GFX1030-PAL-NEXT: s_mov_b32 s12, s0 2385; GFX1030-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0 2386; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 2387; GFX1030-PAL-NEXT: s_and_b32 s13, s13, 0xffff 2388; GFX1030-PAL-NEXT: s_add_u32 s12, s12, s11 2389; GFX1030-PAL-NEXT: s_addc_u32 s13, s13, 0 2390; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 2391; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 2392; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc 2393; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 2394; GFX1030-PAL-NEXT: s_mov_b32 s0, 0 2395; GFX1030-PAL-NEXT: s_mov_b32 s1, s0 2396; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 2397; GFX1030-PAL-NEXT: s_mov_b32 s3, s0 2398; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, s0 2399; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, s1 2400; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, s2 2401; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, s3 2402; GFX1030-PAL-NEXT: s_movk_i32 s0, 0x4004 2403; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 2404; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16 2405; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:32 2406; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:48 2407; GFX1030-PAL-NEXT: s_endpgm 2408; 2409; GFX11-PAL-LABEL: zero_init_large_offset_kernel: 2410; GFX11-PAL: ; %bb.0: 2411; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc 2412; GFX11-PAL-NEXT: s_waitcnt 
vmcnt(0) 2413; GFX11-PAL-NEXT: s_mov_b32 s0, 0 2414; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2415; GFX11-PAL-NEXT: s_mov_b32 s1, s0 2416; GFX11-PAL-NEXT: s_mov_b32 s2, s0 2417; GFX11-PAL-NEXT: s_mov_b32 s3, s0 2418; GFX11-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 2419; GFX11-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 2420; GFX11-PAL-NEXT: s_movk_i32 s0, 0x4004 2421; GFX11-PAL-NEXT: s_clause 0x3 2422; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s0 2423; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s0 offset:16 2424; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s0 offset:32 2425; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s0 offset:48 2426; GFX11-PAL-NEXT: s_endpgm 2427; 2428; GFX12-PAL-LABEL: zero_init_large_offset_kernel: 2429; GFX12-PAL: ; %bb.0: 2430; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS 2431; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 2432; GFX12-PAL-NEXT: s_mov_b32 s0, 0 2433; GFX12-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2434; GFX12-PAL-NEXT: s_mov_b32 s1, s0 2435; GFX12-PAL-NEXT: s_mov_b32 s2, s0 2436; GFX12-PAL-NEXT: s_mov_b32 s3, s0 2437; GFX12-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 2438; GFX12-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 2439; GFX12-PAL-NEXT: s_clause 0x3 2440; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:16384 2441; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:16400 2442; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:16416 2443; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:16432 2444; GFX12-PAL-NEXT: s_endpgm 2445 %padding = alloca [4096 x i32], align 4, addrspace(5) ; 4096 x i32 = 16384 bytes of padding, which is why the checks for this kernel reach the memset target at 0x4004 (offset:16384 and up for the gfx12 prefixes) 2446 %alloca = alloca [32 x i16], align 2, addrspace(5) ; 32 x i16 = 64 bytes, the buffer handed to the memset call below 2447 %pad_gep = getelementptr inbounds [4096 x i32], ptr addrspace(5) %padding, i32 0, i32 undef ; undef index, so no specific element of the padding is requested 2448 %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4 ; volatile load whose result is never used - NOTE(review) presumably present only to keep the padding alloca referenced, confirm 2449 call void @llvm.memset.p5.i64(ptr addrspace(5) align 2 dereferenceable(64) %alloca, i8 0, 
i64 64, i1 false) 2450 ret void 2451} 2452 2453define void @zero_init_large_offset_foo() { 2454; GFX9-LABEL: zero_init_large_offset_foo: 2455; GFX9: ; %bb.0: 2456; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2457; GFX9-NEXT: scratch_load_dword v0, off, s32 offset:4 glc 2458; GFX9-NEXT: s_waitcnt vmcnt(0) 2459; GFX9-NEXT: s_mov_b32 s0, 0 2460; GFX9-NEXT: s_mov_b32 s1, s0 2461; GFX9-NEXT: s_mov_b32 s2, s0 2462; GFX9-NEXT: s_mov_b32 s3, s0 2463; GFX9-NEXT: v_mov_b32_e32 v0, s0 2464; GFX9-NEXT: v_mov_b32_e32 v1, s1 2465; GFX9-NEXT: v_mov_b32_e32 v2, s2 2466; GFX9-NEXT: v_mov_b32_e32 v3, s3 2467; GFX9-NEXT: s_add_i32 s0, s32, 0x4004 2468; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 2469; GFX9-NEXT: s_add_i32 s0, s32, 0x4004 2470; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16 2471; GFX9-NEXT: s_add_i32 s0, s32, 0x4004 2472; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:32 2473; GFX9-NEXT: s_add_i32 s0, s32, 0x4004 2474; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:48 2475; GFX9-NEXT: s_waitcnt vmcnt(0) 2476; GFX9-NEXT: s_setpc_b64 s[30:31] 2477; 2478; GFX10-LABEL: zero_init_large_offset_foo: 2479; GFX10: ; %bb.0: 2480; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2481; GFX10-NEXT: scratch_load_dword v0, off, s32 offset:4 glc dlc 2482; GFX10-NEXT: s_waitcnt vmcnt(0) 2483; GFX10-NEXT: s_mov_b32 s0, 0 2484; GFX10-NEXT: s_mov_b32 s1, s0 2485; GFX10-NEXT: s_mov_b32 s2, s0 2486; GFX10-NEXT: s_mov_b32 s3, s0 2487; GFX10-NEXT: v_mov_b32_e32 v0, s0 2488; GFX10-NEXT: v_mov_b32_e32 v1, s1 2489; GFX10-NEXT: v_mov_b32_e32 v2, s2 2490; GFX10-NEXT: v_mov_b32_e32 v3, s3 2491; GFX10-NEXT: s_add_i32 s0, s32, 0x4004 2492; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s0 2493; GFX10-NEXT: s_add_i32 s0, s32, 0x4004 2494; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16 2495; GFX10-NEXT: s_add_i32 s0, s32, 0x4004 2496; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:32 2497; GFX10-NEXT: s_add_i32 s0, s32, 0x4004 2498; 
GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:48 2499; GFX10-NEXT: s_setpc_b64 s[30:31] 2500; 2501; GFX11-LABEL: zero_init_large_offset_foo: 2502; GFX11: ; %bb.0: 2503; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2504; GFX11-NEXT: scratch_load_b32 v0, off, s32 offset:4 glc dlc 2505; GFX11-NEXT: s_waitcnt vmcnt(0) 2506; GFX11-NEXT: s_mov_b32 s0, 0 2507; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2508; GFX11-NEXT: s_mov_b32 s1, s0 2509; GFX11-NEXT: s_mov_b32 s2, s0 2510; GFX11-NEXT: s_mov_b32 s3, s0 2511; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 2512; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 2513; GFX11-NEXT: s_add_i32 s0, s32, 0x4004 2514; GFX11-NEXT: scratch_store_b128 off, v[0:3], s0 2515; GFX11-NEXT: s_add_i32 s0, s32, 0x4004 2516; GFX11-NEXT: scratch_store_b128 off, v[0:3], s0 offset:16 2517; GFX11-NEXT: s_add_i32 s0, s32, 0x4004 2518; GFX11-NEXT: scratch_store_b128 off, v[0:3], s0 offset:32 2519; GFX11-NEXT: s_add_i32 s0, s32, 0x4004 2520; GFX11-NEXT: scratch_store_b128 off, v[0:3], s0 offset:48 2521; GFX11-NEXT: s_setpc_b64 s[30:31] 2522; 2523; GFX12-LABEL: zero_init_large_offset_foo: 2524; GFX12: ; %bb.0: 2525; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 2526; GFX12-NEXT: s_wait_expcnt 0x0 2527; GFX12-NEXT: s_wait_samplecnt 0x0 2528; GFX12-NEXT: s_wait_bvhcnt 0x0 2529; GFX12-NEXT: s_wait_kmcnt 0x0 2530; GFX12-NEXT: scratch_load_b32 v0, off, s32 scope:SCOPE_SYS 2531; GFX12-NEXT: s_wait_loadcnt 0x0 2532; GFX12-NEXT: s_mov_b32 s0, 0 2533; GFX12-NEXT: s_wait_alu 0xfffe 2534; GFX12-NEXT: s_mov_b32 s1, s0 2535; GFX12-NEXT: s_mov_b32 s2, s0 2536; GFX12-NEXT: s_mov_b32 s3, s0 2537; GFX12-NEXT: s_wait_alu 0xfffe 2538; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 2539; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 2540; GFX12-NEXT: s_clause 0x3 2541; GFX12-NEXT: scratch_store_b128 off, v[0:3], s32 offset:16384 2542; GFX12-NEXT: scratch_store_b128 off, v[0:3], s32 offset:16400 2543; 
GFX12-NEXT: scratch_store_b128 off, v[0:3], s32 offset:16416 2544; GFX12-NEXT: scratch_store_b128 off, v[0:3], s32 offset:16432 2545; GFX12-NEXT: s_setpc_b64 s[30:31] 2546; 2547; GFX9-PAL-LABEL: zero_init_large_offset_foo: 2548; GFX9-PAL: ; %bb.0: 2549; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2550; GFX9-PAL-NEXT: scratch_load_dword v0, off, s32 offset:4 glc 2551; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2552; GFX9-PAL-NEXT: s_mov_b32 s0, 0 2553; GFX9-PAL-NEXT: s_mov_b32 s1, s0 2554; GFX9-PAL-NEXT: s_mov_b32 s2, s0 2555; GFX9-PAL-NEXT: s_mov_b32 s3, s0 2556; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 2557; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 2558; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 2559; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 2560; GFX9-PAL-NEXT: s_add_i32 s0, s32, 0x4004 2561; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 2562; GFX9-PAL-NEXT: s_add_i32 s0, s32, 0x4004 2563; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16 2564; GFX9-PAL-NEXT: s_add_i32 s0, s32, 0x4004 2565; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:32 2566; GFX9-PAL-NEXT: s_add_i32 s0, s32, 0x4004 2567; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:48 2568; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2569; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 2570; 2571; GFX940-LABEL: zero_init_large_offset_foo: 2572; GFX940: ; %bb.0: 2573; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2574; GFX940-NEXT: scratch_load_dword v0, off, s32 offset:4 sc0 sc1 2575; GFX940-NEXT: s_waitcnt vmcnt(0) 2576; GFX940-NEXT: s_mov_b32 s0, 0 2577; GFX940-NEXT: s_mov_b32 s1, s0 2578; GFX940-NEXT: s_mov_b32 s2, s0 2579; GFX940-NEXT: s_mov_b32 s3, s0 2580; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 2581; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3] 2582; GFX940-NEXT: s_add_i32 s0, s32, 0x4004 2583; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s0 sc0 sc1 2584; GFX940-NEXT: s_add_i32 s0, s32, 0x4004 2585; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16 sc0 sc1 2586; GFX940-NEXT: 
s_add_i32 s0, s32, 0x4004 2587; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:32 sc0 sc1 2588; GFX940-NEXT: s_add_i32 s0, s32, 0x4004 2589; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:48 sc0 sc1 2590; GFX940-NEXT: s_waitcnt vmcnt(0) 2591; GFX940-NEXT: s_setpc_b64 s[30:31] 2592; 2593; GFX1010-PAL-LABEL: zero_init_large_offset_foo: 2594; GFX1010-PAL: ; %bb.0: 2595; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2596; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s32 offset:4 glc dlc 2597; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 2598; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 2599; GFX1010-PAL-NEXT: s_mov_b32 s1, s0 2600; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 2601; GFX1010-PAL-NEXT: s_mov_b32 s3, s0 2602; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, s0 2603; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, s1 2604; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, s2 2605; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, s3 2606; GFX1010-PAL-NEXT: s_add_i32 s0, s32, 0x4004 2607; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 2608; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 2609; GFX1010-PAL-NEXT: s_add_i32 s0, s32, 0x4004 2610; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16 2611; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 2612; GFX1010-PAL-NEXT: s_add_i32 s0, s32, 0x4004 2613; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:32 2614; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 2615; GFX1010-PAL-NEXT: s_add_i32 s0, s32, 0x4004 2616; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:48 2617; GFX1010-PAL-NEXT: s_setpc_b64 s[30:31] 2618; 2619; GFX1030-PAL-LABEL: zero_init_large_offset_foo: 2620; GFX1030-PAL: ; %bb.0: 2621; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2622; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s32 offset:4 glc dlc 2623; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 2624; GFX1030-PAL-NEXT: s_mov_b32 s0, 0 2625; GFX1030-PAL-NEXT: s_mov_b32 s1, s0 2626; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 2627; GFX1030-PAL-NEXT: s_mov_b32 
s3, s0 2628; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, s0 2629; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, s1 2630; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, s2 2631; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, s3 2632; GFX1030-PAL-NEXT: s_add_i32 s0, s32, 0x4004 2633; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 2634; GFX1030-PAL-NEXT: s_add_i32 s0, s32, 0x4004 2635; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16 2636; GFX1030-PAL-NEXT: s_add_i32 s0, s32, 0x4004 2637; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:32 2638; GFX1030-PAL-NEXT: s_add_i32 s0, s32, 0x4004 2639; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:48 2640; GFX1030-PAL-NEXT: s_setpc_b64 s[30:31] 2641; 2642; GFX11-PAL-LABEL: zero_init_large_offset_foo: 2643; GFX11-PAL: ; %bb.0: 2644; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2645; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s32 offset:4 glc dlc 2646; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 2647; GFX11-PAL-NEXT: s_mov_b32 s0, 0 2648; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2649; GFX11-PAL-NEXT: s_mov_b32 s1, s0 2650; GFX11-PAL-NEXT: s_mov_b32 s2, s0 2651; GFX11-PAL-NEXT: s_mov_b32 s3, s0 2652; GFX11-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 2653; GFX11-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 2654; GFX11-PAL-NEXT: s_add_i32 s0, s32, 0x4004 2655; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s0 2656; GFX11-PAL-NEXT: s_add_i32 s0, s32, 0x4004 2657; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s0 offset:16 2658; GFX11-PAL-NEXT: s_add_i32 s0, s32, 0x4004 2659; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s0 offset:32 2660; GFX11-PAL-NEXT: s_add_i32 s0, s32, 0x4004 2661; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s0 offset:48 2662; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] 2663; 2664; GFX12-PAL-LABEL: zero_init_large_offset_foo: 2665; GFX12-PAL: ; %bb.0: 2666; GFX12-PAL-NEXT: s_wait_loadcnt_dscnt 0x0 2667; GFX12-PAL-NEXT: s_wait_expcnt 0x0 2668; GFX12-PAL-NEXT: 
s_wait_samplecnt 0x0 2669; GFX12-PAL-NEXT: s_wait_bvhcnt 0x0 2670; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 2671; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s32 scope:SCOPE_SYS 2672; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 2673; GFX12-PAL-NEXT: s_mov_b32 s0, 0 2674; GFX12-PAL-NEXT: s_wait_alu 0xfffe 2675; GFX12-PAL-NEXT: s_mov_b32 s1, s0 2676; GFX12-PAL-NEXT: s_mov_b32 s2, s0 2677; GFX12-PAL-NEXT: s_mov_b32 s3, s0 2678; GFX12-PAL-NEXT: s_wait_alu 0xfffe 2679; GFX12-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 2680; GFX12-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 2681; GFX12-PAL-NEXT: s_clause 0x3 2682; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:16384 2683; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:16400 2684; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:16416 2685; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:16432 2686; GFX12-PAL-NEXT: s_setpc_b64 s[30:31] 2687 %padding = alloca [4096 x i32], align 4, addrspace(5) ; 4096 x i32 = 16384 bytes of padding, which is why the checks for this function form the memset address as s32 + 0x4004 (s32 offset:16384 and up for the gfx12 prefixes) 2688 %alloca = alloca [32 x i16], align 2, addrspace(5) ; 32 x i16 = 64 bytes, the buffer the memset below fills 2689 %pad_gep = getelementptr inbounds [4096 x i32], ptr addrspace(5) %padding, i32 0, i32 undef ; undef index, so no specific element of the padding is requested 2690 %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4 ; volatile load whose result is never used - NOTE(review) presumably present only to keep the padding alloca referenced, confirm 2691 call void @llvm.memset.p5.i64(ptr addrspace(5) align 2 dereferenceable(64) %alloca, i8 0, i64 64, i1 false) ; zero-fills all 64 bytes of %alloca, checked as four 16-byte scratch stores at +0, +16, +32 and +48 2692 ret void 2693} 2694 2695define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { 2696; GFX9-LABEL: store_load_sindex_large_offset_kernel: 2697; GFX9: ; %bb.0: ; %bb 2698; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24 2699; GFX9-NEXT: s_add_u32 flat_scratch_lo, s8, s13 2700; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 2701; GFX9-NEXT: s_mov_b32 s1, 0 2702; GFX9-NEXT: scratch_load_dword v0, off, s1 offset:4 glc 2703; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2704; GFX9-NEXT: s_lshl_b32 s1, s0, 2 2705; GFX9-NEXT: s_and_b32 s0, s0, 15 2706; GFX9-NEXT: v_mov_b32_e32 v0, 15 2707; GFX9-NEXT: s_addk_i32 s1, 0x4004 2708; GFX9-NEXT: 
s_lshl_b32 s0, s0, 2 2709; GFX9-NEXT: scratch_store_dword off, v0, s1 2710; GFX9-NEXT: s_waitcnt vmcnt(0) 2711; GFX9-NEXT: s_addk_i32 s0, 0x4004 2712; GFX9-NEXT: scratch_load_dword v0, off, s0 glc 2713; GFX9-NEXT: s_waitcnt vmcnt(0) 2714; GFX9-NEXT: s_endpgm 2715; 2716; GFX10-LABEL: store_load_sindex_large_offset_kernel: 2717; GFX10: ; %bb.0: ; %bb 2718; GFX10-NEXT: s_add_u32 s8, s8, s13 2719; GFX10-NEXT: s_addc_u32 s9, s9, 0 2720; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 2721; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 2722; GFX10-NEXT: s_load_dword s0, s[4:5], 0x24 2723; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc 2724; GFX10-NEXT: s_waitcnt vmcnt(0) 2725; GFX10-NEXT: v_mov_b32_e32 v0, 15 2726; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2727; GFX10-NEXT: s_and_b32 s1, s0, 15 2728; GFX10-NEXT: s_lshl_b32 s0, s0, 2 2729; GFX10-NEXT: s_lshl_b32 s1, s1, 2 2730; GFX10-NEXT: s_addk_i32 s0, 0x4004 2731; GFX10-NEXT: s_addk_i32 s1, 0x4004 2732; GFX10-NEXT: scratch_store_dword off, v0, s0 2733; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2734; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc 2735; GFX10-NEXT: s_waitcnt vmcnt(0) 2736; GFX10-NEXT: s_endpgm 2737; 2738; GFX11-LABEL: store_load_sindex_large_offset_kernel: 2739; GFX11: ; %bb.0: ; %bb 2740; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 2741; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc 2742; GFX11-NEXT: s_waitcnt vmcnt(0) 2743; GFX11-NEXT: v_mov_b32_e32 v0, 15 2744; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2745; GFX11-NEXT: s_and_b32 s1, s0, 15 2746; GFX11-NEXT: s_lshl_b32 s0, s0, 2 2747; GFX11-NEXT: s_lshl_b32 s1, s1, 2 2748; GFX11-NEXT: s_addk_i32 s0, 0x4004 2749; GFX11-NEXT: s_addk_i32 s1, 0x4004 2750; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc 2751; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 2752; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc 2753; GFX11-NEXT: s_waitcnt vmcnt(0) 2754; GFX11-NEXT: s_endpgm 2755; 2756; GFX12-LABEL: store_load_sindex_large_offset_kernel: 2757; 
GFX12: ; %bb.0: ; %bb 2758; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x24 2759; GFX12-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS 2760; GFX12-NEXT: s_wait_loadcnt 0x0 2761; GFX12-NEXT: v_mov_b32_e32 v0, 15 2762; GFX12-NEXT: s_wait_kmcnt 0x0 2763; GFX12-NEXT: s_and_b32 s1, s0, 15 2764; GFX12-NEXT: s_lshl_b32 s0, s0, 2 2765; GFX12-NEXT: s_lshl_b32 s1, s1, 2 2766; GFX12-NEXT: s_addk_co_i32 s0, 0x4000 2767; GFX12-NEXT: s_addk_co_i32 s1, 0x4000 2768; GFX12-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS 2769; GFX12-NEXT: s_wait_storecnt 0x0 2770; GFX12-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS 2771; GFX12-NEXT: s_wait_loadcnt 0x0 2772; GFX12-NEXT: s_endpgm 2773; 2774; GFX9-PAL-LABEL: store_load_sindex_large_offset_kernel: 2775; GFX9-PAL: ; %bb.0: ; %bb 2776; GFX9-PAL-NEXT: s_getpc_b64 s[12:13] 2777; GFX9-PAL-NEXT: s_mov_b32 s12, s0 2778; GFX9-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0 2779; GFX9-PAL-NEXT: s_mov_b32 s1, 0 2780; GFX9-PAL-NEXT: s_load_dword s0, s[4:5], 0x0 2781; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 2782; GFX9-PAL-NEXT: s_and_b32 s13, s13, 0xffff 2783; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s12, s11 2784; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 2785; GFX9-PAL-NEXT: scratch_load_dword v0, off, s1 offset:4 glc 2786; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2787; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 2788; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 2789; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 2790; GFX9-PAL-NEXT: s_addk_i32 s1, 0x4004 2791; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 2792; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 2793; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2794; GFX9-PAL-NEXT: s_addk_i32 s0, 0x4004 2795; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc 2796; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2797; GFX9-PAL-NEXT: s_endpgm 2798; 2799; GFX940-LABEL: store_load_sindex_large_offset_kernel: 2800; GFX940: ; %bb.0: ; %bb 2801; GFX940-NEXT: s_load_dword s0, s[4:5], 0x24 2802; GFX940-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1 2803; 
GFX940-NEXT: s_waitcnt vmcnt(0) 2804; GFX940-NEXT: v_mov_b32_e32 v0, 15 2805; GFX940-NEXT: s_waitcnt lgkmcnt(0) 2806; GFX940-NEXT: s_lshl_b32 s1, s0, 2 2807; GFX940-NEXT: s_and_b32 s0, s0, 15 2808; GFX940-NEXT: s_addk_i32 s1, 0x4004 2809; GFX940-NEXT: s_lshl_b32 s0, s0, 2 2810; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 2811; GFX940-NEXT: s_waitcnt vmcnt(0) 2812; GFX940-NEXT: s_addk_i32 s0, 0x4004 2813; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 2814; GFX940-NEXT: s_waitcnt vmcnt(0) 2815; GFX940-NEXT: s_endpgm 2816; 2817; GFX1010-PAL-LABEL: store_load_sindex_large_offset_kernel: 2818; GFX1010-PAL: ; %bb.0: ; %bb 2819; GFX1010-PAL-NEXT: s_getpc_b64 s[12:13] 2820; GFX1010-PAL-NEXT: s_mov_b32 s12, s0 2821; GFX1010-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0 2822; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 2823; GFX1010-PAL-NEXT: s_and_b32 s13, s13, 0xffff 2824; GFX1010-PAL-NEXT: s_add_u32 s12, s12, s11 2825; GFX1010-PAL-NEXT: s_addc_u32 s13, s13, 0 2826; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 2827; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 2828; GFX1010-PAL-NEXT: s_load_dword s0, s[4:5], 0x0 2829; GFX1010-PAL-NEXT: s_mov_b32 s1, 0 2830; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 offset:4 glc dlc 2831; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 2832; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 15 2833; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 2834; GFX1010-PAL-NEXT: s_and_b32 s1, s0, 15 2835; GFX1010-PAL-NEXT: s_lshl_b32 s0, s0, 2 2836; GFX1010-PAL-NEXT: s_lshl_b32 s1, s1, 2 2837; GFX1010-PAL-NEXT: s_addk_i32 s0, 0x4004 2838; GFX1010-PAL-NEXT: s_addk_i32 s1, 0x4004 2839; GFX1010-PAL-NEXT: scratch_store_dword off, v0, s0 2840; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2841; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc 2842; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 2843; GFX1010-PAL-NEXT: s_endpgm 2844; 2845; GFX1030-PAL-LABEL: store_load_sindex_large_offset_kernel: 2846; GFX1030-PAL: ; %bb.0: ; %bb 2847; 
GFX1030-PAL-NEXT: s_getpc_b64 s[12:13] 2848; GFX1030-PAL-NEXT: s_mov_b32 s12, s0 2849; GFX1030-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0 2850; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 2851; GFX1030-PAL-NEXT: s_and_b32 s13, s13, 0xffff 2852; GFX1030-PAL-NEXT: s_add_u32 s12, s12, s11 2853; GFX1030-PAL-NEXT: s_addc_u32 s13, s13, 0 2854; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 2855; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 2856; GFX1030-PAL-NEXT: s_load_dword s0, s[4:5], 0x0 2857; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc 2858; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 2859; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 15 2860; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 2861; GFX1030-PAL-NEXT: s_and_b32 s1, s0, 15 2862; GFX1030-PAL-NEXT: s_lshl_b32 s0, s0, 2 2863; GFX1030-PAL-NEXT: s_lshl_b32 s1, s1, 2 2864; GFX1030-PAL-NEXT: s_addk_i32 s0, 0x4004 2865; GFX1030-PAL-NEXT: s_addk_i32 s1, 0x4004 2866; GFX1030-PAL-NEXT: scratch_store_dword off, v0, s0 2867; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2868; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc 2869; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 2870; GFX1030-PAL-NEXT: s_endpgm 2871; 2872; GFX11-PAL-LABEL: store_load_sindex_large_offset_kernel: 2873; GFX11-PAL: ; %bb.0: ; %bb 2874; GFX11-PAL-NEXT: s_load_b32 s0, s[4:5], 0x0 2875; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc 2876; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 2877; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 15 2878; GFX11-PAL-NEXT: s_waitcnt lgkmcnt(0) 2879; GFX11-PAL-NEXT: s_and_b32 s1, s0, 15 2880; GFX11-PAL-NEXT: s_lshl_b32 s0, s0, 2 2881; GFX11-PAL-NEXT: s_lshl_b32 s1, s1, 2 2882; GFX11-PAL-NEXT: s_addk_i32 s0, 0x4004 2883; GFX11-PAL-NEXT: s_addk_i32 s1, 0x4004 2884; GFX11-PAL-NEXT: scratch_store_b32 off, v0, s0 dlc 2885; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2886; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc 2887; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 2888; GFX11-PAL-NEXT: s_endpgm 
2889; 2890; GFX12-PAL-LABEL: store_load_sindex_large_offset_kernel: 2891; GFX12-PAL: ; %bb.0: ; %bb 2892; GFX12-PAL-NEXT: s_load_b32 s0, s[4:5], 0x0 2893; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS 2894; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 2895; GFX12-PAL-NEXT: v_mov_b32_e32 v0, 15 2896; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 2897; GFX12-PAL-NEXT: s_and_b32 s1, s0, 15 2898; GFX12-PAL-NEXT: s_lshl_b32 s0, s0, 2 2899; GFX12-PAL-NEXT: s_lshl_b32 s1, s1, 2 2900; GFX12-PAL-NEXT: s_addk_co_i32 s0, 0x4000 2901; GFX12-PAL-NEXT: s_addk_co_i32 s1, 0x4000 2902; GFX12-PAL-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS 2903; GFX12-PAL-NEXT: s_wait_storecnt 0x0 2904; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS 2905; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 2906; GFX12-PAL-NEXT: s_endpgm 2907bb: 2908 %padding = alloca [4096 x i32], align 4, addrspace(5) ; 4096 x i32 = 16384 bytes of padding, which is why the checks for this kernel add 0x4004 (0x4000 for the gfx12 prefixes) to each scaled index 2909 %i = alloca [32 x float], align 4, addrspace(5) ; 32-element array that the store and the reload below index into 2910 %pad_gep = getelementptr inbounds [4096 x i32], ptr addrspace(5) %padding, i32 0, i32 undef ; undef index, so no specific element of the padding is requested 2911 %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4 ; volatile load whose result is never used - NOTE(review) presumably present only to keep the padding alloca referenced, confirm 2912 %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %idx ; slot chosen by the unmasked kernel argument 2913 store volatile i32 15, ptr addrspace(5) %i7, align 4 ; the constant 15 is what the checks materialise with v_mov_b32_e32 v0, 15 2914 %i9 = and i32 %idx, 15 ; reload index is the argument masked to 0..15 2915 %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9 ; slot chosen by the masked index 2916 %i12 = load volatile i32, ptr addrspace(5) %i10, align 4 ; volatile reload, its result %i12 is not used afterwards 2917 ret void 2918} 2919 2920define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) { 2921; GFX9-LABEL: store_load_sindex_large_offset_foo: 2922; GFX9: ; %bb.0: ; %bb 2923; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 2924; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 2925; GFX9-NEXT: s_mov_b32 s0, 0 2926; GFX9-NEXT: scratch_load_dword v0, off, s0 offset:4 glc 2927; GFX9-NEXT: s_waitcnt vmcnt(0) 2928; GFX9-NEXT: s_lshl_b32 s0, s2, 2 2929; GFX9-NEXT: s_addk_i32 s0, 0x4004 2930; GFX9-NEXT: v_mov_b32_e32 v0, 15 2931; GFX9-NEXT: scratch_store_dword off, v0, s0 
2932; GFX9-NEXT: s_waitcnt vmcnt(0) 2933; GFX9-NEXT: s_and_b32 s0, s2, 15 2934; GFX9-NEXT: s_lshl_b32 s0, s0, 2 2935; GFX9-NEXT: s_addk_i32 s0, 0x4004 2936; GFX9-NEXT: scratch_load_dword v0, off, s0 glc 2937; GFX9-NEXT: s_waitcnt vmcnt(0) 2938; GFX9-NEXT: s_endpgm 2939; 2940; GFX10-LABEL: store_load_sindex_large_offset_foo: 2941; GFX10: ; %bb.0: ; %bb 2942; GFX10-NEXT: s_add_u32 s0, s0, s3 2943; GFX10-NEXT: s_addc_u32 s1, s1, 0 2944; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 2945; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 2946; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc 2947; GFX10-NEXT: s_waitcnt vmcnt(0) 2948; GFX10-NEXT: v_mov_b32_e32 v0, 15 2949; GFX10-NEXT: s_and_b32 s0, s2, 15 2950; GFX10-NEXT: s_lshl_b32 s1, s2, 2 2951; GFX10-NEXT: s_lshl_b32 s0, s0, 2 2952; GFX10-NEXT: s_addk_i32 s1, 0x4004 2953; GFX10-NEXT: s_addk_i32 s0, 0x4004 2954; GFX10-NEXT: scratch_store_dword off, v0, s1 2955; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2956; GFX10-NEXT: scratch_load_dword v0, off, s0 glc dlc 2957; GFX10-NEXT: s_waitcnt vmcnt(0) 2958; GFX10-NEXT: s_endpgm 2959; 2960; GFX11-LABEL: store_load_sindex_large_offset_foo: 2961; GFX11: ; %bb.0: ; %bb 2962; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc 2963; GFX11-NEXT: s_waitcnt vmcnt(0) 2964; GFX11-NEXT: v_mov_b32_e32 v0, 15 2965; GFX11-NEXT: s_and_b32 s1, s0, 15 2966; GFX11-NEXT: s_lshl_b32 s0, s0, 2 2967; GFX11-NEXT: s_lshl_b32 s1, s1, 2 2968; GFX11-NEXT: s_addk_i32 s0, 0x4004 2969; GFX11-NEXT: s_addk_i32 s1, 0x4004 2970; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc 2971; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 2972; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc 2973; GFX11-NEXT: s_waitcnt vmcnt(0) 2974; GFX11-NEXT: s_endpgm 2975; 2976; GFX12-LABEL: store_load_sindex_large_offset_foo: 2977; GFX12: ; %bb.0: ; %bb 2978; GFX12-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS 2979; GFX12-NEXT: s_wait_loadcnt 0x0 2980; GFX12-NEXT: v_mov_b32_e32 v0, 15 2981; GFX12-NEXT: 
s_and_b32 s1, s0, 15 2982; GFX12-NEXT: s_lshl_b32 s0, s0, 2 2983; GFX12-NEXT: s_lshl_b32 s1, s1, 2 2984; GFX12-NEXT: s_addk_co_i32 s0, 0x4000 2985; GFX12-NEXT: s_addk_co_i32 s1, 0x4000 2986; GFX12-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS 2987; GFX12-NEXT: s_wait_storecnt 0x0 2988; GFX12-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS 2989; GFX12-NEXT: s_wait_loadcnt 0x0 2990; GFX12-NEXT: s_endpgm 2991; 2992; GFX9-PAL-LABEL: store_load_sindex_large_offset_foo: 2993; GFX9-PAL: ; %bb.0: ; %bb 2994; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 2995; GFX9-PAL-NEXT: s_mov_b32 s2, s0 2996; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 2997; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 2998; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 2999; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 3000; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 3001; GFX9-PAL-NEXT: s_mov_b32 s1, 0 3002; GFX9-PAL-NEXT: scratch_load_dword v0, off, s1 offset:4 glc 3003; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3004; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 3005; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 3006; GFX9-PAL-NEXT: s_addk_i32 s1, 0x4004 3007; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 3008; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 3009; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 3010; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3011; GFX9-PAL-NEXT: s_addk_i32 s0, 0x4004 3012; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc 3013; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3014; GFX9-PAL-NEXT: s_endpgm 3015; 3016; GFX940-LABEL: store_load_sindex_large_offset_foo: 3017; GFX940: ; %bb.0: ; %bb 3018; GFX940-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1 3019; GFX940-NEXT: s_waitcnt vmcnt(0) 3020; GFX940-NEXT: s_lshl_b32 s1, s0, 2 3021; GFX940-NEXT: s_and_b32 s0, s0, 15 3022; GFX940-NEXT: s_addk_i32 s1, 0x4004 3023; GFX940-NEXT: v_mov_b32_e32 v0, 15 3024; GFX940-NEXT: s_lshl_b32 s0, s0, 2 3025; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 3026; GFX940-NEXT: s_waitcnt vmcnt(0) 3027; GFX940-NEXT: s_addk_i32 s0, 0x4004 3028; 
GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 3029; GFX940-NEXT: s_waitcnt vmcnt(0) 3030; GFX940-NEXT: s_endpgm 3031; 3032; GFX1010-PAL-LABEL: store_load_sindex_large_offset_foo: 3033; GFX1010-PAL: ; %bb.0: ; %bb 3034; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] 3035; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 3036; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 3037; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 3038; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff 3039; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 3040; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 3041; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 3042; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 3043; GFX1010-PAL-NEXT: s_mov_b32 s1, 0 3044; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 offset:4 glc dlc 3045; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 3046; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 15 3047; GFX1010-PAL-NEXT: s_and_b32 s1, s0, 15 3048; GFX1010-PAL-NEXT: s_lshl_b32 s0, s0, 2 3049; GFX1010-PAL-NEXT: s_lshl_b32 s1, s1, 2 3050; GFX1010-PAL-NEXT: s_addk_i32 s0, 0x4004 3051; GFX1010-PAL-NEXT: s_addk_i32 s1, 0x4004 3052; GFX1010-PAL-NEXT: scratch_store_dword off, v0, s0 3053; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3054; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc 3055; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 3056; GFX1010-PAL-NEXT: s_endpgm 3057; 3058; GFX1030-PAL-LABEL: store_load_sindex_large_offset_foo: 3059; GFX1030-PAL: ; %bb.0: ; %bb 3060; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] 3061; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 3062; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 3063; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 3064; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff 3065; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 3066; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 3067; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 3068; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 3069; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc 3070; 
GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 3071; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 15 3072; GFX1030-PAL-NEXT: s_and_b32 s1, s0, 15 3073; GFX1030-PAL-NEXT: s_lshl_b32 s0, s0, 2 3074; GFX1030-PAL-NEXT: s_lshl_b32 s1, s1, 2 3075; GFX1030-PAL-NEXT: s_addk_i32 s0, 0x4004 3076; GFX1030-PAL-NEXT: s_addk_i32 s1, 0x4004 3077; GFX1030-PAL-NEXT: scratch_store_dword off, v0, s0 3078; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3079; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc 3080; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 3081; GFX1030-PAL-NEXT: s_endpgm 3082; 3083; GFX11-PAL-LABEL: store_load_sindex_large_offset_foo: 3084; GFX11-PAL: ; %bb.0: ; %bb 3085; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc 3086; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 3087; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 15 3088; GFX11-PAL-NEXT: s_and_b32 s1, s0, 15 3089; GFX11-PAL-NEXT: s_lshl_b32 s0, s0, 2 3090; GFX11-PAL-NEXT: s_lshl_b32 s1, s1, 2 3091; GFX11-PAL-NEXT: s_addk_i32 s0, 0x4004 3092; GFX11-PAL-NEXT: s_addk_i32 s1, 0x4004 3093; GFX11-PAL-NEXT: scratch_store_b32 off, v0, s0 dlc 3094; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3095; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc 3096; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 3097; GFX11-PAL-NEXT: s_endpgm 3098; 3099; GFX12-PAL-LABEL: store_load_sindex_large_offset_foo: 3100; GFX12-PAL: ; %bb.0: ; %bb 3101; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS 3102; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 3103; GFX12-PAL-NEXT: v_mov_b32_e32 v0, 15 3104; GFX12-PAL-NEXT: s_and_b32 s1, s0, 15 3105; GFX12-PAL-NEXT: s_lshl_b32 s0, s0, 2 3106; GFX12-PAL-NEXT: s_lshl_b32 s1, s1, 2 3107; GFX12-PAL-NEXT: s_addk_co_i32 s0, 0x4000 3108; GFX12-PAL-NEXT: s_addk_co_i32 s1, 0x4000 3109; GFX12-PAL-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS 3110; GFX12-PAL-NEXT: s_wait_storecnt 0x0 3111; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS 3112; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 3113; GFX12-PAL-NEXT: s_endpgm 3114bb: 3115 %padding 
= alloca [4096 x i32], align 4, addrspace(5) 3116 %i = alloca [32 x float], align 4, addrspace(5) 3117 %pad_gep = getelementptr inbounds [4096 x i32], ptr addrspace(5) %padding, i32 0, i32 undef 3118 %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4 3119 %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %idx 3120 store volatile i32 15, ptr addrspace(5) %i7, align 4 3121 %i9 = and i32 %idx, 15 3122 %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9 3123 %i12 = load volatile i32, ptr addrspace(5) %i10, align 4 3124 ret void 3125} 3126

; Kernel entry point with no arguments. A [4096 x i32] alloca (%padding) is
; declared ahead of the [32 x float] alloca (%i); the autogenerated checks
; below show %i being addressed from a 0x4000 / 0x4004 base. The body performs
; one volatile load from %padding, a volatile store of 15 to %i[tid] and a
; volatile load from %i[31 - tid], where tid is llvm.amdgcn.workitem.id.x.
; The assertions are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script rather than editing them by hand.
define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
; GFX9-LABEL: store_load_vindex_large_offset_kernel:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s8, s13
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: scratch_load_dword v1, off, s0 offset:4 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_add_u32_e32 v1, 0x4004, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 15
; GFX9-NEXT: scratch_store_dword v1, v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_u32_e32 v0, 0x4004, v0
; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: store_load_vindex_large_offset_kernel:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s8, s8, s13
; GFX10-NEXT: s_addc_u32 s9, s9, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v2, 15
; GFX10-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x4004, v0
; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0x4004, v0
; GFX10-NEXT: scratch_store_dword v1, v2, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_load_vindex_large_offset_kernel:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_movk_i32 s0, 0x4004
; GFX11-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v0, 0xffc, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_sub_nc_u32_e32 v2, 0x4004, v0
; GFX11-NEXT: scratch_store_b32 v0, v1, s0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: store_load_vindex_large_offset_kernel:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_and_b32_e32 v0, 0xffc, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_sub_nc_u32_e32 v2, 0x4000, v0
; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:16384 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_b32 v0, v2, off offset:124 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_endpgm
;
; GFX9-PAL-LABEL: store_load_vindex_large_offset_kernel:
; GFX9-PAL: ; %bb.0: ; %bb
; GFX9-PAL-NEXT: s_getpc_b64 s[12:13]
; GFX9-PAL-NEXT: s_mov_b32 s12, s0
; GFX9-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0
; GFX9-PAL-NEXT: s_mov_b32 s0, 0
; GFX9-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 15
; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT: s_and_b32 s13, s13, 0xffff
; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s12, s11
; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-PAL-NEXT: scratch_load_dword v1, off, s0 offset:4 glc
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: v_add_u32_e32 v1, 0x4004, v0
; GFX9-PAL-NEXT: scratch_store_dword v1, v2, off
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: v_sub_u32_e32 v0, 0x4004, v0
; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: s_endpgm
;
; GFX940-LABEL: store_load_vindex_large_offset_kernel:
; GFX940: ; %bb.0: ; %bb
; GFX940-NEXT: scratch_load_dword v1, off, off offset:4 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX940-NEXT: v_and_b32_e32 v0, 0xffc, v0
; GFX940-NEXT: v_mov_b32_e32 v1, 15
; GFX940-NEXT: s_movk_i32 s0, 0x4004
; GFX940-NEXT: scratch_store_dword v0, v1, s0 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_sub_u32_e32 v0, 0x4004, v0
; GFX940-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_endpgm
;
; GFX1010-PAL-LABEL: store_load_vindex_large_offset_kernel:
; GFX1010-PAL: ; %bb.0: ; %bb
; GFX1010-PAL-NEXT: s_getpc_b64 s[12:13]
; GFX1010-PAL-NEXT: s_mov_b32 s12, s0
; GFX1010-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0
; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1010-PAL-NEXT: s_and_b32 s13, s13, 0xffff
; GFX1010-PAL-NEXT: s_add_u32 s12, s12, s11
; GFX1010-PAL-NEXT: s_addc_u32 s13, s13, 0
; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX1010-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, 15
; GFX1010-PAL-NEXT: s_mov_b32 s0, 0
; GFX1010-PAL-NEXT: scratch_load_dword v3, off, s0 offset:4 glc dlc
; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v1, 0x4004, v0
; GFX1010-PAL-NEXT: v_sub_nc_u32_e32 v0, 0x4004, v0
; GFX1010-PAL-NEXT: scratch_store_dword v1, v2, off
; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc
; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT: s_endpgm
;
; GFX1030-PAL-LABEL: store_load_vindex_large_offset_kernel:
; GFX1030-PAL: ; %bb.0: ; %bb
; GFX1030-PAL-NEXT: s_getpc_b64 s[12:13]
; GFX1030-PAL-NEXT: s_mov_b32 s12, s0
; GFX1030-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0
; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030-PAL-NEXT: s_and_b32 s13, s13, 0xffff
; GFX1030-PAL-NEXT: s_add_u32 s12, s12, s11
; GFX1030-PAL-NEXT: s_addc_u32 s13, s13, 0
; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX1030-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, 15
; GFX1030-PAL-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc
; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT: v_add_nc_u32_e32 v1, 0x4004, v0
; GFX1030-PAL-NEXT: v_sub_nc_u32_e32 v0, 0x4004, v0
; GFX1030-PAL-NEXT: scratch_store_dword v1, v2, off
; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc
; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT: s_endpgm
;
; GFX11-PAL-LABEL: store_load_vindex_large_offset_kernel:
; GFX11-PAL: ; %bb.0: ; %bb
; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-PAL-NEXT: s_movk_i32 s0, 0x4004
; GFX11-PAL-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT: v_and_b32_e32 v0, 0xffc, v0
; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-PAL-NEXT: v_sub_nc_u32_e32 v2, 0x4004, v0
; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, s0 dlc
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT: s_endpgm
;
; GFX12-PAL-LABEL: store_load_vindex_large_offset_kernel:
; GFX12-PAL: ; %bb.0: ; %bb
; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-PAL-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
; GFX12-PAL-NEXT: v_and_b32_e32 v0, 0xffc, v0
; GFX12-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-PAL-NEXT: v_sub_nc_u32_e32 v2, 0x4000, v0
; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off offset:16384 scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
; GFX12-PAL-NEXT: scratch_load_b32 v0, v2, off offset:124 scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
; GFX12-PAL-NEXT: s_endpgm
bb:
  %padding = alloca [4096 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile read of %padding at an undef index; presumably this keeps the
  ; large alloca live so that %i sits behind it (verify before changing).
  %pad_gep = getelementptr inbounds [4096 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
  %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4
  %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
  ; Store to element tid, then reload from the mirrored element 31 - tid.
  %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i2
  store volatile i32 15, ptr addrspace(5) %i7, align 4
  %i9 = sub nsw i32 31, %i2
  %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9
  %i12 = load volatile i32, ptr addrspace(5) %i10, align 4
  ret void
}
3318 3319define void @store_load_vindex_large_offset_foo(i32 %idx) { 3320; GFX9-LABEL: store_load_vindex_large_offset_foo: 3321; GFX9: ; %bb.0: ; %bb 3322; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3323; GFX9-NEXT: scratch_load_dword v1, off,
s32 offset:4 glc 3324; GFX9-NEXT: s_waitcnt vmcnt(0) 3325; GFX9-NEXT: s_add_i32 s0, s32, 0x4004 3326; GFX9-NEXT: v_mov_b32_e32 v1, s0 3327; GFX9-NEXT: v_lshl_add_u32 v2, v0, 2, v1 3328; GFX9-NEXT: v_mov_b32_e32 v3, 15 3329; GFX9-NEXT: v_and_b32_e32 v0, 15, v0 3330; GFX9-NEXT: scratch_store_dword v2, v3, off 3331; GFX9-NEXT: s_waitcnt vmcnt(0) 3332; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 3333; GFX9-NEXT: scratch_load_dword v0, v0, off glc 3334; GFX9-NEXT: s_waitcnt vmcnt(0) 3335; GFX9-NEXT: s_setpc_b64 s[30:31] 3336; 3337; GFX10-LABEL: store_load_vindex_large_offset_foo: 3338; GFX10: ; %bb.0: ; %bb 3339; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3340; GFX10-NEXT: v_and_b32_e32 v1, 15, v0 3341; GFX10-NEXT: s_add_i32 s0, s32, 0x4004 3342; GFX10-NEXT: v_mov_b32_e32 v2, 15 3343; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, s0 3344; GFX10-NEXT: s_add_i32 s0, s32, 0x4004 3345; GFX10-NEXT: scratch_load_dword v3, off, s32 offset:4 glc dlc 3346; GFX10-NEXT: s_waitcnt vmcnt(0) 3347; GFX10-NEXT: v_lshl_add_u32 v1, v1, 2, s0 3348; GFX10-NEXT: scratch_store_dword v0, v2, off 3349; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3350; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc 3351; GFX10-NEXT: s_waitcnt vmcnt(0) 3352; GFX10-NEXT: s_setpc_b64 s[30:31] 3353; 3354; GFX11-LABEL: store_load_vindex_large_offset_foo: 3355; GFX11: ; %bb.0: ; %bb 3356; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3357; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 3358; GFX11-NEXT: s_add_i32 s0, s32, 0x4004 3359; GFX11-NEXT: scratch_load_b32 v3, off, s32 offset:4 glc dlc 3360; GFX11-NEXT: s_waitcnt vmcnt(0) 3361; GFX11-NEXT: v_lshl_add_u32 v0, v0, 2, s0 3362; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1 3363; GFX11-NEXT: s_add_i32 s0, s32, 0x4004 3364; GFX11-NEXT: scratch_store_b32 v0, v2, off dlc 3365; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3366; GFX11-NEXT: scratch_load_b32 v0, v1, s0 glc dlc 3367; GFX11-NEXT: s_waitcnt vmcnt(0) 3368; GFX11-NEXT: s_setpc_b64 s[30:31] 3369; 3370; 
GFX12-LABEL: store_load_vindex_large_offset_foo: 3371; GFX12: ; %bb.0: ; %bb 3372; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 3373; GFX12-NEXT: s_wait_expcnt 0x0 3374; GFX12-NEXT: s_wait_samplecnt 0x0 3375; GFX12-NEXT: s_wait_bvhcnt 0x0 3376; GFX12-NEXT: s_wait_kmcnt 0x0 3377; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 3378; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3379; GFX12-NEXT: scratch_load_b32 v3, off, s32 scope:SCOPE_SYS 3380; GFX12-NEXT: s_wait_loadcnt 0x0 3381; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1 3382; GFX12-NEXT: s_wait_storecnt 0x0 3383; GFX12-NEXT: scratch_store_b32 v0, v2, s32 offset:16384 scope:SCOPE_SYS 3384; GFX12-NEXT: s_wait_storecnt 0x0 3385; GFX12-NEXT: scratch_load_b32 v0, v1, s32 offset:16384 scope:SCOPE_SYS 3386; GFX12-NEXT: s_wait_loadcnt 0x0 3387; GFX12-NEXT: s_setpc_b64 s[30:31] 3388; 3389; GFX9-PAL-LABEL: store_load_vindex_large_offset_foo: 3390; GFX9-PAL: ; %bb.0: ; %bb 3391; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3392; GFX9-PAL-NEXT: scratch_load_dword v1, off, s32 offset:4 glc 3393; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3394; GFX9-PAL-NEXT: s_add_i32 s0, s32, 0x4004 3395; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s0 3396; GFX9-PAL-NEXT: v_lshl_add_u32 v2, v0, 2, v1 3397; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15 3398; GFX9-PAL-NEXT: v_and_b32_e32 v0, 15, v0 3399; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off 3400; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3401; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 3402; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off glc 3403; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3404; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 3405; 3406; GFX940-LABEL: store_load_vindex_large_offset_foo: 3407; GFX940: ; %bb.0: ; %bb 3408; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3409; GFX940-NEXT: scratch_load_dword v1, off, s32 offset:4 sc0 sc1 3410; GFX940-NEXT: s_waitcnt vmcnt(0) 3411; GFX940-NEXT: s_add_i32 s0, s32, 0x4004 3412; GFX940-NEXT: v_mov_b32_e32 v1, s0 3413; GFX940-NEXT: v_lshl_add_u32 v1, v0, 2, v1 3414; 
GFX940-NEXT: v_mov_b32_e32 v2, 15 3415; GFX940-NEXT: v_and_b32_e32 v0, 15, v0 3416; GFX940-NEXT: scratch_store_dword v1, v2, off sc0 sc1 3417; GFX940-NEXT: s_waitcnt vmcnt(0) 3418; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3419; GFX940-NEXT: s_add_i32 s0, s32, 0x4004 3420; GFX940-NEXT: scratch_load_dword v0, v0, s0 sc0 sc1 3421; GFX940-NEXT: s_waitcnt vmcnt(0) 3422; GFX940-NEXT: s_setpc_b64 s[30:31] 3423; 3424; GFX10-PAL-LABEL: store_load_vindex_large_offset_foo: 3425; GFX10-PAL: ; %bb.0: ; %bb 3426; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3427; GFX10-PAL-NEXT: v_and_b32_e32 v1, 15, v0 3428; GFX10-PAL-NEXT: s_add_i32 s0, s32, 0x4004 3429; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 15 3430; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, s0 3431; GFX10-PAL-NEXT: s_add_i32 s0, s32, 0x4004 3432; GFX10-PAL-NEXT: scratch_load_dword v3, off, s32 offset:4 glc dlc 3433; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 3434; GFX10-PAL-NEXT: v_lshl_add_u32 v1, v1, 2, s0 3435; GFX10-PAL-NEXT: scratch_store_dword v0, v2, off 3436; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3437; GFX10-PAL-NEXT: scratch_load_dword v0, v1, off glc dlc 3438; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 3439; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] 3440; 3441; GFX11-PAL-LABEL: store_load_vindex_large_offset_foo: 3442; GFX11-PAL: ; %bb.0: ; %bb 3443; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3444; GFX11-PAL-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 3445; GFX11-PAL-NEXT: s_add_i32 s0, s32, 0x4004 3446; GFX11-PAL-NEXT: scratch_load_b32 v3, off, s32 offset:4 glc dlc 3447; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 3448; GFX11-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, s0 3449; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v1, 2, v1 3450; GFX11-PAL-NEXT: s_add_i32 s0, s32, 0x4004 3451; GFX11-PAL-NEXT: scratch_store_b32 v0, v2, off dlc 3452; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3453; GFX11-PAL-NEXT: scratch_load_b32 v0, v1, s0 glc dlc 3454; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 3455; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] 3456; 
3457; GFX12-PAL-LABEL: store_load_vindex_large_offset_foo: 3458; GFX12-PAL: ; %bb.0: ; %bb 3459; GFX12-PAL-NEXT: s_wait_loadcnt_dscnt 0x0 3460; GFX12-PAL-NEXT: s_wait_expcnt 0x0 3461; GFX12-PAL-NEXT: s_wait_samplecnt 0x0 3462; GFX12-PAL-NEXT: s_wait_bvhcnt 0x0 3463; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 3464; GFX12-PAL-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 3465; GFX12-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3466; GFX12-PAL-NEXT: scratch_load_b32 v3, off, s32 scope:SCOPE_SYS 3467; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 3468; GFX12-PAL-NEXT: v_lshlrev_b32_e32 v1, 2, v1 3469; GFX12-PAL-NEXT: s_wait_storecnt 0x0 3470; GFX12-PAL-NEXT: scratch_store_b32 v0, v2, s32 offset:16384 scope:SCOPE_SYS 3471; GFX12-PAL-NEXT: s_wait_storecnt 0x0 3472; GFX12-PAL-NEXT: scratch_load_b32 v0, v1, s32 offset:16384 scope:SCOPE_SYS 3473; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 3474; GFX12-PAL-NEXT: s_setpc_b64 s[30:31] 3475bb: 3476 %padding = alloca [4096 x i32], align 4, addrspace(5) 3477 %i = alloca [32 x float], align 4, addrspace(5) 3478 %pad_gep = getelementptr inbounds [4096 x i32], ptr addrspace(5) %padding, i32 0, i32 undef 3479 %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4 3480 %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %idx 3481 store volatile i32 15, ptr addrspace(5) %i7, align 4 3482 %i9 = and i32 %idx, 15 3483 %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9 3484 %i12 = load volatile i32, ptr addrspace(5) %i10, align 4 3485 ret void 3486} 3487

; Kernel entry point with no arguments working on a single [4096 x i32]
; private (addrspace 5) array. It stores 13 through an undef index, then
; stores 15 to element 4000 and reloads element 4000; every access is
; volatile. Element 4000 of an i32 array lies 16000 bytes from the array
; start, and the checks below show how each target materialises that offset.
; The assertions are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script rather than editing them by hand.
define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX9-LABEL: store_load_large_imm_offset_kernel:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s8, s13
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
; GFX9-NEXT: v_mov_b32_e32 v0, 13
; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_movk_i32 s0, 0x3000
; GFX9-NEXT: s_add_i32 s0, s0, 4
; GFX9-NEXT: v_mov_b32_e32 v0, 15
; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:3712
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: scratch_load_dword v0, off, s0 offset:3712 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: store_load_large_imm_offset_kernel:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s8, s8, s13
; GFX10-NEXT: s_addc_u32 s9, s9, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
; GFX10-NEXT: v_mov_b32_e32 v0, 13
; GFX10-NEXT: v_mov_b32_e32 v1, 15
; GFX10-NEXT: s_movk_i32 s0, 0x3800
; GFX10-NEXT: s_add_i32 s0, s0, 4
; GFX10-NEXT: scratch_store_dword off, v0, off offset:4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_store_dword off, v1, s0 offset:1664
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_load_large_imm_offset_kernel:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3000
; GFX11-NEXT: v_mov_b32_e32 v2, 15
; GFX11-NEXT: scratch_store_b32 off, v0, off offset:4 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: scratch_store_b32 v1, v2, off offset:3716 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: scratch_load_b32 v0, v1, off offset:3716 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: store_load_large_imm_offset_kernel:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
; GFX12-NEXT: scratch_store_b32 off, v0, off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_store_b32 off, v1, off offset:16000 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_b32 v0, off, off offset:16000 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_endpgm
;
; GFX9-PAL-LABEL: store_load_large_imm_offset_kernel:
; GFX9-PAL: ; %bb.0: ; %bb
; GFX9-PAL-NEXT: s_getpc_b64 s[12:13]
; GFX9-PAL-NEXT: s_mov_b32 s12, s0
; GFX9-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0
; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 13
; GFX9-PAL-NEXT: s_mov_b32 s0, 0
; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT: s_and_b32 s13, s13, 0xffff
; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s12, s11
; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-PAL-NEXT: scratch_store_dword off, v0, s0 offset:4
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: s_movk_i32 s0, 0x3000
; GFX9-PAL-NEXT: s_add_i32 s0, s0, 4
; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15
; GFX9-PAL-NEXT: scratch_store_dword off, v0, s0 offset:3712
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 offset:3712 glc
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: s_endpgm
;
; GFX940-LABEL: store_load_large_imm_offset_kernel:
; GFX940: ; %bb.0: ; %bb
; GFX940-NEXT: v_mov_b32_e32 v0, 13
; GFX940-NEXT: scratch_store_dword off, v0, off offset:4 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v0, 0x3000
; GFX940-NEXT: v_mov_b32_e32 v1, 15
; GFX940-NEXT: scratch_store_dword v0, v1, off offset:3716 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: scratch_load_dword v0, v0, off offset:3716 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_endpgm
;
; GFX1010-PAL-LABEL: store_load_large_imm_offset_kernel:
; GFX1010-PAL: ; %bb.0: ; %bb
; GFX1010-PAL-NEXT: s_getpc_b64 s[12:13]
; GFX1010-PAL-NEXT: s_mov_b32 s12, s0
; GFX1010-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0
; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1010-PAL-NEXT: s_and_b32 s13, s13, 0xffff
; GFX1010-PAL-NEXT: s_add_u32 s12, s12, s11
; GFX1010-PAL-NEXT: s_addc_u32 s13, s13, 0
; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 13
; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 15
; GFX1010-PAL-NEXT: s_movk_i32 s0, 0x3800
; GFX1010-PAL-NEXT: s_mov_b32 s1, 0
; GFX1010-PAL-NEXT: s_add_i32 s0, s0, 4
; GFX1010-PAL-NEXT: scratch_store_dword off, v0, s1 offset:4
; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664
; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc
; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT: s_endpgm
;
; GFX1030-PAL-LABEL: store_load_large_imm_offset_kernel:
; GFX1030-PAL: ; %bb.0: ; %bb
; GFX1030-PAL-NEXT: s_getpc_b64 s[12:13]
; GFX1030-PAL-NEXT: s_mov_b32 s12, s0
; GFX1030-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0
; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030-PAL-NEXT: s_and_b32 s13, s13, 0xffff
; GFX1030-PAL-NEXT: s_add_u32 s12, s12, s11
; GFX1030-PAL-NEXT: s_addc_u32 s13, s13, 0
; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 13
; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 15
; GFX1030-PAL-NEXT: s_movk_i32 s0, 0x3800
; GFX1030-PAL-NEXT: s_add_i32 s0, s0, 4
; GFX1030-PAL-NEXT: scratch_store_dword off, v0, off offset:4
; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664
; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc
; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT: s_endpgm
;
; GFX11-PAL-LABEL: store_load_large_imm_offset_kernel:
; GFX11-PAL: ; %bb.0: ; %bb
; GFX11-PAL-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3000
; GFX11-PAL-NEXT: v_mov_b32_e32 v2, 15
; GFX11-PAL-NEXT: scratch_store_b32 off, v0, off offset:4 dlc
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT: scratch_store_b32 v1, v2, off offset:3716 dlc
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT: scratch_load_b32 v0, v1, off offset:3716 glc dlc
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT: s_endpgm
;
; GFX12-PAL-LABEL: store_load_large_imm_offset_kernel:
; GFX12-PAL: ; %bb.0: ; %bb
; GFX12-PAL-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
; GFX12-PAL-NEXT: scratch_store_b32 off, v0, off scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
; GFX12-PAL-NEXT: scratch_store_b32 off, v1, off offset:16000 scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off offset:16000 scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
; GFX12-PAL-NEXT: s_endpgm
bb:
  %buf = alloca [4096 x i32], align 4, addrspace(5)
  ; First store goes through an undef element index.
  %slot.any = getelementptr inbounds [4096 x i32], ptr addrspace(5) %buf, i32 0, i32 undef
  store volatile i32 13, ptr addrspace(5) %slot.any, align 4
  ; Store to, then reload from, element 4000 via two separate GEPs.
  %slot.store = getelementptr inbounds [4096 x i32], ptr addrspace(5) %buf, i32 0, i32 4000
  store volatile i32 15, ptr addrspace(5) %slot.store, align 4
  %slot.load = getelementptr inbounds [4096 x i32], ptr addrspace(5) %buf, i32 0, i32 4000
  %reloaded = load volatile i32, ptr addrspace(5) %slot.load, align 4
  ret void
}
3661 3662define void @store_load_large_imm_offset_foo() { 3663; GFX9-LABEL: store_load_large_imm_offset_foo: 3664; GFX9: ; %bb.0: ; %bb 3665; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3666; GFX9-NEXT: s_movk_i32 s0, 0x3000 3667; GFX9-NEXT: v_mov_b32_e32 v0, 13 3668; GFX9-NEXT: s_add_i32 s1, s32, s0
3669; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:4 3670; GFX9-NEXT: s_waitcnt vmcnt(0) 3671; GFX9-NEXT: s_add_i32 s0, s1, 4 3672; GFX9-NEXT: v_mov_b32_e32 v0, 15 3673; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:3712 3674; GFX9-NEXT: s_waitcnt vmcnt(0) 3675; GFX9-NEXT: scratch_load_dword v0, off, s0 offset:3712 glc 3676; GFX9-NEXT: s_waitcnt vmcnt(0) 3677; GFX9-NEXT: s_setpc_b64 s[30:31] 3678; 3679; GFX10-LABEL: store_load_large_imm_offset_foo: 3680; GFX10: ; %bb.0: ; %bb 3681; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3682; GFX10-NEXT: v_mov_b32_e32 v0, 13 3683; GFX10-NEXT: s_movk_i32 s0, 0x3800 3684; GFX10-NEXT: v_mov_b32_e32 v1, 15 3685; GFX10-NEXT: s_add_i32 s1, s32, s0 3686; GFX10-NEXT: s_add_i32 s0, s1, 4 3687; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:4 3688; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3689; GFX10-NEXT: scratch_store_dword off, v1, s0 offset:1664 3690; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3691; GFX10-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc 3692; GFX10-NEXT: s_waitcnt vmcnt(0) 3693; GFX10-NEXT: s_setpc_b64 s[30:31] 3694; 3695; GFX11-LABEL: store_load_large_imm_offset_foo: 3696; GFX11: ; %bb.0: ; %bb 3697; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3698; GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3000 3699; GFX11-NEXT: v_mov_b32_e32 v2, 15 3700; GFX11-NEXT: scratch_store_b32 off, v0, s32 offset:4 dlc 3701; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3702; GFX11-NEXT: scratch_store_b32 v1, v2, s32 offset:3716 dlc 3703; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3704; GFX11-NEXT: scratch_load_b32 v0, v1, s32 offset:3716 glc dlc 3705; GFX11-NEXT: s_waitcnt vmcnt(0) 3706; GFX11-NEXT: s_setpc_b64 s[30:31] 3707; 3708; GFX12-LABEL: store_load_large_imm_offset_foo: 3709; GFX12: ; %bb.0: ; %bb 3710; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 3711; GFX12-NEXT: s_wait_expcnt 0x0 3712; GFX12-NEXT: s_wait_samplecnt 0x0 3713; GFX12-NEXT: s_wait_bvhcnt 0x0 3714; GFX12-NEXT: s_wait_kmcnt 0x0 3715; 
GFX12-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15 3716; GFX12-NEXT: s_wait_storecnt 0x0 3717; GFX12-NEXT: scratch_store_b32 off, v0, s32 scope:SCOPE_SYS 3718; GFX12-NEXT: s_wait_storecnt 0x0 3719; GFX12-NEXT: scratch_store_b32 off, v1, s32 offset:16000 scope:SCOPE_SYS 3720; GFX12-NEXT: s_wait_storecnt 0x0 3721; GFX12-NEXT: scratch_load_b32 v0, off, s32 offset:16000 scope:SCOPE_SYS 3722; GFX12-NEXT: s_wait_loadcnt 0x0 3723; GFX12-NEXT: s_setpc_b64 s[30:31] 3724; 3725; GFX9-PAL-LABEL: store_load_large_imm_offset_foo: 3726; GFX9-PAL: ; %bb.0: ; %bb 3727; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3728; GFX9-PAL-NEXT: s_movk_i32 s0, 0x3000 3729; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 13 3730; GFX9-PAL-NEXT: s_add_i32 s1, s32, s0 3731; GFX9-PAL-NEXT: scratch_store_dword off, v0, s32 offset:4 3732; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3733; GFX9-PAL-NEXT: s_add_i32 s0, s1, 4 3734; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 3735; GFX9-PAL-NEXT: scratch_store_dword off, v0, s0 offset:3712 3736; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3737; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 offset:3712 glc 3738; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3739; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 3740; 3741; GFX940-LABEL: store_load_large_imm_offset_foo: 3742; GFX940: ; %bb.0: ; %bb 3743; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3744; GFX940-NEXT: v_mov_b32_e32 v0, 13 3745; GFX940-NEXT: scratch_store_dword off, v0, s32 offset:4 sc0 sc1 3746; GFX940-NEXT: s_waitcnt vmcnt(0) 3747; GFX940-NEXT: v_mov_b32_e32 v0, 0x3000 3748; GFX940-NEXT: v_mov_b32_e32 v1, 15 3749; GFX940-NEXT: scratch_store_dword v0, v1, s32 offset:3716 sc0 sc1 3750; GFX940-NEXT: s_waitcnt vmcnt(0) 3751; GFX940-NEXT: scratch_load_dword v0, v0, s32 offset:3716 sc0 sc1 3752; GFX940-NEXT: s_waitcnt vmcnt(0) 3753; GFX940-NEXT: s_setpc_b64 s[30:31] 3754; 3755; GFX10-PAL-LABEL: store_load_large_imm_offset_foo: 3756; GFX10-PAL: ; %bb.0: ; %bb 3757; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3758; 
GFX10-PAL-NEXT: v_mov_b32_e32 v0, 13 3759; GFX10-PAL-NEXT: s_movk_i32 s0, 0x3800 3760; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 3761; GFX10-PAL-NEXT: s_add_i32 s1, s32, s0 3762; GFX10-PAL-NEXT: s_add_i32 s0, s1, 4 3763; GFX10-PAL-NEXT: scratch_store_dword off, v0, s32 offset:4 3764; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3765; GFX10-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664 3766; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3767; GFX10-PAL-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc 3768; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 3769; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] 3770; 3771; GFX11-PAL-LABEL: store_load_large_imm_offset_foo: 3772; GFX11-PAL: ; %bb.0: ; %bb 3773; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3774; GFX11-PAL-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3000 3775; GFX11-PAL-NEXT: v_mov_b32_e32 v2, 15 3776; GFX11-PAL-NEXT: scratch_store_b32 off, v0, s32 offset:4 dlc 3777; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3778; GFX11-PAL-NEXT: scratch_store_b32 v1, v2, s32 offset:3716 dlc 3779; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3780; GFX11-PAL-NEXT: scratch_load_b32 v0, v1, s32 offset:3716 glc dlc 3781; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 3782; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] 3783; 3784; GFX12-PAL-LABEL: store_load_large_imm_offset_foo: 3785; GFX12-PAL: ; %bb.0: ; %bb 3786; GFX12-PAL-NEXT: s_wait_loadcnt_dscnt 0x0 3787; GFX12-PAL-NEXT: s_wait_expcnt 0x0 3788; GFX12-PAL-NEXT: s_wait_samplecnt 0x0 3789; GFX12-PAL-NEXT: s_wait_bvhcnt 0x0 3790; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 3791; GFX12-PAL-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15 3792; GFX12-PAL-NEXT: s_wait_storecnt 0x0 3793; GFX12-PAL-NEXT: scratch_store_b32 off, v0, s32 scope:SCOPE_SYS 3794; GFX12-PAL-NEXT: s_wait_storecnt 0x0 3795; GFX12-PAL-NEXT: scratch_store_b32 off, v1, s32 offset:16000 scope:SCOPE_SYS 3796; GFX12-PAL-NEXT: s_wait_storecnt 0x0 3797; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s32 offset:16000 scope:SCOPE_SYS 3798; 
GFX12-PAL-NEXT: s_wait_loadcnt 0x0 3799; GFX12-PAL-NEXT: s_setpc_b64 s[30:31] 3800bb: 3801 %i = alloca [4096 x i32], align 4, addrspace(5) 3802 %i1 = getelementptr inbounds [4096 x i32], ptr addrspace(5) %i, i32 0, i32 undef 3803 store volatile i32 13, ptr addrspace(5) %i1, align 4 3804 %i7 = getelementptr inbounds [4096 x i32], ptr addrspace(5) %i, i32 0, i32 4000 3805 store volatile i32 15, ptr addrspace(5) %i7, align 4 3806 %i10 = getelementptr inbounds [4096 x i32], ptr addrspace(5) %i, i32 0, i32 4000 3807 %i12 = load volatile i32, ptr addrspace(5) %i10, align 4 3808 ret void 3809} 3810 3811define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { 3812; GFX9-LABEL: store_load_vidx_sidx_offset: 3813; GFX9: ; %bb.0: ; %bb 3814; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24 3815; GFX9-NEXT: s_add_u32 flat_scratch_lo, s8, s13 3816; GFX9-NEXT: v_mov_b32_e32 v1, 0 3817; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 3818; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3819; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 3820; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 3821; GFX9-NEXT: v_mov_b32_e32 v1, 15 3822; GFX9-NEXT: scratch_store_dword v0, v1, off offset:1024 3823; GFX9-NEXT: s_waitcnt vmcnt(0) 3824; GFX9-NEXT: scratch_load_dword v0, v0, off offset:1024 glc 3825; GFX9-NEXT: s_waitcnt vmcnt(0) 3826; GFX9-NEXT: s_endpgm 3827; 3828; GFX10-LABEL: store_load_vidx_sidx_offset: 3829; GFX10: ; %bb.0: ; %bb 3830; GFX10-NEXT: s_add_u32 s8, s8, s13 3831; GFX10-NEXT: s_addc_u32 s9, s9, 0 3832; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 3833; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 3834; GFX10-NEXT: s_load_dword s0, s[4:5], 0x24 3835; GFX10-NEXT: v_mov_b32_e32 v1, 15 3836; GFX10-NEXT: s_waitcnt lgkmcnt(0) 3837; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0 3838; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, 0 3839; GFX10-NEXT: scratch_store_dword v0, v1, off offset:1024 3840; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3841; GFX10-NEXT: scratch_load_dword v0, v0, off offset:1024 glc dlc 
3842; GFX10-NEXT: s_waitcnt vmcnt(0) 3843; GFX10-NEXT: s_endpgm 3844; 3845; GFX11-LABEL: store_load_vidx_sidx_offset: 3846; GFX11: ; %bb.0: ; %bb 3847; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 3848; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0 3849; GFX11-NEXT: s_waitcnt lgkmcnt(0) 3850; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3851; GFX11-NEXT: v_add_nc_u32_e32 v0, s0, v0 3852; GFX11-NEXT: v_lshl_add_u32 v0, v0, 2, 0 3853; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:1024 dlc 3854; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3855; GFX11-NEXT: scratch_load_b32 v0, v0, off offset:1024 glc dlc 3856; GFX11-NEXT: s_waitcnt vmcnt(0) 3857; GFX11-NEXT: s_endpgm 3858; 3859; GFX12-LABEL: store_load_vidx_sidx_offset: 3860; GFX12: ; %bb.0: ; %bb 3861; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x24 3862; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0 3863; GFX12-NEXT: s_wait_kmcnt 0x0 3864; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 3865; GFX12-NEXT: v_add_lshl_u32 v0, s0, v0, 2 3866; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:1024 scope:SCOPE_SYS 3867; GFX12-NEXT: s_wait_storecnt 0x0 3868; GFX12-NEXT: scratch_load_b32 v0, v0, off offset:1024 scope:SCOPE_SYS 3869; GFX12-NEXT: s_wait_loadcnt 0x0 3870; GFX12-NEXT: s_endpgm 3871; 3872; GFX9-PAL-LABEL: store_load_vidx_sidx_offset: 3873; GFX9-PAL: ; %bb.0: ; %bb 3874; GFX9-PAL-NEXT: s_getpc_b64 s[12:13] 3875; GFX9-PAL-NEXT: s_mov_b32 s12, s0 3876; GFX9-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0 3877; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 0 3878; GFX9-PAL-NEXT: s_load_dword s0, s[4:5], 0x0 3879; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 3880; GFX9-PAL-NEXT: s_and_b32 s13, s13, 0xffff 3881; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s12, s11 3882; GFX9-PAL-NEXT: v_add_u32_e32 v0, s0, v0 3883; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 3884; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 3885; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 15 3886; GFX9-PAL-NEXT: 
scratch_store_dword v0, v1, off offset:1024 3887; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3888; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:1024 glc 3889; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3890; GFX9-PAL-NEXT: s_endpgm 3891; 3892; GFX940-LABEL: store_load_vidx_sidx_offset: 3893; GFX940: ; %bb.0: ; %bb 3894; GFX940-NEXT: s_load_dword s0, s[4:5], 0x24 3895; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 3896; GFX940-NEXT: v_mov_b32_e32 v1, 0 3897; GFX940-NEXT: s_waitcnt lgkmcnt(0) 3898; GFX940-NEXT: v_add_u32_e32 v0, s0, v0 3899; GFX940-NEXT: v_lshl_add_u32 v0, v0, 2, v1 3900; GFX940-NEXT: v_mov_b32_e32 v1, 15 3901; GFX940-NEXT: scratch_store_dword v0, v1, off offset:1024 sc0 sc1 3902; GFX940-NEXT: s_waitcnt vmcnt(0) 3903; GFX940-NEXT: scratch_load_dword v0, v0, off offset:1024 sc0 sc1 3904; GFX940-NEXT: s_waitcnt vmcnt(0) 3905; GFX940-NEXT: s_endpgm 3906; 3907; GFX10-PAL-LABEL: store_load_vidx_sidx_offset: 3908; GFX10-PAL: ; %bb.0: ; %bb 3909; GFX10-PAL-NEXT: s_getpc_b64 s[12:13] 3910; GFX10-PAL-NEXT: s_mov_b32 s12, s0 3911; GFX10-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0 3912; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 3913; GFX10-PAL-NEXT: s_and_b32 s13, s13, 0xffff 3914; GFX10-PAL-NEXT: s_add_u32 s12, s12, s11 3915; GFX10-PAL-NEXT: s_addc_u32 s13, s13, 0 3916; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 3917; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 3918; GFX10-PAL-NEXT: s_load_dword s0, s[4:5], 0x0 3919; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 3920; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 3921; GFX10-PAL-NEXT: v_add_nc_u32_e32 v0, s0, v0 3922; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, 0 3923; GFX10-PAL-NEXT: scratch_store_dword v0, v1, off offset:1024 3924; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3925; GFX10-PAL-NEXT: scratch_load_dword v0, v0, off offset:1024 glc dlc 3926; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 3927; GFX10-PAL-NEXT: s_endpgm 3928; 3929; GFX11-PAL-LABEL: store_load_vidx_sidx_offset: 3930; GFX11-PAL: ; %bb.0: ; %bb 3931; 
GFX11-PAL-NEXT: s_load_b32 s0, s[4:5], 0x0 3932; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0 3933; GFX11-PAL-NEXT: s_waitcnt lgkmcnt(0) 3934; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3935; GFX11-PAL-NEXT: v_add_nc_u32_e32 v0, s0, v0 3936; GFX11-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, 0 3937; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off offset:1024 dlc 3938; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3939; GFX11-PAL-NEXT: scratch_load_b32 v0, v0, off offset:1024 glc dlc 3940; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 3941; GFX11-PAL-NEXT: s_endpgm 3942; 3943; GFX12-PAL-LABEL: store_load_vidx_sidx_offset: 3944; GFX12-PAL: ; %bb.0: ; %bb 3945; GFX12-PAL-NEXT: s_load_b32 s0, s[4:5], 0x0 3946; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0 3947; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 3948; GFX12-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) 3949; GFX12-PAL-NEXT: v_add_lshl_u32 v0, s0, v0, 2 3950; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off offset:1024 scope:SCOPE_SYS 3951; GFX12-PAL-NEXT: s_wait_storecnt 0x0 3952; GFX12-PAL-NEXT: scratch_load_b32 v0, v0, off offset:1024 scope:SCOPE_SYS 3953; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 3954; GFX12-PAL-NEXT: s_endpgm 3955bb: 3956 %alloca = alloca [32 x i32], align 4, addrspace(5) 3957 %vidx = tail call i32 @llvm.amdgcn.workitem.id.x() 3958 %add1 = add nsw i32 %sidx, %vidx 3959 %add2 = add nsw i32 %add1, 256 3960 %gep = getelementptr inbounds [32 x i32], ptr addrspace(5) %alloca, i32 0, i32 %add2 3961 store volatile i32 15, ptr addrspace(5) %gep, align 4 3962 %load = load volatile i32, ptr addrspace(5) %gep, align 4 3963 ret void 3964} 3965 3966define void @store_load_i64_aligned(ptr addrspace(5) nocapture %arg) { 3967; GFX9-LABEL: store_load_i64_aligned: 3968; GFX9: ; %bb.0: ; %bb 3969; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3970; GFX9-NEXT: v_mov_b32_e32 v1, 15 3971; GFX9-NEXT: v_mov_b32_e32 v2, 0 3972; GFX9-NEXT: scratch_store_dwordx2 v0, 
v[1:2], off 3973; GFX9-NEXT: s_waitcnt vmcnt(0) 3974; GFX9-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc 3975; GFX9-NEXT: s_waitcnt vmcnt(0) 3976; GFX9-NEXT: s_setpc_b64 s[30:31] 3977; 3978; GFX10-LABEL: store_load_i64_aligned: 3979; GFX10: ; %bb.0: ; %bb 3980; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3981; GFX10-NEXT: v_mov_b32_e32 v1, 15 3982; GFX10-NEXT: v_mov_b32_e32 v2, 0 3983; GFX10-NEXT: scratch_store_dwordx2 v0, v[1:2], off 3984; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3985; GFX10-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc dlc 3986; GFX10-NEXT: s_waitcnt vmcnt(0) 3987; GFX10-NEXT: s_setpc_b64 s[30:31] 3988; 3989; GFX11-LABEL: store_load_i64_aligned: 3990; GFX11: ; %bb.0: ; %bb 3991; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3992; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0 3993; GFX11-NEXT: scratch_store_b64 v0, v[1:2], off dlc 3994; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3995; GFX11-NEXT: scratch_load_b64 v[0:1], v0, off glc dlc 3996; GFX11-NEXT: s_waitcnt vmcnt(0) 3997; GFX11-NEXT: s_setpc_b64 s[30:31] 3998; 3999; GFX12-LABEL: store_load_i64_aligned: 4000; GFX12: ; %bb.0: ; %bb 4001; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 4002; GFX12-NEXT: s_wait_expcnt 0x0 4003; GFX12-NEXT: s_wait_samplecnt 0x0 4004; GFX12-NEXT: s_wait_bvhcnt 0x0 4005; GFX12-NEXT: s_wait_kmcnt 0x0 4006; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0 4007; GFX12-NEXT: s_wait_storecnt 0x0 4008; GFX12-NEXT: scratch_store_b64 v0, v[1:2], off scope:SCOPE_SYS 4009; GFX12-NEXT: s_wait_storecnt 0x0 4010; GFX12-NEXT: scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS 4011; GFX12-NEXT: s_wait_loadcnt 0x0 4012; GFX12-NEXT: s_setpc_b64 s[30:31] 4013; 4014; GFX9-PAL-LABEL: store_load_i64_aligned: 4015; GFX9-PAL: ; %bb.0: ; %bb 4016; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4017; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 15 4018; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 0 4019; GFX9-PAL-NEXT: scratch_store_dwordx2 v0, v[1:2], off 4020; GFX9-PAL-NEXT: s_waitcnt 
vmcnt(0) 4021; GFX9-PAL-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc 4022; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 4023; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 4024; 4025; GFX940-LABEL: store_load_i64_aligned: 4026; GFX940: ; %bb.0: ; %bb 4027; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4028; GFX940-NEXT: v_mov_b32_e32 v2, 15 4029; GFX940-NEXT: v_mov_b32_e32 v3, 0 4030; GFX940-NEXT: scratch_store_dwordx2 v0, v[2:3], off sc0 sc1 4031; GFX940-NEXT: s_waitcnt vmcnt(0) 4032; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v0, off sc0 sc1 4033; GFX940-NEXT: s_waitcnt vmcnt(0) 4034; GFX940-NEXT: s_setpc_b64 s[30:31] 4035; 4036; GFX10-PAL-LABEL: store_load_i64_aligned: 4037; GFX10-PAL: ; %bb.0: ; %bb 4038; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4039; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 4040; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 0 4041; GFX10-PAL-NEXT: scratch_store_dwordx2 v0, v[1:2], off 4042; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 4043; GFX10-PAL-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc dlc 4044; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 4045; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] 4046; 4047; GFX11-PAL-LABEL: store_load_i64_aligned: 4048; GFX11-PAL: ; %bb.0: ; %bb 4049; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4050; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0 4051; GFX11-PAL-NEXT: scratch_store_b64 v0, v[1:2], off dlc 4052; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 4053; GFX11-PAL-NEXT: scratch_load_b64 v[0:1], v0, off glc dlc 4054; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 4055; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] 4056; 4057; GFX12-PAL-LABEL: store_load_i64_aligned: 4058; GFX12-PAL: ; %bb.0: ; %bb 4059; GFX12-PAL-NEXT: s_wait_loadcnt_dscnt 0x0 4060; GFX12-PAL-NEXT: s_wait_expcnt 0x0 4061; GFX12-PAL-NEXT: s_wait_samplecnt 0x0 4062; GFX12-PAL-NEXT: s_wait_bvhcnt 0x0 4063; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 4064; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0 4065; GFX12-PAL-NEXT: s_wait_storecnt 0x0 4066; 
GFX12-PAL-NEXT: scratch_store_b64 v0, v[1:2], off scope:SCOPE_SYS 4067; GFX12-PAL-NEXT: s_wait_storecnt 0x0 4068; GFX12-PAL-NEXT: scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS 4069; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 4070; GFX12-PAL-NEXT: s_setpc_b64 s[30:31] 4071bb: 4072 store volatile i64 15, ptr addrspace(5) %arg, align 8 4073 %load = load volatile i64, ptr addrspace(5) %arg, align 8 4074 ret void 4075} 4076 4077define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) { 4078; GFX9-LABEL: store_load_i64_unaligned: 4079; GFX9: ; %bb.0: ; %bb 4080; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4081; GFX9-NEXT: v_mov_b32_e32 v1, 15 4082; GFX9-NEXT: v_mov_b32_e32 v2, 0 4083; GFX9-NEXT: scratch_store_dwordx2 v0, v[1:2], off 4084; GFX9-NEXT: s_waitcnt vmcnt(0) 4085; GFX9-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc 4086; GFX9-NEXT: s_waitcnt vmcnt(0) 4087; GFX9-NEXT: s_setpc_b64 s[30:31] 4088; 4089; GFX10-LABEL: store_load_i64_unaligned: 4090; GFX10: ; %bb.0: ; %bb 4091; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4092; GFX10-NEXT: v_mov_b32_e32 v1, 15 4093; GFX10-NEXT: v_mov_b32_e32 v2, 0 4094; GFX10-NEXT: scratch_store_dwordx2 v0, v[1:2], off 4095; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 4096; GFX10-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc dlc 4097; GFX10-NEXT: s_waitcnt vmcnt(0) 4098; GFX10-NEXT: s_setpc_b64 s[30:31] 4099; 4100; GFX11-LABEL: store_load_i64_unaligned: 4101; GFX11: ; %bb.0: ; %bb 4102; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4103; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0 4104; GFX11-NEXT: scratch_store_b64 v0, v[1:2], off dlc 4105; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 4106; GFX11-NEXT: scratch_load_b64 v[0:1], v0, off glc dlc 4107; GFX11-NEXT: s_waitcnt vmcnt(0) 4108; GFX11-NEXT: s_setpc_b64 s[30:31] 4109; 4110; GFX12-LABEL: store_load_i64_unaligned: 4111; GFX12: ; %bb.0: ; %bb 4112; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 4113; GFX12-NEXT: s_wait_expcnt 0x0 4114; GFX12-NEXT: 
s_wait_samplecnt 0x0 4115; GFX12-NEXT: s_wait_bvhcnt 0x0 4116; GFX12-NEXT: s_wait_kmcnt 0x0 4117; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0 4118; GFX12-NEXT: s_wait_storecnt 0x0 4119; GFX12-NEXT: scratch_store_b64 v0, v[1:2], off scope:SCOPE_SYS 4120; GFX12-NEXT: s_wait_storecnt 0x0 4121; GFX12-NEXT: scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS 4122; GFX12-NEXT: s_wait_loadcnt 0x0 4123; GFX12-NEXT: s_setpc_b64 s[30:31] 4124; 4125; GFX9-PAL-LABEL: store_load_i64_unaligned: 4126; GFX9-PAL: ; %bb.0: ; %bb 4127; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4128; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 15 4129; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 0 4130; GFX9-PAL-NEXT: scratch_store_dwordx2 v0, v[1:2], off 4131; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 4132; GFX9-PAL-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc 4133; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 4134; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 4135; 4136; GFX940-LABEL: store_load_i64_unaligned: 4137; GFX940: ; %bb.0: ; %bb 4138; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4139; GFX940-NEXT: v_mov_b32_e32 v2, 15 4140; GFX940-NEXT: v_mov_b32_e32 v3, 0 4141; GFX940-NEXT: scratch_store_dwordx2 v0, v[2:3], off sc0 sc1 4142; GFX940-NEXT: s_waitcnt vmcnt(0) 4143; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v0, off sc0 sc1 4144; GFX940-NEXT: s_waitcnt vmcnt(0) 4145; GFX940-NEXT: s_setpc_b64 s[30:31] 4146; 4147; GFX10-PAL-LABEL: store_load_i64_unaligned: 4148; GFX10-PAL: ; %bb.0: ; %bb 4149; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4150; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 4151; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 0 4152; GFX10-PAL-NEXT: scratch_store_dwordx2 v0, v[1:2], off 4153; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 4154; GFX10-PAL-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc dlc 4155; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 4156; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] 4157; 4158; GFX11-PAL-LABEL: store_load_i64_unaligned: 4159; GFX11-PAL: ; %bb.0: ; %bb 4160; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) 4161; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0 4162; GFX11-PAL-NEXT: scratch_store_b64 v0, v[1:2], off dlc 4163; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 4164; GFX11-PAL-NEXT: scratch_load_b64 v[0:1], v0, off glc dlc 4165; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 4166; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] 4167; 4168; GFX12-PAL-LABEL: store_load_i64_unaligned: 4169; GFX12-PAL: ; %bb.0: ; %bb 4170; GFX12-PAL-NEXT: s_wait_loadcnt_dscnt 0x0 4171; GFX12-PAL-NEXT: s_wait_expcnt 0x0 4172; GFX12-PAL-NEXT: s_wait_samplecnt 0x0 4173; GFX12-PAL-NEXT: s_wait_bvhcnt 0x0 4174; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 4175; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0 4176; GFX12-PAL-NEXT: s_wait_storecnt 0x0 4177; GFX12-PAL-NEXT: scratch_store_b64 v0, v[1:2], off scope:SCOPE_SYS 4178; GFX12-PAL-NEXT: s_wait_storecnt 0x0 4179; GFX12-PAL-NEXT: scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS 4180; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 4181; GFX12-PAL-NEXT: s_setpc_b64 s[30:31] 4182bb: 4183 store volatile i64 15, ptr addrspace(5) %arg, align 1 4184 %load = load volatile i64, ptr addrspace(5) %arg, align 1 4185 ret void 4186} 4187 4188define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) { 4189; GFX9-LABEL: store_load_v3i32_unaligned: 4190; GFX9: ; %bb.0: ; %bb 4191; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4192; GFX9-NEXT: v_mov_b32_e32 v1, 1 4193; GFX9-NEXT: v_mov_b32_e32 v2, 2 4194; GFX9-NEXT: v_mov_b32_e32 v3, 3 4195; GFX9-NEXT: scratch_store_dwordx3 v0, v[1:3], off 4196; GFX9-NEXT: s_waitcnt vmcnt(0) 4197; GFX9-NEXT: scratch_load_dwordx3 v[0:2], v0, off glc 4198; GFX9-NEXT: s_waitcnt vmcnt(0) 4199; GFX9-NEXT: s_setpc_b64 s[30:31] 4200; 4201; GFX10-LABEL: store_load_v3i32_unaligned: 4202; GFX10: ; %bb.0: ; %bb 4203; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4204; GFX10-NEXT: v_mov_b32_e32 v1, 1 4205; GFX10-NEXT: v_mov_b32_e32 v2, 2 4206; GFX10-NEXT: v_mov_b32_e32 v3, 3 4207; GFX10-NEXT: 
scratch_store_dwordx3 v0, v[1:3], off 4208; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 4209; GFX10-NEXT: scratch_load_dwordx3 v[0:2], v0, off glc dlc 4210; GFX10-NEXT: s_waitcnt vmcnt(0) 4211; GFX10-NEXT: s_setpc_b64 s[30:31] 4212; 4213; GFX11-LABEL: store_load_v3i32_unaligned: 4214; GFX11: ; %bb.0: ; %bb 4215; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4216; GFX11-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 4217; GFX11-NEXT: v_mov_b32_e32 v3, 3 4218; GFX11-NEXT: scratch_store_b96 v0, v[1:3], off dlc 4219; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 4220; GFX11-NEXT: scratch_load_b96 v[0:2], v0, off glc dlc 4221; GFX11-NEXT: s_waitcnt vmcnt(0) 4222; GFX11-NEXT: s_setpc_b64 s[30:31] 4223; 4224; GFX12-LABEL: store_load_v3i32_unaligned: 4225; GFX12: ; %bb.0: ; %bb 4226; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 4227; GFX12-NEXT: s_wait_expcnt 0x0 4228; GFX12-NEXT: s_wait_samplecnt 0x0 4229; GFX12-NEXT: s_wait_bvhcnt 0x0 4230; GFX12-NEXT: s_wait_kmcnt 0x0 4231; GFX12-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 4232; GFX12-NEXT: v_mov_b32_e32 v3, 3 4233; GFX12-NEXT: s_wait_storecnt 0x0 4234; GFX12-NEXT: scratch_store_b96 v0, v[1:3], off scope:SCOPE_SYS 4235; GFX12-NEXT: s_wait_storecnt 0x0 4236; GFX12-NEXT: scratch_load_b96 v[0:2], v0, off scope:SCOPE_SYS 4237; GFX12-NEXT: s_wait_loadcnt 0x0 4238; GFX12-NEXT: s_setpc_b64 s[30:31] 4239; 4240; GFX9-PAL-LABEL: store_load_v3i32_unaligned: 4241; GFX9-PAL: ; %bb.0: ; %bb 4242; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4243; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 1 4244; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 2 4245; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 3 4246; GFX9-PAL-NEXT: scratch_store_dwordx3 v0, v[1:3], off 4247; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 4248; GFX9-PAL-NEXT: scratch_load_dwordx3 v[0:2], v0, off glc 4249; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 4250; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 4251; 4252; GFX940-LABEL: store_load_v3i32_unaligned: 4253; GFX940: ; %bb.0: ; %bb 4254; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) 4255; GFX940-NEXT: v_mov_b32_e32 v2, 1 4256; GFX940-NEXT: v_mov_b32_e32 v3, 2 4257; GFX940-NEXT: v_mov_b32_e32 v4, 3 4258; GFX940-NEXT: scratch_store_dwordx3 v0, v[2:4], off sc0 sc1 4259; GFX940-NEXT: s_waitcnt vmcnt(0) 4260; GFX940-NEXT: scratch_load_dwordx3 v[0:2], v0, off sc0 sc1 4261; GFX940-NEXT: s_waitcnt vmcnt(0) 4262; GFX940-NEXT: s_setpc_b64 s[30:31] 4263; 4264; GFX10-PAL-LABEL: store_load_v3i32_unaligned: 4265; GFX10-PAL: ; %bb.0: ; %bb 4266; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4267; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 1 4268; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 2 4269; GFX10-PAL-NEXT: v_mov_b32_e32 v3, 3 4270; GFX10-PAL-NEXT: scratch_store_dwordx3 v0, v[1:3], off 4271; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 4272; GFX10-PAL-NEXT: scratch_load_dwordx3 v[0:2], v0, off glc dlc 4273; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 4274; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] 4275; 4276; GFX11-PAL-LABEL: store_load_v3i32_unaligned: 4277; GFX11-PAL: ; %bb.0: ; %bb 4278; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4279; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 4280; GFX11-PAL-NEXT: v_mov_b32_e32 v3, 3 4281; GFX11-PAL-NEXT: scratch_store_b96 v0, v[1:3], off dlc 4282; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 4283; GFX11-PAL-NEXT: scratch_load_b96 v[0:2], v0, off glc dlc 4284; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 4285; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] 4286; 4287; GFX12-PAL-LABEL: store_load_v3i32_unaligned: 4288; GFX12-PAL: ; %bb.0: ; %bb 4289; GFX12-PAL-NEXT: s_wait_loadcnt_dscnt 0x0 4290; GFX12-PAL-NEXT: s_wait_expcnt 0x0 4291; GFX12-PAL-NEXT: s_wait_samplecnt 0x0 4292; GFX12-PAL-NEXT: s_wait_bvhcnt 0x0 4293; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 4294; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 4295; GFX12-PAL-NEXT: v_mov_b32_e32 v3, 3 4296; GFX12-PAL-NEXT: s_wait_storecnt 0x0 4297; GFX12-PAL-NEXT: scratch_store_b96 v0, v[1:3], off scope:SCOPE_SYS 4298; GFX12-PAL-NEXT: s_wait_storecnt 0x0 4299; 
GFX12-PAL-NEXT: scratch_load_b96 v[0:2], v0, off scope:SCOPE_SYS 4300; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 4301; GFX12-PAL-NEXT: s_setpc_b64 s[30:31] 4302bb: 4303 store volatile <3 x i32> <i32 1, i32 2, i32 3>, ptr addrspace(5) %arg, align 1 4304 %load = load volatile <3 x i32>, ptr addrspace(5) %arg, align 1 4305 ret void 4306} 4307 4308define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) { 4309; GFX9-LABEL: store_load_v4i32_unaligned: 4310; GFX9: ; %bb.0: ; %bb 4311; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4312; GFX9-NEXT: v_mov_b32_e32 v1, 1 4313; GFX9-NEXT: v_mov_b32_e32 v2, 2 4314; GFX9-NEXT: v_mov_b32_e32 v3, 3 4315; GFX9-NEXT: v_mov_b32_e32 v4, 4 4316; GFX9-NEXT: scratch_store_dwordx4 v0, v[1:4], off 4317; GFX9-NEXT: s_waitcnt vmcnt(0) 4318; GFX9-NEXT: scratch_load_dwordx4 v[0:3], v0, off glc 4319; GFX9-NEXT: s_waitcnt vmcnt(0) 4320; GFX9-NEXT: s_setpc_b64 s[30:31] 4321; 4322; GFX10-LABEL: store_load_v4i32_unaligned: 4323; GFX10: ; %bb.0: ; %bb 4324; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4325; GFX10-NEXT: v_mov_b32_e32 v1, 1 4326; GFX10-NEXT: v_mov_b32_e32 v2, 2 4327; GFX10-NEXT: v_mov_b32_e32 v3, 3 4328; GFX10-NEXT: v_mov_b32_e32 v4, 4 4329; GFX10-NEXT: scratch_store_dwordx4 v0, v[1:4], off 4330; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 4331; GFX10-NEXT: scratch_load_dwordx4 v[0:3], v0, off glc dlc 4332; GFX10-NEXT: s_waitcnt vmcnt(0) 4333; GFX10-NEXT: s_setpc_b64 s[30:31] 4334; 4335; GFX11-LABEL: store_load_v4i32_unaligned: 4336; GFX11: ; %bb.0: ; %bb 4337; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4338; GFX11-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 4339; GFX11-NEXT: v_dual_mov_b32 v3, 3 :: v_dual_mov_b32 v4, 4 4340; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off dlc 4341; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 4342; GFX11-NEXT: scratch_load_b128 v[0:3], v0, off glc dlc 4343; GFX11-NEXT: s_waitcnt vmcnt(0) 4344; GFX11-NEXT: s_setpc_b64 s[30:31] 4345; 4346; GFX12-LABEL: store_load_v4i32_unaligned: 
4347; GFX12: ; %bb.0: ; %bb 4348; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 4349; GFX12-NEXT: s_wait_expcnt 0x0 4350; GFX12-NEXT: s_wait_samplecnt 0x0 4351; GFX12-NEXT: s_wait_bvhcnt 0x0 4352; GFX12-NEXT: s_wait_kmcnt 0x0 4353; GFX12-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 4354; GFX12-NEXT: v_dual_mov_b32 v3, 3 :: v_dual_mov_b32 v4, 4 4355; GFX12-NEXT: s_wait_storecnt 0x0 4356; GFX12-NEXT: scratch_store_b128 v0, v[1:4], off scope:SCOPE_SYS 4357; GFX12-NEXT: s_wait_storecnt 0x0 4358; GFX12-NEXT: scratch_load_b128 v[0:3], v0, off scope:SCOPE_SYS 4359; GFX12-NEXT: s_wait_loadcnt 0x0 4360; GFX12-NEXT: s_setpc_b64 s[30:31] 4361; 4362; GFX9-PAL-LABEL: store_load_v4i32_unaligned: 4363; GFX9-PAL: ; %bb.0: ; %bb 4364; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4365; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 1 4366; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 2 4367; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 3 4368; GFX9-PAL-NEXT: v_mov_b32_e32 v4, 4 4369; GFX9-PAL-NEXT: scratch_store_dwordx4 v0, v[1:4], off 4370; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 4371; GFX9-PAL-NEXT: scratch_load_dwordx4 v[0:3], v0, off glc 4372; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 4373; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 4374; 4375; GFX940-LABEL: store_load_v4i32_unaligned: 4376; GFX940: ; %bb.0: ; %bb 4377; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4378; GFX940-NEXT: v_mov_b32_e32 v2, 1 4379; GFX940-NEXT: v_mov_b32_e32 v3, 2 4380; GFX940-NEXT: v_mov_b32_e32 v4, 3 4381; GFX940-NEXT: v_mov_b32_e32 v5, 4 4382; GFX940-NEXT: scratch_store_dwordx4 v0, v[2:5], off sc0 sc1 4383; GFX940-NEXT: s_waitcnt vmcnt(0) 4384; GFX940-NEXT: scratch_load_dwordx4 v[0:3], v0, off sc0 sc1 4385; GFX940-NEXT: s_waitcnt vmcnt(0) 4386; GFX940-NEXT: s_setpc_b64 s[30:31] 4387; 4388; GFX10-PAL-LABEL: store_load_v4i32_unaligned: 4389; GFX10-PAL: ; %bb.0: ; %bb 4390; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4391; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 1 4392; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 2 4393; GFX10-PAL-NEXT: 
v_mov_b32_e32 v3, 3 4394; GFX10-PAL-NEXT: v_mov_b32_e32 v4, 4 4395; GFX10-PAL-NEXT: scratch_store_dwordx4 v0, v[1:4], off 4396; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 4397; GFX10-PAL-NEXT: scratch_load_dwordx4 v[0:3], v0, off glc dlc 4398; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 4399; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] 4400; 4401; GFX11-PAL-LABEL: store_load_v4i32_unaligned: 4402; GFX11-PAL: ; %bb.0: ; %bb 4403; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4404; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 4405; GFX11-PAL-NEXT: v_dual_mov_b32 v3, 3 :: v_dual_mov_b32 v4, 4 4406; GFX11-PAL-NEXT: scratch_store_b128 v0, v[1:4], off dlc 4407; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 4408; GFX11-PAL-NEXT: scratch_load_b128 v[0:3], v0, off glc dlc 4409; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 4410; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] 4411; 4412; GFX12-PAL-LABEL: store_load_v4i32_unaligned: 4413; GFX12-PAL: ; %bb.0: ; %bb 4414; GFX12-PAL-NEXT: s_wait_loadcnt_dscnt 0x0 4415; GFX12-PAL-NEXT: s_wait_expcnt 0x0 4416; GFX12-PAL-NEXT: s_wait_samplecnt 0x0 4417; GFX12-PAL-NEXT: s_wait_bvhcnt 0x0 4418; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 4419; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 4420; GFX12-PAL-NEXT: v_dual_mov_b32 v3, 3 :: v_dual_mov_b32 v4, 4 4421; GFX12-PAL-NEXT: s_wait_storecnt 0x0 4422; GFX12-PAL-NEXT: scratch_store_b128 v0, v[1:4], off scope:SCOPE_SYS 4423; GFX12-PAL-NEXT: s_wait_storecnt 0x0 4424; GFX12-PAL-NEXT: scratch_load_b128 v[0:3], v0, off scope:SCOPE_SYS 4425; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 4426; GFX12-PAL-NEXT: s_setpc_b64 s[30:31] 4427bb: 4428 store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr addrspace(5) %arg, align 1 4429 %load = load volatile <4 x i32>, ptr addrspace(5) %arg, align 1 4430 ret void 4431} 4432 4433define void @store_load_i32_negative_unaligned(ptr addrspace(5) nocapture %arg) { 4434; GFX9-LABEL: store_load_i32_negative_unaligned: 4435; GFX9: ; %bb.0: ; %bb 4436; GFX9-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) 4437; GFX9-NEXT: v_add_u32_e32 v0, -1, v0 4438; GFX9-NEXT: v_mov_b32_e32 v1, 1 4439; GFX9-NEXT: scratch_store_byte v0, v1, off 4440; GFX9-NEXT: s_waitcnt vmcnt(0) 4441; GFX9-NEXT: scratch_load_ubyte v0, v0, off glc 4442; GFX9-NEXT: s_waitcnt vmcnt(0) 4443; GFX9-NEXT: s_setpc_b64 s[30:31] 4444; 4445; GFX10-LABEL: store_load_i32_negative_unaligned: 4446; GFX10: ; %bb.0: ; %bb 4447; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4448; GFX10-NEXT: v_mov_b32_e32 v1, 1 4449; GFX10-NEXT: scratch_store_byte v0, v1, off offset:-1 4450; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 4451; GFX10-NEXT: scratch_load_ubyte v0, v0, off offset:-1 glc dlc 4452; GFX10-NEXT: s_waitcnt vmcnt(0) 4453; GFX10-NEXT: s_setpc_b64 s[30:31] 4454; 4455; GFX11-LABEL: store_load_i32_negative_unaligned: 4456; GFX11: ; %bb.0: ; %bb 4457; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4458; GFX11-NEXT: v_mov_b32_e32 v1, 1 4459; GFX11-NEXT: scratch_store_b8 v0, v1, off offset:-1 dlc 4460; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 4461; GFX11-NEXT: scratch_load_u8 v0, v0, off offset:-1 glc dlc 4462; GFX11-NEXT: s_waitcnt vmcnt(0) 4463; GFX11-NEXT: s_setpc_b64 s[30:31] 4464; 4465; GFX12-LABEL: store_load_i32_negative_unaligned: 4466; GFX12: ; %bb.0: ; %bb 4467; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 4468; GFX12-NEXT: s_wait_expcnt 0x0 4469; GFX12-NEXT: s_wait_samplecnt 0x0 4470; GFX12-NEXT: s_wait_bvhcnt 0x0 4471; GFX12-NEXT: s_wait_kmcnt 0x0 4472; GFX12-NEXT: v_mov_b32_e32 v1, 1 4473; GFX12-NEXT: s_wait_storecnt 0x0 4474; GFX12-NEXT: scratch_store_b8 v0, v1, off offset:-1 scope:SCOPE_SYS 4475; GFX12-NEXT: s_wait_storecnt 0x0 4476; GFX12-NEXT: scratch_load_u8 v0, v0, off offset:-1 scope:SCOPE_SYS 4477; GFX12-NEXT: s_wait_loadcnt 0x0 4478; GFX12-NEXT: s_setpc_b64 s[30:31] 4479; 4480; GFX9-PAL-LABEL: store_load_i32_negative_unaligned: 4481; GFX9-PAL: ; %bb.0: ; %bb 4482; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4483; GFX9-PAL-NEXT: v_add_u32_e32 v0, -1, v0 4484; 
GFX9-PAL-NEXT: v_mov_b32_e32 v1, 1 4485; GFX9-PAL-NEXT: scratch_store_byte v0, v1, off 4486; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 4487; GFX9-PAL-NEXT: scratch_load_ubyte v0, v0, off glc 4488; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 4489; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 4490; 4491; GFX940-LABEL: store_load_i32_negative_unaligned: 4492; GFX940: ; %bb.0: ; %bb 4493; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4494; GFX940-NEXT: v_add_u32_e32 v0, -1, v0 4495; GFX940-NEXT: v_mov_b32_e32 v1, 1 4496; GFX940-NEXT: scratch_store_byte v0, v1, off sc0 sc1 4497; GFX940-NEXT: s_waitcnt vmcnt(0) 4498; GFX940-NEXT: scratch_load_ubyte v0, v0, off sc0 sc1 4499; GFX940-NEXT: s_waitcnt vmcnt(0) 4500; GFX940-NEXT: s_setpc_b64 s[30:31] 4501; 4502; GFX1010-PAL-LABEL: store_load_i32_negative_unaligned: 4503; GFX1010-PAL: ; %bb.0: ; %bb 4504; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4505; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v0, -1, v0 4506; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 1 4507; GFX1010-PAL-NEXT: scratch_store_byte v0, v1, off 4508; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 4509; GFX1010-PAL-NEXT: scratch_load_ubyte v0, v0, off glc dlc 4510; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 4511; GFX1010-PAL-NEXT: s_setpc_b64 s[30:31] 4512; 4513; GFX1030-PAL-LABEL: store_load_i32_negative_unaligned: 4514; GFX1030-PAL: ; %bb.0: ; %bb 4515; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4516; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 1 4517; GFX1030-PAL-NEXT: scratch_store_byte v0, v1, off offset:-1 4518; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 4519; GFX1030-PAL-NEXT: scratch_load_ubyte v0, v0, off offset:-1 glc dlc 4520; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 4521; GFX1030-PAL-NEXT: s_setpc_b64 s[30:31] 4522; 4523; GFX11-PAL-LABEL: store_load_i32_negative_unaligned: 4524; GFX11-PAL: ; %bb.0: ; %bb 4525; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4526; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 1 4527; GFX11-PAL-NEXT: scratch_store_b8 v0, v1, off offset:-1 dlc 4528; 
GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 4529; GFX11-PAL-NEXT: scratch_load_u8 v0, v0, off offset:-1 glc dlc 4530; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 4531; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] 4532; 4533; GFX12-PAL-LABEL: store_load_i32_negative_unaligned: 4534; GFX12-PAL: ; %bb.0: ; %bb 4535; GFX12-PAL-NEXT: s_wait_loadcnt_dscnt 0x0 4536; GFX12-PAL-NEXT: s_wait_expcnt 0x0 4537; GFX12-PAL-NEXT: s_wait_samplecnt 0x0 4538; GFX12-PAL-NEXT: s_wait_bvhcnt 0x0 4539; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 4540; GFX12-PAL-NEXT: v_mov_b32_e32 v1, 1 4541; GFX12-PAL-NEXT: s_wait_storecnt 0x0 4542; GFX12-PAL-NEXT: scratch_store_b8 v0, v1, off offset:-1 scope:SCOPE_SYS 4543; GFX12-PAL-NEXT: s_wait_storecnt 0x0 4544; GFX12-PAL-NEXT: scratch_load_u8 v0, v0, off offset:-1 scope:SCOPE_SYS 4545; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 4546; GFX12-PAL-NEXT: s_setpc_b64 s[30:31] 4547bb: 4548 %ptr = getelementptr inbounds i8, ptr addrspace(5) %arg, i32 -1 4549 store volatile i8 1, ptr addrspace(5) %ptr, align 1 4550 %load = load volatile i8, ptr addrspace(5) %ptr, align 1 4551 ret void 4552} 4553 4554define void @store_load_i32_large_negative_unaligned(ptr addrspace(5) nocapture %arg) { 4555; GFX9-LABEL: store_load_i32_large_negative_unaligned: 4556; GFX9: ; %bb.0: ; %bb 4557; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4558; GFX9-NEXT: v_add_u32_e32 v0, 0xffffef7f, v0 4559; GFX9-NEXT: v_mov_b32_e32 v1, 1 4560; GFX9-NEXT: scratch_store_byte v0, v1, off 4561; GFX9-NEXT: s_waitcnt vmcnt(0) 4562; GFX9-NEXT: scratch_load_ubyte v0, v0, off glc 4563; GFX9-NEXT: s_waitcnt vmcnt(0) 4564; GFX9-NEXT: s_setpc_b64 s[30:31] 4565; 4566; GFX10-LABEL: store_load_i32_large_negative_unaligned: 4567; GFX10: ; %bb.0: ; %bb 4568; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4569; GFX10-NEXT: v_add_nc_u32_e32 v0, 0xfffff000, v0 4570; GFX10-NEXT: v_mov_b32_e32 v1, 1 4571; GFX10-NEXT: scratch_store_byte v0, v1, off offset:-129 4572; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 4573; GFX10-NEXT: 
scratch_load_ubyte v0, v0, off offset:-129 glc dlc 4574; GFX10-NEXT: s_waitcnt vmcnt(0) 4575; GFX10-NEXT: s_setpc_b64 s[30:31] 4576; 4577; GFX11-LABEL: store_load_i32_large_negative_unaligned: 4578; GFX11: ; %bb.0: ; %bb 4579; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4580; GFX11-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_add_nc_u32 v0, 0xfffff000, v0 4581; GFX11-NEXT: scratch_store_b8 v0, v1, off offset:-129 dlc 4582; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 4583; GFX11-NEXT: scratch_load_u8 v0, v0, off offset:-129 glc dlc 4584; GFX11-NEXT: s_waitcnt vmcnt(0) 4585; GFX11-NEXT: s_setpc_b64 s[30:31] 4586; 4587; GFX12-LABEL: store_load_i32_large_negative_unaligned: 4588; GFX12: ; %bb.0: ; %bb 4589; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 4590; GFX12-NEXT: s_wait_expcnt 0x0 4591; GFX12-NEXT: s_wait_samplecnt 0x0 4592; GFX12-NEXT: s_wait_bvhcnt 0x0 4593; GFX12-NEXT: s_wait_kmcnt 0x0 4594; GFX12-NEXT: v_mov_b32_e32 v1, 1 4595; GFX12-NEXT: s_wait_storecnt 0x0 4596; GFX12-NEXT: scratch_store_b8 v0, v1, off offset:-4225 scope:SCOPE_SYS 4597; GFX12-NEXT: s_wait_storecnt 0x0 4598; GFX12-NEXT: scratch_load_u8 v0, v0, off offset:-4225 scope:SCOPE_SYS 4599; GFX12-NEXT: s_wait_loadcnt 0x0 4600; GFX12-NEXT: s_setpc_b64 s[30:31] 4601; 4602; GFX9-PAL-LABEL: store_load_i32_large_negative_unaligned: 4603; GFX9-PAL: ; %bb.0: ; %bb 4604; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4605; GFX9-PAL-NEXT: v_add_u32_e32 v0, 0xffffef7f, v0 4606; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 1 4607; GFX9-PAL-NEXT: scratch_store_byte v0, v1, off 4608; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 4609; GFX9-PAL-NEXT: scratch_load_ubyte v0, v0, off glc 4610; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 4611; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 4612; 4613; GFX940-LABEL: store_load_i32_large_negative_unaligned: 4614; GFX940: ; %bb.0: ; %bb 4615; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4616; GFX940-NEXT: v_add_u32_e32 v0, 0xffffef7f, v0 4617; GFX940-NEXT: v_mov_b32_e32 v1, 1 4618; GFX940-NEXT: 
scratch_store_byte v0, v1, off sc0 sc1 4619; GFX940-NEXT: s_waitcnt vmcnt(0) 4620; GFX940-NEXT: scratch_load_ubyte v0, v0, off sc0 sc1 4621; GFX940-NEXT: s_waitcnt vmcnt(0) 4622; GFX940-NEXT: s_setpc_b64 s[30:31] 4623; 4624; GFX1010-PAL-LABEL: store_load_i32_large_negative_unaligned: 4625; GFX1010-PAL: ; %bb.0: ; %bb 4626; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4627; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v0, 0xffffefff, v0 4628; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 1 4629; GFX1010-PAL-NEXT: scratch_store_byte v0, v1, off offset:-128 4630; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 4631; GFX1010-PAL-NEXT: scratch_load_ubyte v0, v0, off offset:-128 glc dlc 4632; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 4633; GFX1010-PAL-NEXT: s_setpc_b64 s[30:31] 4634; 4635; GFX1030-PAL-LABEL: store_load_i32_large_negative_unaligned: 4636; GFX1030-PAL: ; %bb.0: ; %bb 4637; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4638; GFX1030-PAL-NEXT: v_add_nc_u32_e32 v0, 0xfffff000, v0 4639; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 1 4640; GFX1030-PAL-NEXT: scratch_store_byte v0, v1, off offset:-129 4641; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 4642; GFX1030-PAL-NEXT: scratch_load_ubyte v0, v0, off offset:-129 glc dlc 4643; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 4644; GFX1030-PAL-NEXT: s_setpc_b64 s[30:31] 4645; 4646; GFX11-PAL-LABEL: store_load_i32_large_negative_unaligned: 4647; GFX11-PAL: ; %bb.0: ; %bb 4648; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4649; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_add_nc_u32 v0, 0xfffff000, v0 4650; GFX11-PAL-NEXT: scratch_store_b8 v0, v1, off offset:-129 dlc 4651; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 4652; GFX11-PAL-NEXT: scratch_load_u8 v0, v0, off offset:-129 glc dlc 4653; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 4654; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] 4655; 4656; GFX12-PAL-LABEL: store_load_i32_large_negative_unaligned: 4657; GFX12-PAL: ; %bb.0: ; %bb 4658; GFX12-PAL-NEXT: s_wait_loadcnt_dscnt 0x0 4659; 
GFX12-PAL-NEXT: s_wait_expcnt 0x0 4660; GFX12-PAL-NEXT: s_wait_samplecnt 0x0 4661; GFX12-PAL-NEXT: s_wait_bvhcnt 0x0 4662; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 4663; GFX12-PAL-NEXT: v_mov_b32_e32 v1, 1 4664; GFX12-PAL-NEXT: s_wait_storecnt 0x0 4665; GFX12-PAL-NEXT: scratch_store_b8 v0, v1, off offset:-4225 scope:SCOPE_SYS 4666; GFX12-PAL-NEXT: s_wait_storecnt 0x0 4667; GFX12-PAL-NEXT: scratch_load_u8 v0, v0, off offset:-4225 scope:SCOPE_SYS 4668; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 4669; GFX12-PAL-NEXT: s_setpc_b64 s[30:31] 4670bb: 4671 %ptr = getelementptr inbounds i8, ptr addrspace(5) %arg, i32 -4225 4672 store volatile i8 1, ptr addrspace(5) %ptr, align 1 4673 %load = load volatile i8, ptr addrspace(5) %ptr, align 1 4674 ret void 4675} 4676 4677define amdgpu_ps void @large_offset() { 4678; GFX9-LABEL: large_offset: 4679; GFX9: ; %bb.0: ; %bb 4680; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s2 4681; GFX9-NEXT: v_mov_b32_e32 v0, 0 4682; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 4683; GFX9-NEXT: v_mov_b32_e32 v1, v0 4684; GFX9-NEXT: v_mov_b32_e32 v2, v0 4685; GFX9-NEXT: v_mov_b32_e32 v3, v0 4686; GFX9-NEXT: s_mov_b32 s0, 0 4687; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:3024 4688; GFX9-NEXT: s_waitcnt vmcnt(0) 4689; GFX9-NEXT: scratch_load_dwordx4 v[0:3], off, s0 offset:3024 glc 4690; GFX9-NEXT: s_waitcnt vmcnt(0) 4691; GFX9-NEXT: s_mov_b32 s0, 16 4692; GFX9-NEXT: ;;#ASMSTART 4693; GFX9-NEXT: ; use s0 4694; GFX9-NEXT: ;;#ASMEND 4695; GFX9-NEXT: s_movk_i32 s0, 0x810 4696; GFX9-NEXT: ;;#ASMSTART 4697; GFX9-NEXT: ; use s0 4698; GFX9-NEXT: ;;#ASMEND 4699; GFX9-NEXT: s_endpgm 4700; 4701; GFX10-LABEL: large_offset: 4702; GFX10: ; %bb.0: ; %bb 4703; GFX10-NEXT: s_add_u32 s0, s0, s2 4704; GFX10-NEXT: s_addc_u32 s1, s1, 0 4705; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 4706; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 4707; GFX10-NEXT: v_mov_b32_e32 v0, 0 4708; GFX10-NEXT: s_movk_i32 s0, 0x810 4709; GFX10-NEXT: s_add_i32 s1, s0, 0x3c0 
4710; GFX10-NEXT: v_mov_b32_e32 v1, v0 4711; GFX10-NEXT: v_mov_b32_e32 v2, v0 4712; GFX10-NEXT: v_mov_b32_e32 v3, v0 4713; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s1 4714; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 4715; GFX10-NEXT: scratch_load_dwordx4 v[0:3], off, s1 glc dlc 4716; GFX10-NEXT: s_waitcnt vmcnt(0) 4717; GFX10-NEXT: s_mov_b32 s1, 16 4718; GFX10-NEXT: ;;#ASMSTART 4719; GFX10-NEXT: ; use s1 4720; GFX10-NEXT: ;;#ASMEND 4721; GFX10-NEXT: ;;#ASMSTART 4722; GFX10-NEXT: ; use s0 4723; GFX10-NEXT: ;;#ASMEND 4724; GFX10-NEXT: s_endpgm 4725; 4726; GFX11-LABEL: large_offset: 4727; GFX11: ; %bb.0: ; %bb 4728; GFX11-NEXT: v_mov_b32_e32 v0, 0 4729; GFX11-NEXT: s_mov_b32 s0, 16 4730; GFX11-NEXT: s_movk_i32 s1, 0x810 4731; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 4732; GFX11-NEXT: v_mov_b32_e32 v1, v0 4733; GFX11-NEXT: v_mov_b32_e32 v2, v0 4734; GFX11-NEXT: v_mov_b32_e32 v3, v0 4735; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:3024 dlc 4736; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 4737; GFX11-NEXT: scratch_load_b128 v[0:3], off, off offset:3024 glc dlc 4738; GFX11-NEXT: s_waitcnt vmcnt(0) 4739; GFX11-NEXT: ;;#ASMSTART 4740; GFX11-NEXT: ; use s0 4741; GFX11-NEXT: ;;#ASMEND 4742; GFX11-NEXT: ;;#ASMSTART 4743; GFX11-NEXT: ; use s1 4744; GFX11-NEXT: ;;#ASMEND 4745; GFX11-NEXT: s_endpgm 4746; 4747; GFX12-LABEL: large_offset: 4748; GFX12: ; %bb.0: ; %bb 4749; GFX12-NEXT: v_mov_b32_e32 v0, 0 4750; GFX12-NEXT: s_mov_b32 s0, 0 4751; GFX12-NEXT: s_movk_i32 s1, 0x800 4752; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 4753; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, v0 4754; GFX12-NEXT: v_mov_b32_e32 v3, v0 4755; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:3008 scope:SCOPE_SYS 4756; GFX12-NEXT: s_wait_storecnt 0x0 4757; GFX12-NEXT: scratch_load_b128 v[0:3], off, off offset:3008 scope:SCOPE_SYS 4758; GFX12-NEXT: s_wait_loadcnt 0x0 4759; GFX12-NEXT: ;;#ASMSTART 4760; GFX12-NEXT: ; use s0 4761; GFX12-NEXT: ;;#ASMEND 4762; GFX12-NEXT: 
;;#ASMSTART 4763; GFX12-NEXT: ; use s1 4764; GFX12-NEXT: ;;#ASMEND 4765; GFX12-NEXT: s_endpgm 4766; 4767; GFX9-PAL-LABEL: large_offset: 4768; GFX9-PAL: ; %bb.0: ; %bb 4769; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 4770; GFX9-PAL-NEXT: s_mov_b32 s2, s0 4771; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 4772; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 0 4773; GFX9-PAL-NEXT: v_mov_b32_e32 v1, v0 4774; GFX9-PAL-NEXT: v_mov_b32_e32 v2, v0 4775; GFX9-PAL-NEXT: v_mov_b32_e32 v3, v0 4776; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 4777; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 4778; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s0 4779; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 4780; GFX9-PAL-NEXT: s_mov_b32 s0, 0 4781; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:3024 4782; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 4783; GFX9-PAL-NEXT: scratch_load_dwordx4 v[0:3], off, s0 offset:3024 glc 4784; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 4785; GFX9-PAL-NEXT: s_mov_b32 s0, 16 4786; GFX9-PAL-NEXT: ;;#ASMSTART 4787; GFX9-PAL-NEXT: ; use s0 4788; GFX9-PAL-NEXT: ;;#ASMEND 4789; GFX9-PAL-NEXT: s_movk_i32 s0, 0x810 4790; GFX9-PAL-NEXT: ;;#ASMSTART 4791; GFX9-PAL-NEXT: ; use s0 4792; GFX9-PAL-NEXT: ;;#ASMEND 4793; GFX9-PAL-NEXT: s_endpgm 4794; 4795; GFX940-LABEL: large_offset: 4796; GFX940: ; %bb.0: ; %bb 4797; GFX940-NEXT: v_mov_b32_e32 v0, 0 4798; GFX940-NEXT: v_mov_b32_e32 v1, v0 4799; GFX940-NEXT: v_mov_b32_e32 v2, v0 4800; GFX940-NEXT: v_mov_b32_e32 v3, v0 4801; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:3024 sc0 sc1 4802; GFX940-NEXT: s_waitcnt vmcnt(0) 4803; GFX940-NEXT: scratch_load_dwordx4 v[0:3], off, off offset:3024 sc0 sc1 4804; GFX940-NEXT: s_waitcnt vmcnt(0) 4805; GFX940-NEXT: s_mov_b32 s0, 16 4806; GFX940-NEXT: ;;#ASMSTART 4807; GFX940-NEXT: ; use s0 4808; GFX940-NEXT: ;;#ASMEND 4809; GFX940-NEXT: s_movk_i32 s0, 0x810 4810; GFX940-NEXT: ;;#ASMSTART 4811; GFX940-NEXT: ; use s0 4812; GFX940-NEXT: ;;#ASMEND 4813; GFX940-NEXT: s_endpgm 4814; 4815; GFX1010-PAL-LABEL: 
large_offset: 4816; GFX1010-PAL: ; %bb.0: ; %bb 4817; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] 4818; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 4819; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 4820; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 4821; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff 4822; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s0 4823; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 4824; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 4825; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 4826; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 0 4827; GFX1010-PAL-NEXT: s_movk_i32 s0, 0x810 4828; GFX1010-PAL-NEXT: s_add_i32 s1, s0, 0x3c0 4829; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, v0 4830; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, v0 4831; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, v0 4832; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s1 4833; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 4834; GFX1010-PAL-NEXT: scratch_load_dwordx4 v[0:3], off, s1 glc dlc 4835; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 4836; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 4837; GFX1010-PAL-NEXT: s_mov_b32 s1, 16 4838; GFX1010-PAL-NEXT: ;;#ASMSTART 4839; GFX1010-PAL-NEXT: ; use s1 4840; GFX1010-PAL-NEXT: ;;#ASMEND 4841; GFX1010-PAL-NEXT: ;;#ASMSTART 4842; GFX1010-PAL-NEXT: ; use s0 4843; GFX1010-PAL-NEXT: ;;#ASMEND 4844; GFX1010-PAL-NEXT: s_endpgm 4845; 4846; GFX1030-PAL-LABEL: large_offset: 4847; GFX1030-PAL: ; %bb.0: ; %bb 4848; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] 4849; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 4850; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 4851; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 4852; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff 4853; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s0 4854; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 4855; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 4856; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 4857; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 0 4858; GFX1030-PAL-NEXT: s_movk_i32 s0, 0x810 4859; GFX1030-PAL-NEXT: s_add_i32 s1, 
s0, 0x3c0 4860; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, v0 4861; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, v0 4862; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, v0 4863; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s1 4864; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 4865; GFX1030-PAL-NEXT: scratch_load_dwordx4 v[0:3], off, s1 glc dlc 4866; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 4867; GFX1030-PAL-NEXT: s_mov_b32 s1, 16 4868; GFX1030-PAL-NEXT: ;;#ASMSTART 4869; GFX1030-PAL-NEXT: ; use s1 4870; GFX1030-PAL-NEXT: ;;#ASMEND 4871; GFX1030-PAL-NEXT: ;;#ASMSTART 4872; GFX1030-PAL-NEXT: ; use s0 4873; GFX1030-PAL-NEXT: ;;#ASMEND 4874; GFX1030-PAL-NEXT: s_endpgm 4875; 4876; GFX11-PAL-LABEL: large_offset: 4877; GFX11-PAL: ; %bb.0: ; %bb 4878; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 0 4879; GFX11-PAL-NEXT: s_mov_b32 s0, 16 4880; GFX11-PAL-NEXT: s_movk_i32 s1, 0x810 4881; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) 4882; GFX11-PAL-NEXT: v_mov_b32_e32 v1, v0 4883; GFX11-PAL-NEXT: v_mov_b32_e32 v2, v0 4884; GFX11-PAL-NEXT: v_mov_b32_e32 v3, v0 4885; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:3024 dlc 4886; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 4887; GFX11-PAL-NEXT: scratch_load_b128 v[0:3], off, off offset:3024 glc dlc 4888; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 4889; GFX11-PAL-NEXT: ;;#ASMSTART 4890; GFX11-PAL-NEXT: ; use s0 4891; GFX11-PAL-NEXT: ;;#ASMEND 4892; GFX11-PAL-NEXT: ;;#ASMSTART 4893; GFX11-PAL-NEXT: ; use s1 4894; GFX11-PAL-NEXT: ;;#ASMEND 4895; GFX11-PAL-NEXT: s_endpgm 4896; 4897; GFX12-PAL-LABEL: large_offset: 4898; GFX12-PAL: ; %bb.0: ; %bb 4899; GFX12-PAL-NEXT: v_mov_b32_e32 v0, 0 4900; GFX12-PAL-NEXT: s_mov_b32 s0, 0 4901; GFX12-PAL-NEXT: s_movk_i32 s1, 0x800 4902; GFX12-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) 4903; GFX12-PAL-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, v0 4904; GFX12-PAL-NEXT: v_mov_b32_e32 v3, v0 4905; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:3008 scope:SCOPE_SYS 4906; GFX12-PAL-NEXT: s_wait_storecnt 0x0 4907; 
GFX12-PAL-NEXT: scratch_load_b128 v[0:3], off, off offset:3008 scope:SCOPE_SYS 4908; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 4909; GFX12-PAL-NEXT: ;;#ASMSTART 4910; GFX12-PAL-NEXT: ; use s0 4911; GFX12-PAL-NEXT: ;;#ASMEND 4912; GFX12-PAL-NEXT: ;;#ASMSTART 4913; GFX12-PAL-NEXT: ; use s1 4914; GFX12-PAL-NEXT: ;;#ASMEND 4915; GFX12-PAL-NEXT: s_endpgm 4916bb: 4917 %alloca = alloca [128 x <4 x i32>], align 16, addrspace(5) 4918 %alloca2 = alloca [128 x <4 x i32>], align 16, addrspace(5) 4919 %gep = getelementptr inbounds [128 x <4 x i32>], ptr addrspace(5) %alloca2, i32 0, i32 60 4920 store volatile <4 x i32> zeroinitializer, ptr addrspace(5) %gep, align 16 4921 %load = load volatile <4 x i32>, ptr addrspace(5) %gep, align 16 4922 call void asm sideeffect "; use $0", "s"(ptr addrspace(5) %alloca) #0 4923 call void asm sideeffect "; use $0", "s"(ptr addrspace(5) %alloca2) #0 4924 ret void 4925} 4926 4927define amdgpu_gs void @sgpr_base_large_offset(ptr addrspace(1) %out, ptr addrspace(5) inreg %sgpr_base) { 4928; GFX9-LABEL: sgpr_base_large_offset: 4929; GFX9: ; %bb.0: ; %entry 4930; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s5 4931; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 4932; GFX9-NEXT: s_add_i32 s2, s2, 0xffe8 4933; GFX9-NEXT: scratch_load_dword v2, off, s2 4934; GFX9-NEXT: s_waitcnt vmcnt(0) 4935; GFX9-NEXT: global_store_dword v[0:1], v2, off 4936; GFX9-NEXT: s_endpgm 4937; 4938; GFX10-LABEL: sgpr_base_large_offset: 4939; GFX10: ; %bb.0: ; %entry 4940; GFX10-NEXT: s_add_u32 s0, s0, s5 4941; GFX10-NEXT: s_addc_u32 s1, s1, 0 4942; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 4943; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 4944; GFX10-NEXT: s_add_i32 s2, s2, 0xffe8 4945; GFX10-NEXT: scratch_load_dword v2, off, s2 4946; GFX10-NEXT: s_waitcnt vmcnt(0) 4947; GFX10-NEXT: global_store_dword v[0:1], v2, off 4948; GFX10-NEXT: s_endpgm 4949; 4950; GFX11-LABEL: sgpr_base_large_offset: 4951; GFX11: ; %bb.0: ; %entry 4952; GFX11-NEXT: s_add_i32 s0, s0, 
0xffe8 4953; GFX11-NEXT: scratch_load_b32 v2, off, s0 4954; GFX11-NEXT: s_waitcnt vmcnt(0) 4955; GFX11-NEXT: global_store_b32 v[0:1], v2, off 4956; GFX11-NEXT: s_endpgm 4957; 4958; GFX12-LABEL: sgpr_base_large_offset: 4959; GFX12: ; %bb.0: ; %entry 4960; GFX12-NEXT: scratch_load_b32 v2, off, s0 offset:65512 4961; GFX12-NEXT: s_wait_loadcnt 0x0 4962; GFX12-NEXT: global_store_b32 v[0:1], v2, off 4963; GFX12-NEXT: s_endpgm 4964; 4965; GFX9-PAL-LABEL: sgpr_base_large_offset: 4966; GFX9-PAL: ; %bb.0: ; %entry 4967; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 4968; GFX9-PAL-NEXT: s_mov_b32 s2, s8 4969; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 4970; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 4971; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 4972; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s5 4973; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 4974; GFX9-PAL-NEXT: s_add_i32 s0, s0, 0xffe8 4975; GFX9-PAL-NEXT: scratch_load_dword v2, off, s0 4976; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 4977; GFX9-PAL-NEXT: global_store_dword v[0:1], v2, off 4978; GFX9-PAL-NEXT: s_endpgm 4979; 4980; GFX940-LABEL: sgpr_base_large_offset: 4981; GFX940: ; %bb.0: ; %entry 4982; GFX940-NEXT: s_add_i32 s0, s0, 0xffe8 4983; GFX940-NEXT: scratch_load_dword v2, off, s0 4984; GFX940-NEXT: s_waitcnt vmcnt(0) 4985; GFX940-NEXT: global_store_dword v[0:1], v2, off sc0 sc1 4986; GFX940-NEXT: s_endpgm 4987; 4988; GFX10-PAL-LABEL: sgpr_base_large_offset: 4989; GFX10-PAL: ; %bb.0: ; %entry 4990; GFX10-PAL-NEXT: s_getpc_b64 s[2:3] 4991; GFX10-PAL-NEXT: s_mov_b32 s2, s8 4992; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 4993; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 4994; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff 4995; GFX10-PAL-NEXT: s_add_u32 s2, s2, s5 4996; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 4997; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 4998; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 4999; GFX10-PAL-NEXT: s_add_i32 s0, s0, 0xffe8 5000; GFX10-PAL-NEXT: scratch_load_dword v2, off, 
s0 5001; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 5002; GFX10-PAL-NEXT: global_store_dword v[0:1], v2, off 5003; GFX10-PAL-NEXT: s_endpgm 5004; 5005; GFX11-PAL-LABEL: sgpr_base_large_offset: 5006; GFX11-PAL: ; %bb.0: ; %entry 5007; GFX11-PAL-NEXT: s_add_i32 s0, s0, 0xffe8 5008; GFX11-PAL-NEXT: scratch_load_b32 v2, off, s0 5009; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 5010; GFX11-PAL-NEXT: global_store_b32 v[0:1], v2, off 5011; GFX11-PAL-NEXT: s_endpgm 5012; 5013; GFX12-PAL-LABEL: sgpr_base_large_offset: 5014; GFX12-PAL: ; %bb.0: ; %entry 5015; GFX12-PAL-NEXT: scratch_load_b32 v2, off, s0 offset:65512 5016; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 5017; GFX12-PAL-NEXT: global_store_b32 v[0:1], v2, off 5018; GFX12-PAL-NEXT: s_endpgm 5019entry: 5020 %large_offset = getelementptr i8, ptr addrspace(5) %sgpr_base, i32 65512 5021 %load = load i32, ptr addrspace(5) %large_offset, align 4 5022 store i32 %load, ptr addrspace(1) %out 5023 ret void 5024} 5025 5026define amdgpu_gs void @sgpr_base_large_offset_split(ptr addrspace(1) %out, ptr addrspace(5) inreg %sgpr_base) { 5027; GFX9-LABEL: sgpr_base_large_offset_split: 5028; GFX9: ; %bb.0: ; %entry 5029; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s5 5030; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 5031; GFX9-NEXT: s_and_b32 s0, s2, -4 5032; GFX9-NEXT: s_add_i32 s0, s0, 0x100f000 5033; GFX9-NEXT: scratch_load_dword v2, off, s0 offset:4072 glc 5034; GFX9-NEXT: s_waitcnt vmcnt(0) 5035; GFX9-NEXT: global_store_dword v[0:1], v2, off 5036; GFX9-NEXT: s_endpgm 5037; 5038; GFX10-LABEL: sgpr_base_large_offset_split: 5039; GFX10: ; %bb.0: ; %entry 5040; GFX10-NEXT: s_add_u32 s0, s0, s5 5041; GFX10-NEXT: s_addc_u32 s1, s1, 0 5042; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 5043; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 5044; GFX10-NEXT: s_and_b32 s0, s2, -4 5045; GFX10-NEXT: s_add_i32 s0, s0, 0x100f800 5046; GFX10-NEXT: scratch_load_dword v2, off, s0 offset:2024 glc dlc 5047; GFX10-NEXT: s_waitcnt vmcnt(0) 5048; GFX10-NEXT: 
global_store_dword v[0:1], v2, off 5049; GFX10-NEXT: s_endpgm 5050; 5051; GFX11-LABEL: sgpr_base_large_offset_split: 5052; GFX11: ; %bb.0: ; %entry 5053; GFX11-NEXT: v_mov_b32_e32 v2, 0x100f000 5054; GFX11-NEXT: s_and_b32 s0, s0, -4 5055; GFX11-NEXT: scratch_load_b32 v2, v2, s0 offset:4072 glc dlc 5056; GFX11-NEXT: s_waitcnt vmcnt(0) 5057; GFX11-NEXT: global_store_b32 v[0:1], v2, off 5058; GFX11-NEXT: s_endpgm 5059; 5060; GFX12-LABEL: sgpr_base_large_offset_split: 5061; GFX12: ; %bb.0: ; %entry 5062; GFX12-NEXT: v_mov_b32_e32 v2, 0x1000000 5063; GFX12-NEXT: s_and_b32 s0, s0, -4 5064; GFX12-NEXT: scratch_load_b32 v2, v2, s0 offset:65512 scope:SCOPE_SYS 5065; GFX12-NEXT: s_wait_loadcnt 0x0 5066; GFX12-NEXT: global_store_b32 v[0:1], v2, off 5067; GFX12-NEXT: s_endpgm 5068; 5069; GFX9-PAL-LABEL: sgpr_base_large_offset_split: 5070; GFX9-PAL: ; %bb.0: ; %entry 5071; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 5072; GFX9-PAL-NEXT: s_mov_b32 s2, s8 5073; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 5074; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 5075; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 5076; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s5 5077; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 5078; GFX9-PAL-NEXT: s_and_b32 s0, s0, -4 5079; GFX9-PAL-NEXT: s_add_i32 s0, s0, 0x100f000 5080; GFX9-PAL-NEXT: scratch_load_dword v2, off, s0 offset:4072 glc 5081; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 5082; GFX9-PAL-NEXT: global_store_dword v[0:1], v2, off 5083; GFX9-PAL-NEXT: s_endpgm 5084; 5085; GFX940-LABEL: sgpr_base_large_offset_split: 5086; GFX940: ; %bb.0: ; %entry 5087; GFX940-NEXT: s_and_b32 s0, s0, -4 5088; GFX940-NEXT: v_mov_b32_e32 v2, 0x100f000 5089; GFX940-NEXT: scratch_load_dword v2, v2, s0 offset:4072 sc0 sc1 5090; GFX940-NEXT: s_waitcnt vmcnt(0) 5091; GFX940-NEXT: global_store_dword v[0:1], v2, off sc0 sc1 5092; GFX940-NEXT: s_endpgm 5093; 5094; GFX10-PAL-LABEL: sgpr_base_large_offset_split: 5095; GFX10-PAL: ; %bb.0: ; %entry 5096; GFX10-PAL-NEXT: s_getpc_b64 s[2:3] 5097; 
GFX10-PAL-NEXT: s_mov_b32 s2, s8 5098; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 5099; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 5100; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff 5101; GFX10-PAL-NEXT: s_add_u32 s2, s2, s5 5102; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 5103; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 5104; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 5105; GFX10-PAL-NEXT: s_and_b32 s0, s0, -4 5106; GFX10-PAL-NEXT: s_add_i32 s0, s0, 0x100f800 5107; GFX10-PAL-NEXT: scratch_load_dword v2, off, s0 offset:2024 glc dlc 5108; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 5109; GFX10-PAL-NEXT: global_store_dword v[0:1], v2, off 5110; GFX10-PAL-NEXT: s_endpgm 5111; 5112; GFX11-PAL-LABEL: sgpr_base_large_offset_split: 5113; GFX11-PAL: ; %bb.0: ; %entry 5114; GFX11-PAL-NEXT: v_mov_b32_e32 v2, 0x100f000 5115; GFX11-PAL-NEXT: s_and_b32 s0, s0, -4 5116; GFX11-PAL-NEXT: scratch_load_b32 v2, v2, s0 offset:4072 glc dlc 5117; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 5118; GFX11-PAL-NEXT: global_store_b32 v[0:1], v2, off 5119; GFX11-PAL-NEXT: s_endpgm 5120; 5121; GFX12-PAL-LABEL: sgpr_base_large_offset_split: 5122; GFX12-PAL: ; %bb.0: ; %entry 5123; GFX12-PAL-NEXT: v_mov_b32_e32 v2, 0x1000000 5124; GFX12-PAL-NEXT: s_and_b32 s0, s0, -4 5125; GFX12-PAL-NEXT: scratch_load_b32 v2, v2, s0 offset:65512 scope:SCOPE_SYS 5126; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 5127; GFX12-PAL-NEXT: global_store_b32 v[0:1], v2, off 5128; GFX12-PAL-NEXT: s_endpgm 5129entry: 5130 ;%alignedBase = alloca [33554432 x i8], align 4, addrspace(5) 5131 %sgpr_base_i32 = ptrtoint ptr addrspace(5) %sgpr_base to i32 5132 %sgpr_base_i32_align4 = and i32 %sgpr_base_i32, 4294967292 5133 %sgpr_base_align4 = inttoptr i32 %sgpr_base_i32_align4 to ptr addrspace(5) 5134 %split_offset = getelementptr inbounds [33554432 x i8], ptr addrspace(5) %sgpr_base_align4, i32 0, i32 16842728 5135 %load = load volatile i32, ptr addrspace(5) %split_offset, align 4 5136 store i32 %load, ptr addrspace(1) %out 5137 ret
void 5138} 5139 5140define amdgpu_gs void @sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset(ptr addrspace(5) inreg %sgpr_base, i32 inreg %sidx, i32 %vidx) { 5141; GFX9-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset: 5142; GFX9: ; %bb.0: ; %bb 5143; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s5 5144; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 5145; GFX9-NEXT: s_add_i32 s2, s2, s3 5146; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 5147; GFX9-NEXT: v_add_u32_e32 v0, 0xffe8, v0 5148; GFX9-NEXT: v_mov_b32_e32 v1, 15 5149; GFX9-NEXT: scratch_store_dword v0, v1, off 5150; GFX9-NEXT: s_waitcnt vmcnt(0) 5151; GFX9-NEXT: s_endpgm 5152; 5153; GFX10-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset: 5154; GFX10: ; %bb.0: ; %bb 5155; GFX10-NEXT: s_add_u32 s0, s0, s5 5156; GFX10-NEXT: s_addc_u32 s1, s1, 0 5157; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 5158; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 5159; GFX10-NEXT: s_add_i32 s2, s2, s3 5160; GFX10-NEXT: v_mov_b32_e32 v1, 15 5161; GFX10-NEXT: v_add3_u32 v0, s2, v0, 0xffe8 5162; GFX10-NEXT: scratch_store_dword v0, v1, off 5163; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 5164; GFX10-NEXT: s_endpgm 5165; 5166; GFX11-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset: 5167; GFX11: ; %bb.0: ; %bb 5168; GFX11-NEXT: s_add_i32 s0, s0, s1 5169; GFX11-NEXT: v_mov_b32_e32 v1, 15 5170; GFX11-NEXT: v_add3_u32 v0, s0, v0, 0xffe8 5171; GFX11-NEXT: scratch_store_b32 v0, v1, off dlc 5172; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 5173; GFX11-NEXT: s_endpgm 5174; 5175; GFX12-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset: 5176; GFX12: ; %bb.0: ; %bb 5177; GFX12-NEXT: v_mov_b32_e32 v1, 15 5178; GFX12-NEXT: s_add_co_i32 s0, s0, s1 5179; GFX12-NEXT: scratch_store_b32 v0, v1, s0 offset:65512 scope:SCOPE_SYS 5180; GFX12-NEXT: s_wait_storecnt 0x0 5181; GFX12-NEXT: s_endpgm 5182; 5183; GFX9-PAL-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset: 5184; GFX9-PAL: ; %bb.0: ; %bb 5185; 
GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 5186; GFX9-PAL-NEXT: s_mov_b32 s2, s8 5187; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 5188; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 15 5189; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 5190; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 5191; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s5 5192; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 5193; GFX9-PAL-NEXT: s_add_i32 s0, s0, s1 5194; GFX9-PAL-NEXT: v_add_u32_e32 v0, s0, v0 5195; GFX9-PAL-NEXT: v_add_u32_e32 v0, 0xffe8, v0 5196; GFX9-PAL-NEXT: scratch_store_dword v0, v1, off 5197; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 5198; GFX9-PAL-NEXT: s_endpgm 5199; 5200; GFX940-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset: 5201; GFX940: ; %bb.0: ; %bb 5202; GFX940-NEXT: s_add_i32 s0, s0, s1 5203; GFX940-NEXT: v_add_u32_e32 v0, s0, v0 5204; GFX940-NEXT: v_add_u32_e32 v0, 0xffe8, v0 5205; GFX940-NEXT: v_mov_b32_e32 v1, 15 5206; GFX940-NEXT: scratch_store_dword v0, v1, off sc0 sc1 5207; GFX940-NEXT: s_waitcnt vmcnt(0) 5208; GFX940-NEXT: s_endpgm 5209; 5210; GFX10-PAL-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset: 5211; GFX10-PAL: ; %bb.0: ; %bb 5212; GFX10-PAL-NEXT: s_getpc_b64 s[2:3] 5213; GFX10-PAL-NEXT: s_mov_b32 s2, s8 5214; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 5215; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 5216; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff 5217; GFX10-PAL-NEXT: s_add_u32 s2, s2, s5 5218; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 5219; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 5220; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 5221; GFX10-PAL-NEXT: s_add_i32 s0, s0, s1 5222; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 5223; GFX10-PAL-NEXT: v_add3_u32 v0, s0, v0, 0xffe8 5224; GFX10-PAL-NEXT: scratch_store_dword v0, v1, off 5225; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 5226; GFX10-PAL-NEXT: s_endpgm 5227; 5228; GFX11-PAL-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset: 5229; GFX11-PAL: ; %bb.0: ; %bb 5230; GFX11-PAL-NEXT: 
s_add_i32 s0, s0, s1
; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 15
; GFX11-PAL-NEXT: v_add3_u32 v0, s0, v0, 0xffe8
; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off dlc
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT: s_endpgm
;
; GFX12-PAL-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset:
; GFX12-PAL: ; %bb.0: ; %bb
; GFX12-PAL-NEXT: v_mov_b32_e32 v1, 15
; GFX12-PAL-NEXT: s_add_co_i32 s0, s0, s1
; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, s0 offset:65512 scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
; GFX12-PAL-NEXT: s_endpgm
bb:
  %add1 = add nsw i32 %sidx, %vidx
  %add2 = add nsw i32 %add1, 65512
  %gep = getelementptr inbounds [33554432 x i8], ptr addrspace(5) %sgpr_base, i32 0, i32 %add2
  store volatile i32 15, ptr addrspace(5) %gep, align 4
  ret void
}

; Volatile i32 store of the constant 15 to an addrspace(5) address formed as
; %sgpr_base + (%sidx + %vidx) + (-16). %sgpr_base and %sidx are passed inreg;
; %vidx is not.
; In the assertions below, the -16 is applied with a separate v_add_u32 for the
; GFX9, GFX9-PAL and GFX940 prefixes, and appears as offset:-16 on the scratch
; store for the GFX10, GFX11 and GFX12 prefixes and their PAL variants.
; The assertions themselves are autogenerated (see the NOTE at the top of the
; file); regenerate them with the script rather than editing them by hand.
define amdgpu_gs void @sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset(ptr addrspace(5) inreg %sgpr_base, i32 inreg %sidx, i32 %vidx) {
; GFX9-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s5
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT: s_add_i32 s2, s2, s3
; GFX9-NEXT: v_add_u32_e32 v0, s2, v0
; GFX9-NEXT: v_add_u32_e32 v0, -16, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 15
; GFX9-NEXT: scratch_store_dword v0, v1, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s5
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_add3_u32 v0, s2, s3, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 15
; GFX10-NEXT: scratch_store_dword v0, v1, off offset:-16
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_add3_u32 v0, s0, s1, v0
; GFX11-NEXT: v_mov_b32_e32 v1, 15
; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:-16 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v1, 15
; GFX12-NEXT: s_add_co_i32 s0, s0, s1
; GFX12-NEXT: scratch_store_b32 v0, v1, s0 offset:-16 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_endpgm
;
; GFX9-PAL-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
; GFX9-PAL: ; %bb.0: ; %bb
; GFX9-PAL-NEXT: s_getpc_b64 s[2:3]
; GFX9-PAL-NEXT: s_mov_b32 s2, s8
; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 15
; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff
; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s5
; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-PAL-NEXT: s_add_i32 s0, s0, s1
; GFX9-PAL-NEXT: v_add_u32_e32 v0, s0, v0
; GFX9-PAL-NEXT: v_add_u32_e32 v0, -16, v0
; GFX9-PAL-NEXT: scratch_store_dword v0, v1, off
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: s_endpgm
;
; GFX940-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
; GFX940: ; %bb.0: ; %bb
; GFX940-NEXT: s_add_i32 s0, s0, s1
; GFX940-NEXT: v_add_u32_e32 v0, s0, v0
; GFX940-NEXT: v_add_u32_e32 v0, -16, v0
; GFX940-NEXT: v_mov_b32_e32 v1, 15
; GFX940-NEXT: scratch_store_dword v0, v1, off sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_endpgm
;
; GFX10-PAL-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
; GFX10-PAL: ; %bb.0: ; %bb
; GFX10-PAL-NEXT: s_getpc_b64 s[2:3]
; GFX10-PAL-NEXT: s_mov_b32 s2, s8
; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff
; GFX10-PAL-NEXT: s_add_u32 s2, s2, s5
; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0
; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-PAL-NEXT: v_add3_u32 v0, s0, s1, v0
; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15
; GFX10-PAL-NEXT: scratch_store_dword v0, v1, off offset:-16
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT: s_endpgm
;
; GFX11-PAL-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
; GFX11-PAL: ; %bb.0: ; %bb
; GFX11-PAL-NEXT: v_add3_u32 v0, s0, s1, v0
; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 15
; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off offset:-16 dlc
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT: s_endpgm
;
; GFX12-PAL-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
; GFX12-PAL: ; %bb.0: ; %bb
; GFX12-PAL-NEXT: v_mov_b32_e32 v1, 15
; GFX12-PAL-NEXT: s_add_co_i32 s0, s0, s1
; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, s0 offset:-16 scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
; GFX12-PAL-NEXT: s_endpgm
bb:
  ; Index = inreg index + non-inreg index, then the negative constant.
  %add1 = add nsw i32 %sidx, %vidx
  %add2 = add nsw i32 %add1, -16
  %gep = getelementptr inbounds [16 x i8], ptr addrspace(5) %sgpr_base, i32 0, i32 %add2
  ; The store is volatile, so it is kept even though nothing reads it back.
  store volatile i32 15, ptr addrspace(5) %gep, align 4
  ret void
}

; Loads an i32 from 24 bytes below the inreg addrspace(5) pointer %scevgep and
; stores the loaded value through the addrspace(1) pointer %out.
; In the assertions below, the GFX9, GFX9-PAL and GFX940 prefixes first adjust
; the SGPR holding the base with s_addk_i32 (immediate 0xffe8) and then load
; with no offset operand; the GFX10, GFX11 and GFX12 prefixes and their PAL
; variants leave the base untouched and use offset:-24 on the scratch load.
define amdgpu_gs void @sgpr_base_negative_offset(ptr addrspace(1) %out, ptr addrspace(5) inreg %scevgep) {
; GFX9-LABEL: sgpr_base_negative_offset:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s5
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT: s_addk_i32 s2, 0xffe8
; GFX9-NEXT: scratch_load_dword v2, off, s2
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: sgpr_base_negative_offset:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_add_u32 s0, s0, s5
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: scratch_load_dword v2, off, s2 offset:-24
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_dword v[0:1], v2, off
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: sgpr_base_negative_offset:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: scratch_load_b32 v2, off, s0 offset:-24
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b32 v[0:1], v2, off
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: sgpr_base_negative_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: scratch_load_b32 v2, off, s0 offset:-24
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b32 v[0:1], v2, off
; GFX12-NEXT: s_endpgm
;
; GFX9-PAL-LABEL: sgpr_base_negative_offset:
; GFX9-PAL: ; %bb.0: ; %entry
; GFX9-PAL-NEXT: s_getpc_b64 s[2:3]
; GFX9-PAL-NEXT: s_mov_b32 s2, s8
; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff
; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s5
; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-PAL-NEXT: s_addk_i32 s0, 0xffe8
; GFX9-PAL-NEXT: scratch_load_dword v2, off, s0
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: global_store_dword v[0:1], v2, off
; GFX9-PAL-NEXT: s_endpgm
;
; GFX940-LABEL: sgpr_base_negative_offset:
; GFX940: ; %bb.0: ; %entry
; GFX940-NEXT: s_addk_i32 s0, 0xffe8
; GFX940-NEXT: scratch_load_dword v2, off, s0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: global_store_dword v[0:1], v2, off sc0 sc1
; GFX940-NEXT: s_endpgm
;
; GFX10-PAL-LABEL: sgpr_base_negative_offset:
; GFX10-PAL: ; %bb.0: ; %entry
; GFX10-PAL-NEXT: s_getpc_b64 s[2:3]
; GFX10-PAL-NEXT: s_mov_b32 s2, s8
; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff
; GFX10-PAL-NEXT: s_add_u32 s2, s2, s5
; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0
; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-PAL-NEXT: scratch_load_dword v2, off, s0 offset:-24
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT: global_store_dword v[0:1], v2, off
; GFX10-PAL-NEXT: s_endpgm
;
; GFX11-PAL-LABEL: sgpr_base_negative_offset:
; GFX11-PAL: ; %bb.0: ; %entry
; GFX11-PAL-NEXT: scratch_load_b32 v2, off, s0 offset:-24
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT: global_store_b32 v[0:1], v2, off
; GFX11-PAL-NEXT: s_endpgm
;
; GFX12-PAL-LABEL: sgpr_base_negative_offset:
; GFX12-PAL: ; %bb.0: ; %entry
; GFX12-PAL-NEXT: scratch_load_b32 v2, off, s0 offset:-24
; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
; GFX12-PAL-NEXT: global_store_b32 v[0:1], v2, off
; GFX12-PAL-NEXT: s_endpgm
entry:
  ; Byte-granular GEP: the address is %scevgep minus 24 bytes.
  %scevgep28 = getelementptr i8, ptr addrspace(5) %scevgep, i32 -24
  %0 = load i32, ptr addrspace(5) %scevgep28, align 4
  store i32 %0, ptr addrspace(1) %out
  ret void
}

; Intrinsic declarations. Neither is referenced by the two functions directly
; above; presumably they are used by tests earlier in the file (not verified
; from this excerpt).
declare void @llvm.memset.p5.i64(ptr addrspace(5) nocapture writeonly, i8, i64, i1 immarg)
declare i32 @llvm.amdgcn.workitem.id.x()
