; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX12 %s

; vgpr offset

define amdgpu_ps void @test_scratch_load_i8_zext_v(ptr addrspace(5) %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_zext_v:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_add_u32 s0, s0, s2
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: scratch_load_ubyte v0, v0, off offset:1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_zext_v:
; GFX11: ; %bb.0:
; GFX11-NEXT: scratch_load_u8 v0, v0, off offset:1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v0
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_load_i8_zext_v:
; GFX12: ; %bb.0:
; GFX12-NEXT: scratch_load_u8 v0, v0, off offset:1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[1:2], v0
; GFX12-NEXT: s_endpgm
  %gep = getelementptr inbounds i8, ptr addrspace(5) %in, i32 1
  %load = load i8, ptr addrspace(5) %gep, align 4
  %ext = zext i8 %load to i32
  store i32 %ext, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i8_sext_v(ptr addrspace(5) %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_sext_v:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_add_u32 s0, s0, s2
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: scratch_load_sbyte v0, v0, off offset:1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_sext_v:
; GFX11: ; %bb.0:
; GFX11-NEXT: scratch_load_i8 v0, v0, off offset:1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v0
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_load_i8_sext_v:
; GFX12: ; %bb.0:
; GFX12-NEXT: scratch_load_i8 v0, v0, off offset:1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[1:2], v0
; GFX12-NEXT: s_endpgm
  %gep = getelementptr inbounds i8, ptr addrspace(5) %in, i32 1
  %load = load i8, ptr addrspace(5) %gep, align 4
  %ext = sext i8 %load to i32
  store i32 %ext, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i16_zext_v(ptr addrspace(5) %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i16_zext_v:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_add_u32 s0, s0, s2
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: scratch_load_ushort v0, v0, off offset:2
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i16_zext_v:
; GFX11: ; %bb.0:
; GFX11-NEXT: scratch_load_u16 v0, v0, off offset:2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v0
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_load_i16_zext_v:
; GFX12: ; %bb.0:
; GFX12-NEXT: scratch_load_u16 v0, v0, off offset:2
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[1:2], v0
; GFX12-NEXT: s_endpgm
  %gep = getelementptr inbounds i16, ptr addrspace(5) %in, i32 1
  %load = load i16, ptr addrspace(5) %gep, align 4
  %ext = zext i16 %load to i32
  store i32 %ext, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i16_sext_v(ptr addrspace(5) %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i16_sext_v:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_add_u32 s0, s0, s2
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: scratch_load_sshort v0, v0, off offset:2
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i16_sext_v:
; GFX11: ; %bb.0:
; GFX11-NEXT: scratch_load_i16 v0, v0, off offset:2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v0
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_load_i16_sext_v:
; GFX12: ; %bb.0:
; GFX12-NEXT: scratch_load_i16 v0, v0, off offset:2
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[1:2], v0
; GFX12-NEXT: s_endpgm
  %gep = getelementptr inbounds i16, ptr addrspace(5) %in, i32 1
  %load = load i16, ptr addrspace(5) %gep, align 4
  %ext = sext i16 %load to i32
  store i32 %ext, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_lo_v(ptr addrspace(5) %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_zext_to_d16_lo_v:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s2
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_add_nc_u32_e32 v0, 1, v0
; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff0000
; GFX10-NEXT: scratch_load_ubyte_d16 v3, v0, off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v3
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_zext_to_d16_lo_v:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_dual_mov_b32 v3, 0xffff0000 :: v_dual_add_nc_u32 v0, 1, v0
; GFX11-NEXT: scratch_load_d16_u8 v3, v0, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v3
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_load_i8_zext_to_d16_lo_v:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v3, 0xffff0000
; GFX12-NEXT: scratch_load_d16_u8 v3, v0, off offset:1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[1:2], v3
; GFX12-NEXT: s_endpgm
bb:
  %gep = getelementptr i8, ptr addrspace(5) %in, i64 1
  %load_lo = load i8, ptr addrspace(5) %gep
  %ext = zext i8 %load_lo to i16
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %ext, i32 0
  store <2 x i16> %result, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_lo_v(ptr addrspace(5) %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_sext_to_d16_lo_v:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s2
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_add_nc_u32_e32 v0, 1, v0
; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff0000
; GFX10-NEXT: scratch_load_sbyte_d16 v3, v0, off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v3
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_sext_to_d16_lo_v:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_dual_mov_b32 v3, 0xffff0000 :: v_dual_add_nc_u32 v0, 1, v0
; GFX11-NEXT: scratch_load_d16_i8 v3, v0, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v3
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_load_i8_sext_to_d16_lo_v:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v3, 0xffff0000
; GFX12-NEXT: scratch_load_d16_i8 v3, v0, off offset:1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[1:2], v3
; GFX12-NEXT: s_endpgm
bb:
  %gep = getelementptr i8, ptr addrspace(5) %in, i64 1
  %load_lo = load i8, ptr addrspace(5) %gep
  %ext = sext i8 %load_lo to i16
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %ext, i32 0
  store <2 x i16> %result, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i16_to_d16_lo_v(ptr addrspace(5) %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i16_to_d16_lo_v:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s2
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_add_nc_u32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff0000
; GFX10-NEXT: scratch_load_short_d16 v3, v0, off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v3
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i16_to_d16_lo_v:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_dual_mov_b32 v3, 0xffff0000 :: v_dual_add_nc_u32 v0, 2, v0
; GFX11-NEXT: scratch_load_d16_b16 v3, v0, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v3
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_load_i16_to_d16_lo_v:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v3, 0xffff0000
; GFX12-NEXT: scratch_load_d16_b16 v3, v0, off offset:2
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[1:2], v3
; GFX12-NEXT: s_endpgm
bb:
  %gep = getelementptr i16, ptr addrspace(5) %in, i64 1
  %load_lo = load i16, ptr addrspace(5) %gep
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %load_lo, i32 0
  store <2 x i16> %result, ptr %out, align 4
  ret void
}


define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_hi_v(ptr addrspace(5) %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_zext_to_d16_hi_v:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s2
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_add_nc_u32_e32 v0, 1, v0
; GFX10-NEXT: v_mov_b32_e32 v3, -1
; GFX10-NEXT: scratch_load_ubyte_d16_hi v3, v0, off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v3
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_zext_to_d16_hi_v:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_dual_mov_b32 v3, -1 :: v_dual_add_nc_u32 v0, 1, v0
; GFX11-NEXT: scratch_load_d16_hi_u8 v3, v0, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v3
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_load_i8_zext_to_d16_hi_v:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v3, -1
; GFX12-NEXT: scratch_load_d16_hi_u8 v3, v0, off offset:1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[1:2], v3
; GFX12-NEXT: s_endpgm
bb:
  %gep = getelementptr i8, ptr addrspace(5) %in, i64 1
  %load_lo = load i8, ptr addrspace(5) %gep
  %ext = zext i8 %load_lo to i16
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %ext, i32 1
  store <2 x i16> %result, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_hi_v(ptr addrspace(5) %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_sext_to_d16_hi_v:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s2
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_add_nc_u32_e32 v0, 1, v0
; GFX10-NEXT: v_mov_b32_e32 v3, -1
; GFX10-NEXT: scratch_load_sbyte_d16_hi v3, v0, off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v3
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_sext_to_d16_hi_v:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_dual_mov_b32 v3, -1 :: v_dual_add_nc_u32 v0, 1, v0
; GFX11-NEXT: scratch_load_d16_hi_i8 v3, v0, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v3
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_load_i8_sext_to_d16_hi_v:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v3, -1
; GFX12-NEXT: scratch_load_d16_hi_i8 v3, v0, off offset:1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[1:2], v3
; GFX12-NEXT: s_endpgm
bb:
  %gep = getelementptr i8, ptr addrspace(5) %in, i64 1
  %load_lo = load i8, ptr addrspace(5) %gep
  %ext = sext i8 %load_lo to i16
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %ext, i32 1
  store <2 x i16> %result, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i16_to_d16_hi_v(ptr addrspace(5) %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i16_to_d16_hi_v:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s2
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_add_nc_u32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v3, -1
; GFX10-NEXT: scratch_load_short_d16_hi v3, v0, off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v3
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i16_to_d16_hi_v:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_dual_mov_b32 v3, -1 :: v_dual_add_nc_u32 v0, 2, v0
; GFX11-NEXT: scratch_load_d16_hi_b16 v3, v0, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v3
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_load_i16_to_d16_hi_v:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v3, -1
; GFX12-NEXT: scratch_load_d16_hi_b16 v3, v0, off offset:2
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[1:2], v3
; GFX12-NEXT: s_endpgm
bb:
  %gep = getelementptr i16, ptr addrspace(5) %in, i64 1
  %load_lo = load i16, ptr addrspace(5) %gep
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %load_lo, i32 1
  store <2 x i16> %result, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_store_b8_from_d16_hi_v(ptr %in, ptr addrspace(5) %out) {
; GFX10-LABEL: test_scratch_store_b8_from_d16_hi_v:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s2
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: flat_load_dword v0, v[0:1]
; GFX10-NEXT: v_add_nc_u32_e32 v1, 4, v2
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: scratch_store_byte_d16_hi v1, v0, off
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_store_b8_from_d16_hi_v:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: flat_load_b32 v0, v[0:1]
; GFX11-NEXT: v_add_nc_u32_e32 v1, 4, v2
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: scratch_store_d16_hi_b8 v1, v0, off
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_store_b8_from_d16_hi_v:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: flat_load_b32 v0, v[0:1]
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: scratch_store_d16_hi_b8 v2, v0, off offset:4
; GFX12-NEXT: s_endpgm
bb:
  %load = load <4 x i8>, ptr %in
  %element = extractelement <4 x i8> %load, i32 2
  %gep = getelementptr <4 x i8>, ptr addrspace(5) %out, i64 1
  store i8 %element, ptr addrspace(5) %gep, align 4
  ret void
}

define amdgpu_ps void @test_scratch_store_b16_from_d16_hi_v(ptr %in, ptr addrspace(5) %out) {
; GFX10-LABEL: test_scratch_store_b16_from_d16_hi_v:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s2
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: flat_load_dword v0, v[0:1]
; GFX10-NEXT: v_add_nc_u32_e32 v1, 2, v2
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: scratch_store_short_d16_hi v1, v0, off
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_store_b16_from_d16_hi_v:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: flat_load_b32 v0, v[0:1]
; GFX11-NEXT: v_add_nc_u32_e32 v1, 2, v2
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: scratch_store_d16_hi_b16 v1, v0, off
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_store_b16_from_d16_hi_v:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: flat_load_b32 v0, v[0:1]
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: scratch_store_d16_hi_b16 v2, v0, off offset:2
; GFX12-NEXT: s_endpgm
bb:
  %load = load <2 x i16>, ptr %in
  %element = extractelement <2 x i16> %load, i32 1
  %gep = getelementptr <2 x i8>, ptr addrspace(5) %out, i64 1
  store i16 %element, ptr addrspace(5) %gep, align 4
  ret void
}




; sgpr offset

define amdgpu_ps void @test_scratch_load_i8_zext_s(ptr addrspace(5) inreg %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_zext_s:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: scratch_load_ubyte v2, off, s2 offset:1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[0:1], v2
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_zext_s:
; GFX11: ; %bb.0:
; GFX11-NEXT: scratch_load_u8 v2, off, s0 offset:1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_load_i8_zext_s:
; GFX12: ; %bb.0:
; GFX12-NEXT: scratch_load_u8 v2, off, s0 offset:1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[0:1], v2
; GFX12-NEXT: s_endpgm
  %gep = getelementptr inbounds i8, ptr addrspace(5) %in, i32 1
  %load = load i8, ptr addrspace(5) %gep, align 4
  %ext = zext i8 %load to i32
  store i32 %ext, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i8_sext_s(ptr addrspace(5) inreg %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_sext_s:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: scratch_load_sbyte v2, off, s2 offset:1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[0:1], v2
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_sext_s:
; GFX11: ; %bb.0:
; GFX11-NEXT: scratch_load_i8 v2, off, s0 offset:1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_load_i8_sext_s:
; GFX12: ; %bb.0:
; GFX12-NEXT: scratch_load_i8 v2, off, s0 offset:1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[0:1], v2
; GFX12-NEXT: s_endpgm
  %gep = getelementptr inbounds i8, ptr addrspace(5) %in, i32 1
  %load = load i8, ptr addrspace(5) %gep, align 4
  %ext = sext i8 %load to i32
  store i32 %ext, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i16_zext_s(ptr addrspace(5) inreg %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i16_zext_s:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: scratch_load_ushort v2, off, s2 offset:2
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[0:1], v2
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i16_zext_s:
; GFX11: ; %bb.0:
; GFX11-NEXT: scratch_load_u16 v2, off, s0 offset:2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_load_i16_zext_s:
; GFX12: ; %bb.0:
; GFX12-NEXT: scratch_load_u16 v2, off, s0 offset:2
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[0:1], v2
; GFX12-NEXT: s_endpgm
  %gep = getelementptr inbounds i16, ptr addrspace(5) %in, i32 1
  %load = load i16, ptr addrspace(5) %gep, align 4
  %ext = zext i16 %load to i32
  store i32 %ext, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i16_sext_s(ptr addrspace(5) inreg %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i16_sext_s:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: scratch_load_sshort v2, off, s2 offset:2
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[0:1], v2
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i16_sext_s:
; GFX11: ; %bb.0:
; GFX11-NEXT: scratch_load_i16 v2, off, s0 offset:2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_load_i16_sext_s:
; GFX12: ; %bb.0:
; GFX12-NEXT: scratch_load_i16 v2, off, s0 offset:2
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[0:1], v2
; GFX12-NEXT: s_endpgm
  %gep = getelementptr inbounds i16, ptr addrspace(5) %in, i32 1
  %load = load i16, ptr addrspace(5) %gep, align 4
  %ext = sext i16 %load to i32
  store i32 %ext, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_lo_s(ptr addrspace(5) inreg %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_zext_to_d16_lo_s:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff0000
; GFX10-NEXT: s_add_i32 s2, s2, 1
; GFX10-NEXT: scratch_load_ubyte_d16 v2, off, s2
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[0:1], v2
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_zext_to_d16_lo_s:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_mov_b32_e32 v2, 0xffff0000
; GFX11-NEXT: s_add_i32 s0, s0, 1
; GFX11-NEXT: scratch_load_d16_u8 v2, off, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_load_i8_zext_to_d16_lo_s:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v2, 0xffff0000
; GFX12-NEXT: scratch_load_d16_u8 v2, off, s0 offset:1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[0:1], v2
; GFX12-NEXT: s_endpgm
bb:
  %gep = getelementptr i8, ptr addrspace(5) %in, i64 1
  %load_lo = load i8, ptr addrspace(5) %gep
  %ext = zext i8 %load_lo to i16
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %ext, i32 0
  store <2 x i16> %result, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_lo_s(ptr addrspace(5) inreg %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_sext_to_d16_lo_s:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff0000
; GFX10-NEXT: s_add_i32 s2, s2, 1
; GFX10-NEXT: scratch_load_sbyte_d16 v2, off, s2
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[0:1], v2
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_sext_to_d16_lo_s:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_mov_b32_e32 v2, 0xffff0000
; GFX11-NEXT: s_add_i32 s0, s0, 1
; GFX11-NEXT: scratch_load_d16_i8 v2, off, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_load_i8_sext_to_d16_lo_s:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v2, 0xffff0000
; GFX12-NEXT: scratch_load_d16_i8 v2, off, s0 offset:1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[0:1], v2
; GFX12-NEXT: s_endpgm
bb:
  %gep = getelementptr i8, ptr addrspace(5) %in, i64 1
  %load_lo = load i8, ptr addrspace(5) %gep
  %ext = sext i8 %load_lo to i16
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %ext, i32 0
  store <2 x i16> %result, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i16_to_d16_lo_s(ptr addrspace(5) inreg %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i16_to_d16_lo_s:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff0000
; GFX10-NEXT: s_add_i32 s2, s2, 2
; GFX10-NEXT: scratch_load_short_d16 v2, off, s2
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[0:1], v2
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i16_to_d16_lo_s:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_mov_b32_e32 v2, 0xffff0000
; GFX11-NEXT: s_add_i32 s0, s0, 2
; GFX11-NEXT: scratch_load_d16_b16 v2, off, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_load_i16_to_d16_lo_s:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v2, 0xffff0000
; GFX12-NEXT: scratch_load_d16_b16 v2, off, s0 offset:2
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[0:1], v2
; GFX12-NEXT: s_endpgm
bb:
  %gep = getelementptr i16, ptr addrspace(5) %in, i64 1
  %load_lo = load i16, ptr addrspace(5) %gep
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %load_lo, i32 0
  store <2 x i16> %result, ptr %out, align 4
  ret void
}


define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_hi_s(ptr addrspace(5) inreg %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_zext_to_d16_hi_s:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_mov_b32_e32 v2, -1
; GFX10-NEXT: s_add_i32 s2, s2, 1
; GFX10-NEXT: scratch_load_ubyte_d16_hi v2, off, s2
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[0:1], v2
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_zext_to_d16_hi_s:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_mov_b32_e32 v2, -1
; GFX11-NEXT: s_add_i32 s0, s0, 1
; GFX11-NEXT: scratch_load_d16_hi_u8 v2, off, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_load_i8_zext_to_d16_hi_s:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v2, -1
; GFX12-NEXT: scratch_load_d16_hi_u8 v2, off, s0 offset:1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[0:1], v2
; GFX12-NEXT: s_endpgm
bb:
  %gep = getelementptr i8, ptr addrspace(5) %in, i64 1
  %load_lo = load i8, ptr addrspace(5) %gep
  %ext = zext i8 %load_lo to i16
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %ext, i32 1
  store <2 x i16> %result, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_hi_s(ptr addrspace(5) inreg %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_sext_to_d16_hi_s:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_mov_b32_e32 v2, -1
; GFX10-NEXT: s_add_i32 s2, s2, 1
; GFX10-NEXT: scratch_load_sbyte_d16_hi v2, off, s2
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[0:1], v2
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_sext_to_d16_hi_s:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_mov_b32_e32 v2, -1
; GFX11-NEXT: s_add_i32 s0, s0, 1
; GFX11-NEXT: scratch_load_d16_hi_i8 v2, off, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_load_i8_sext_to_d16_hi_s:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v2, -1
; GFX12-NEXT: scratch_load_d16_hi_i8 v2, off, s0 offset:1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[0:1], v2
; GFX12-NEXT: s_endpgm
bb:
  %gep = getelementptr i8, ptr addrspace(5) %in, i64 1
  %load_lo = load i8, ptr addrspace(5) %gep
  %ext = sext i8 %load_lo to i16
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %ext, i32 1
  store <2 x i16> %result, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i16_to_d16_hi_s(ptr addrspace(5) inreg %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i16_to_d16_hi_s:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_mov_b32_e32 v2, -1
; GFX10-NEXT: s_add_i32 s2, s2, 2
; GFX10-NEXT: scratch_load_short_d16_hi v2, off, s2
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[0:1], v2
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i16_to_d16_hi_s:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_mov_b32_e32 v2, -1
; GFX11-NEXT: s_add_i32 s0, s0, 2
; GFX11-NEXT: scratch_load_d16_hi_b16 v2, off, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_load_i16_to_d16_hi_s:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v2, -1
; GFX12-NEXT: scratch_load_d16_hi_b16 v2, off, s0 offset:2
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[0:1], v2
; GFX12-NEXT: s_endpgm
bb:
  %gep = getelementptr i16, ptr addrspace(5) %in, i64 1
  %load_lo = load i16, ptr addrspace(5) %gep
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %load_lo, i32 1
  store <2 x i16> %result, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_store_b8_from_d16_hi_s(ptr %in, ptr addrspace(5) inreg %out) {
; GFX10-LABEL: test_scratch_store_b8_from_d16_hi_s:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: flat_load_dword v0, v[0:1]
; GFX10-NEXT: s_add_i32 s2, s2, 4
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: scratch_store_byte_d16_hi off, v0, s2
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_store_b8_from_d16_hi_s:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: flat_load_b32 v0, v[0:1]
; GFX11-NEXT: s_add_i32 s0, s0, 4
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: scratch_store_d16_hi_b8 off, v0, s0
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_store_b8_from_d16_hi_s:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: flat_load_b32 v0, v[0:1]
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: scratch_store_d16_hi_b8 off, v0, s0 offset:4
; GFX12-NEXT: s_endpgm
bb:
  %load = load <4 x i8>, ptr %in
  %element = extractelement <4 x i8> %load, i32 2
  %gep = getelementptr <4 x i8>, ptr addrspace(5) %out, i64 1
  store i8 %element, ptr addrspace(5) %gep, align 4
  ret void
}

define amdgpu_ps void @test_scratch_store_b16_from_d16_hi_s(ptr %in, ptr addrspace(5) inreg %out) {
; GFX10-LABEL: test_scratch_store_b16_from_d16_hi_s:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: flat_load_dword v0, v[0:1]
; GFX10-NEXT: s_add_i32 s2, s2, 2
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: scratch_store_short_d16_hi off, v0, s2
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_store_b16_from_d16_hi_s:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: flat_load_b32 v0, v[0:1]
; GFX11-NEXT: s_add_i32 s0, s0, 2
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: scratch_store_d16_hi_b16 off, v0, s0
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_store_b16_from_d16_hi_s:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: flat_load_b32 v0, v[0:1]
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: scratch_store_d16_hi_b16 off, v0, s0 offset:2
; GFX12-NEXT: s_endpgm
bb:
  %load = load <2 x i16>, ptr %in
  %element = extractelement <2 x i16> %load, i32 1
  %gep = getelementptr <2 x i8>, ptr addrspace(5) %out, i64 1
  store i16 %element, ptr addrspace(5) %gep, align 4
  ret void
}




; sgpr + vgpr offset

define amdgpu_ps void @test_scratch_load_i8_zext_svs(ptr addrspace(5) inreg %in, i32 %voffset, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_zext_svs:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, s2
; GFX10-NEXT: scratch_load_ubyte v0, v0, off offset:1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_zext_svs:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_lshl_add_u32 v0, v0, 2, s0
; GFX11-NEXT: scratch_load_u8 v0, v0, off offset:1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v0
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_load_i8_zext_svs:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: scratch_load_u8 v0, v0, s0 offset:1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[1:2], v0
; GFX12-NEXT: s_endpgm
  %voffset4 = mul i32 %voffset, 4
  %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4
  %gep = getelementptr inbounds i8, ptr addrspace(5) %gep0, i32 1
  %load = load i8, ptr addrspace(5) %gep, align 4
  %ext = zext i8 %load to i32
  store i32 %ext, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i8_sext_svs(ptr addrspace(5) inreg %in, i32 %voffset, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_sext_svs:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, s2
; GFX10-NEXT: scratch_load_sbyte v0, v0, off offset:1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_sext_svs:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_lshl_add_u32 v0, v0, 2, s0
; GFX11-NEXT: scratch_load_i8 v0, v0, off offset:1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v0
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_load_i8_sext_svs:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: scratch_load_i8 v0, v0, s0 offset:1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[1:2], v0
; GFX12-NEXT: s_endpgm
  %voffset4 = mul i32 %voffset, 4
  %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4
  %gep = getelementptr inbounds i8, ptr addrspace(5) %gep0, i32 1
  %load = load i8, ptr addrspace(5) %gep, align 4
  %ext = sext i8 %load to i32
  store i32 %ext, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i16_zext_svs(ptr addrspace(5) inreg %in, i32 %voffset, ptr %out) {
; GFX10-LABEL: test_scratch_load_i16_zext_svs:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, s2
; GFX10-NEXT: scratch_load_ushort v0, v0, off offset:2
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i16_zext_svs:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_lshl_add_u32 v0, v0, 2, s0
; GFX11-NEXT: scratch_load_u16 v0, v0, off offset:2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v0
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_load_i16_zext_svs:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: scratch_load_u16 v0, v0, s0 offset:2
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[1:2], v0
; GFX12-NEXT: s_endpgm
  %voffset4 = mul i32 %voffset, 4
  %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4
  %gep = getelementptr inbounds i16, ptr addrspace(5) %gep0, i32 1
  %load = load i16, ptr addrspace(5) %gep, align 4
  %ext = zext i16 %load to i32
  store i32 %ext, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i16_sext_svs(ptr addrspace(5) inreg %in, i32 %voffset, ptr %out) {
; GFX10-LABEL: test_scratch_load_i16_sext_svs:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, s2
; GFX10-NEXT: scratch_load_sshort v0, v0, off offset:2
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i16_sext_svs:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_lshl_add_u32 v0, v0, 2, s0
; GFX11-NEXT: scratch_load_i16 v0, v0, off offset:2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v0
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_load_i16_sext_svs:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: scratch_load_i16 v0, v0, s0 offset:2
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[1:2], v0
; GFX12-NEXT: s_endpgm
  %voffset4 = mul i32 %voffset, 4
  %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4
  %gep = getelementptr inbounds i16, ptr addrspace(5) %gep0, i32 1
  %load = load i16, ptr addrspace(5) %gep, align 4
  %ext = sext i16 %load to i32
  store i32 %ext, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_lo_svs(ptr addrspace(5) inreg %in, i32 %voffset, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_zext_to_d16_lo_svs:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff0000
; GFX10-NEXT: v_add3_u32 v0, s2, v0, 1
; GFX10-NEXT: scratch_load_ubyte_d16 v3, v0, off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v3
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_zext_to_d16_lo_svs:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_dual_mov_b32 v3, 0xffff0000 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v0, s0, v0, 1
; GFX11-NEXT: scratch_load_d16_u8 v3, v0, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v3
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_load_i8_zext_to_d16_lo_svs:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_dual_mov_b32 v3, 0xffff0000 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT: scratch_load_d16_u8 v3, v0, s0 offset:1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[1:2], v3
; GFX12-NEXT: s_endpgm
bb:
  %voffset4 = mul i32 %voffset, 4
  %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4
  %gep = getelementptr i8, ptr addrspace(5) %gep0, i64 1
  %load_lo = load i8, ptr addrspace(5) %gep
  %ext = zext i8 %load_lo to i16
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %ext, i32 0
  store <2 x i16> %result, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_lo_svs(ptr addrspace(5) inreg %in, i32 %voffset, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_sext_to_d16_lo_svs:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff0000
; GFX10-NEXT: v_add3_u32 v0, s2, v0, 1
; GFX10-NEXT: scratch_load_sbyte_d16 v3, v0, off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v3
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_sext_to_d16_lo_svs:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_dual_mov_b32 v3, 0xffff0000 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v0, s0, v0, 1
; GFX11-NEXT: scratch_load_d16_i8 v3, v0, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v3
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_load_i8_sext_to_d16_lo_svs:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_dual_mov_b32 v3, 0xffff0000 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT: scratch_load_d16_i8 v3, v0, s0 offset:1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[1:2], v3
; GFX12-NEXT: s_endpgm
bb:
  %voffset4 = mul i32 %voffset, 4
  %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4
  %gep = getelementptr i8, ptr addrspace(5) %gep0, i64 1
  %load_lo = load i8, ptr addrspace(5) %gep
  %ext = sext i8 %load_lo to i16
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %ext, i32 0
  store <2 x i16> %result, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i16_to_d16_lo_svs(ptr addrspace(5) inreg %in, i32 %voffset, ptr %out) {
; GFX10-LABEL: test_scratch_load_i16_to_d16_lo_svs:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff0000
; GFX10-NEXT: v_add3_u32 v0, s2, v0, 2
; GFX10-NEXT: scratch_load_short_d16 v3, v0, off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v3
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i16_to_d16_lo_svs:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_dual_mov_b32 v3, 0xffff0000 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v0, s0, v0, 2
; GFX11-NEXT: scratch_load_d16_b16 v3, v0, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v3
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_load_i16_to_d16_lo_svs:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_dual_mov_b32 v3, 0xffff0000 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT: scratch_load_d16_b16 v3, v0, s0 offset:2
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[1:2], v3
; GFX12-NEXT: s_endpgm
bb:
  %voffset4 = mul i32 %voffset, 4
  %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4
  %gep = getelementptr i16, ptr addrspace(5) %gep0, i64 1
  %load_lo = load i16, ptr addrspace(5) %gep
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %load_lo, i32 0
  store <2 x i16> %result, ptr %out, align 4
  ret void
}


define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_hi_svs(ptr addrspace(5) inreg %in, i32 %voffset, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_zext_to_d16_hi_svs:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v3, -1
; GFX10-NEXT: v_add3_u32 v0, s2, v0, 1
; GFX10-NEXT: scratch_load_ubyte_d16_hi v3, v0, off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v3
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_zext_to_d16_hi_svs:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_dual_mov_b32 v3, -1 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v0, s0, v0, 1
; GFX11-NEXT: scratch_load_d16_hi_u8 v3, v0, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v3
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_load_i8_zext_to_d16_hi_svs:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_dual_mov_b32 v3, -1 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT: scratch_load_d16_hi_u8 v3, v0, s0 offset:1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[1:2], v3
; GFX12-NEXT: s_endpgm
bb:
  %voffset4 = mul i32 %voffset, 4
  %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4
  %gep = getelementptr i8, ptr addrspace(5) %gep0, i64 1
  %load_lo = load i8, ptr addrspace(5) %gep
  %ext = zext i8 %load_lo to i16
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %ext, i32 1
  store <2 x i16> %result, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_hi_svs(ptr addrspace(5) inreg %in, i32 %voffset, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_sext_to_d16_hi_svs:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v3, -1
; GFX10-NEXT: v_add3_u32 v0, s2, v0, 1
; GFX10-NEXT: scratch_load_sbyte_d16_hi v3, v0, off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v3
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_sext_to_d16_hi_svs:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_dual_mov_b32 v3, -1 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v0, s0, v0, 1
; GFX11-NEXT: scratch_load_d16_hi_i8 v3, v0, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v3
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_load_i8_sext_to_d16_hi_svs:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_dual_mov_b32 v3, -1 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT: scratch_load_d16_hi_i8 v3, v0, s0 offset:1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[1:2], v3
; GFX12-NEXT: s_endpgm
bb:
  %voffset4 = mul i32 %voffset, 4
  %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4
  %gep = getelementptr i8, ptr addrspace(5) %gep0, i64 1
  %load_lo = load i8, ptr addrspace(5) %gep
  %ext = sext i8 %load_lo to i16
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %ext, i32 1
  store <2 x i16> %result, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i16_to_d16_hi_svs(ptr addrspace(5) inreg %in, i32 %voffset, ptr %out) {
; GFX10-LABEL: test_scratch_load_i16_to_d16_hi_svs:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v3, -1
; GFX10-NEXT: v_add3_u32 v0, s2, v0, 2
; GFX10-NEXT: scratch_load_short_d16_hi v3, v0, off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v3
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i16_to_d16_hi_svs:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_dual_mov_b32 v3, -1 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v0, s0, v0, 2
; GFX11-NEXT: scratch_load_d16_hi_b16 v3, v0, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v3
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_load_i16_to_d16_hi_svs:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_dual_mov_b32 v3, -1 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT: scratch_load_d16_hi_b16 v3, v0, s0 offset:2
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[1:2], v3
; GFX12-NEXT: s_endpgm
bb:
  %voffset4 = mul i32 %voffset, 4
  %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4
  %gep = getelementptr i16, ptr addrspace(5) %gep0, i64 1
  %load_lo = load i16, ptr addrspace(5) %gep
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %load_lo, i32 1
  store <2 x i16> %result, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_store_b8_from_d16_hi_svs(ptr %in, ptr addrspace(5) inreg %out, i32 %voffset) {
; GFX10-LABEL: test_scratch_store_b8_from_d16_hi_svs:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: flat_load_dword v0, v[0:1]
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v2
; GFX10-NEXT: v_add3_u32 v1, s2, v1, 4
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: scratch_store_byte_d16_hi v1, v0, off
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_store_b8_from_d16_hi_svs:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: flat_load_b32 v0, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v1, s0, v1, 4
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: scratch_store_d16_hi_b8 v1, v0, off
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_store_b8_from_d16_hi_svs:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: flat_load_b32 v0, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v2
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: scratch_store_d16_hi_b8 v1, v0, s0 offset:4
; GFX12-NEXT: s_endpgm
bb:
  %load = load <4 x i8>, ptr %in
  %element = extractelement <4 x i8> %load, i32 2
  %voffset4 = mul i32 %voffset, 4
  %gep0 = getelementptr inbounds i8, ptr addrspace(5) %out, i32 %voffset4
  %gep = getelementptr <4 x i8>, ptr addrspace(5) %gep0, i64 1
  store i8 %element, ptr addrspace(5) %gep, align 4
  ret void
}

define amdgpu_ps void @test_scratch_store_b16_from_d16_hi_svs(ptr %in, ptr addrspace(5) inreg %out, i32 %voffset) {
; GFX10-LABEL: test_scratch_store_b16_from_d16_hi_svs:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: flat_load_dword v0, v[0:1]
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v2
; GFX10-NEXT: v_add3_u32 v1, s2, v1, 2
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: scratch_store_short_d16_hi v1, v0, off
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_store_b16_from_d16_hi_svs:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: flat_load_b32 v0, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v1, s0, v1, 2
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: scratch_store_d16_hi_b16 v1, v0, off
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_store_b16_from_d16_hi_svs:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: flat_load_b32 v0, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v2
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: scratch_store_d16_hi_b16 v1, v0, s0 offset:2
; GFX12-NEXT: s_endpgm
bb:
  %load = load <2 x i16>, ptr %in
  %element = extractelement <2 x i16> %load, i32 1
  %voffset4 = mul i32 %voffset, 4
  %gep0 = getelementptr inbounds i8, ptr addrspace(5) %out, i32 %voffset4
  %gep = getelementptr <2 x i8>, ptr addrspace(5) %gep0, i64 1
  store i16 %element, ptr addrspace(5) %gep, align 4
  ret void
}