; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck --check-prefix=GFX8 %s
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s

; Dynamic-index insertelement into a <2 x i16> vector, compiled with
; -global-isel for the five -mcpu targets of the llc invocations above.
; Every function below loads the vector through %ptr, inserts %val at %idx and
; stores the result to a null addrspace(1) pointer.
;
; Function names follow insertelement_<vec>_v2i16_<val>_<idx>:
;   <vec>  "s" = %ptr is an inreg addrspace(4) pointer,
;          "v" = %ptr is a non-inreg addrspace(1) pointer;
;   <val>  "s" = %val is inreg, "v" = it is not;
;   <idx>  "s" = %idx is inreg, "v" = it is not.
; In the expected code the inreg arguments are read from s registers and the
; others from v registers.
;
; All expected sequences treat the vector as a single 32-bit word:
; (idx & 1) << 4 is the bit offset of the selected element, 0xffff shifted by
; that offset is cleared from the loaded word, and the 16-bit value shifted by
; the same offset is OR-ed in.
;
; The assertion lines inside each function are generated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.

; Pointer, value and index are all inreg.
define amdgpu_ps void @insertelement_s_v2i16_s_s(ptr addrspace(4) inreg %ptr, i16 inreg %val, i32 inreg %idx) {
; GFX9-LABEL: insertelement_s_v2i16_s_s:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX9-NEXT: s_and_b32 s1, s5, 1
; GFX9-NEXT: s_lshl_b32 s1, s1, 4
; GFX9-NEXT: s_and_b32 s2, s4, 0xffff
; GFX9-NEXT: s_lshl_b32 s2, s2, s1
; GFX9-NEXT: s_lshl_b32 s1, 0xffff, s1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_andn2_b32 s0, s0, s1
; GFX9-NEXT: s_or_b32 s0, s0, s2
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
;
; GFX8-LABEL: insertelement_s_v2i16_s_s:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_and_b32 s1, s5, 1
; GFX8-NEXT: s_lshl_b32 s1, s1, 4
; GFX8-NEXT: s_and_b32 s2, s4, 0xffff
; GFX8-NEXT: s_lshl_b32 s2, s2, s1
; GFX8-NEXT: s_lshl_b32 s1, 0xffff, s1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_andn2_b32 s0, s0, s1
; GFX8-NEXT: s_or_b32 s0, s0, s2
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX7-LABEL: insertelement_s_v2i16_s_s:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX7-NEXT: s_and_b32 s1, s5, 1
; GFX7-NEXT: s_lshl_b32 s1, s1, 4
; GFX7-NEXT: s_and_b32 s2, s4, 0xffff
; GFX7-NEXT: s_lshl_b32 s2, s2, s1
; GFX7-NEXT: s_lshl_b32 s1, 0xffff, s1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_andn2_b32 s0, s0, s1
; GFX7-NEXT: s_or_b32 s2, s0, s2
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX10-LABEL: insertelement_s_v2i16_s_s:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-NEXT: s_and_b32 s1, s5, 1
; GFX10-NEXT: s_and_b32 s2, s4, 0xffff
; GFX10-NEXT: s_lshl_b32 s1, s1, 4
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_lshl_b32 s3, 0xffff, s1
; GFX10-NEXT: s_lshl_b32 s1, s2, s1
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_andn2_b32 s0, s0, s3
; GFX10-NEXT: s_or_b32 s0, s0, s1
; GFX10-NEXT: v_mov_b32_e32 v2, s0
; GFX10-NEXT: global_store_dword v[0:1], v2, off
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: insertelement_s_v2i16_s_s:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-NEXT: s_and_b32 s1, s5, 1
; GFX11-NEXT: s_and_b32 s2, s4, 0xffff
; GFX11-NEXT: s_lshl_b32 s1, s1, 4
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_lshl_b32 s3, 0xffff, s1
; GFX11-NEXT: s_lshl_b32 s1, s2, s1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_and_not1_b32 s0, s0, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s0, s0, s1
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: global_store_b32 v[0:1], v2, off
; GFX11-NEXT: s_endpgm
  %vec = load <2 x i16>, ptr addrspace(4) %ptr
  %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx
  store <2 x i16> %insert, ptr addrspace(1) null
  ret void
}

; Pointer is a non-inreg addrspace(1) pointer; value and index are inreg.
define amdgpu_ps void @insertelement_v_v2i16_s_s(ptr addrspace(1) %ptr, i16 inreg %val, i32 inreg %idx) {
; GFX9-LABEL: insertelement_v_v2i16_s_s:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dword v2, v[0:1], off
; GFX9-NEXT: s_and_b32 s0, s3, 1
; GFX9-NEXT: s_lshl_b32 s0, s0, 4
; GFX9-NEXT: s_and_b32 s1, s2, 0xffff
; GFX9-NEXT: s_lshl_b32 s1, s1, s0
; GFX9-NEXT: s_lshl_b32 s0, 0xffff, s0
; GFX9-NEXT: s_not_b32 s0, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_or_b32 v2, v2, s0, v3
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
;
; GFX8-LABEL: insertelement_v_v2i16_s_s:
; GFX8: ; %bb.0:
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: s_and_b32 s0, s3, 1
; GFX8-NEXT: s_and_b32 s1, s2, 0xffff
; GFX8-NEXT: s_lshl_b32 s0, s0, 4
; GFX8-NEXT: s_lshl_b32 s1, s1, s0
; GFX8-NEXT: s_lshl_b32 s0, 0xffff, s0
; GFX8-NEXT: s_not_b32 s0, s0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v2, s0, v0
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_or_b32_e32 v2, s1, v2
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX7-LABEL: insertelement_v_v2i16_s_s:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_and_b32 s0, s3, 1
; GFX7-NEXT: s_and_b32 s1, s2, 0xffff
; GFX7-NEXT: s_lshl_b32 s0, s0, 4
; GFX7-NEXT: s_lshl_b32 s1, s1, s0
; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0
; GFX7-NEXT: s_not_b32 s0, s0
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v0, s0, v0
; GFX7-NEXT: v_or_b32_e32 v0, s1, v0
; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
;
; GFX10-LABEL: insertelement_v_v2i16_s_s:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dword v2, v[0:1], off
; GFX10-NEXT: s_and_b32 s0, s3, 1
; GFX10-NEXT: s_and_b32 s1, s2, 0xffff
; GFX10-NEXT: s_lshl_b32 s0, s0, 4
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_lshl_b32 s2, 0xffff, s0
; GFX10-NEXT: s_lshl_b32 s0, s1, s0
; GFX10-NEXT: s_not_b32 s1, s2
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_and_or_b32 v2, v2, s1, s0
; GFX10-NEXT: global_store_dword v[0:1], v2, off
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: insertelement_v_v2i16_s_s:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b32 v2, v[0:1], off
; GFX11-NEXT: s_and_b32 s0, s3, 1
; GFX11-NEXT: s_and_b32 s1, s2, 0xffff
; GFX11-NEXT: s_lshl_b32 s0, s0, 4
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_lshl_b32 s2, 0xffff, s0
; GFX11-NEXT: s_lshl_b32 s0, s1, s0
; GFX11-NEXT: s_not_b32 s1, s2
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_and_or_b32 v2, v2, s1, s0
; GFX11-NEXT: global_store_b32 v[0:1], v2, off
; GFX11-NEXT: s_endpgm
  %vec = load <2 x i16>, ptr addrspace(1 ) %ptr
  %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx
  store <2 x i16> %insert, ptr addrspace(1) null
  ret void
}

; Pointer and index are inreg; value is not.
define amdgpu_ps void @insertelement_s_v2i16_v_s(ptr addrspace(4) inreg %ptr, i16 %val, i32 inreg %idx) {
; GFX9-LABEL: insertelement_s_v2i16_v_s:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX9-NEXT: s_and_b32 s1, s4, 1
; GFX9-NEXT: s_lshl_b32 s1, s1, 4
; GFX9-NEXT: s_lshl_b32 s2, 0xffff, s1
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_andn2_b32 s0, s0, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_lshl_or_b32 v2, v2, s1, v3
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
;
; GFX8-LABEL: insertelement_s_v2i16_v_s:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_and_b32 s1, s4, 1
; GFX8-NEXT: s_lshl_b32 s1, s1, 4
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: s_lshl_b32 s1, 0xffff, s1
; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_andn2_b32 s0, s0, s1
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_or_b32_e32 v2, s0, v2
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX7-LABEL: insertelement_s_v2i16_v_s:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX7-NEXT: s_and_b32 s1, s4, 1
; GFX7-NEXT: s_lshl_b32 s1, s1, 4
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v0, s1, v0
; GFX7-NEXT: s_lshl_b32 s1, 0xffff, s1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_andn2_b32 s0, s0, s1
; GFX7-NEXT: v_or_b32_e32 v0, s0, v0
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX10-LABEL: insertelement_s_v2i16_v_s:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-NEXT: s_and_b32 s1, s4, 1
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v0
; GFX10-NEXT: s_lshl_b32 s1, s1, 4
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_lshl_b32 s2, 0xffff, s1
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_andn2_b32 s0, s0, s2
; GFX10-NEXT: v_lshl_or_b32 v2, v2, s1, s0
; GFX10-NEXT: global_store_dword v[0:1], v2, off
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: insertelement_s_v2i16_v_s:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-NEXT: s_and_b32 s1, s4, 1
; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v0
; GFX11-NEXT: s_lshl_b32 s1, s1, 4
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_lshl_b32 s2, 0xffff, s1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_and_not1_b32 s0, s0, s2
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_lshl_or_b32 v2, v2, s1, s0
; GFX11-NEXT: global_store_b32 v[0:1], v2, off
; GFX11-NEXT: s_endpgm
  %vec = load <2 x i16>, ptr addrspace(4) %ptr
  %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx
  store <2 x i16> %insert, ptr addrspace(1) null
  ret void
}

; Pointer and value are inreg; index is not.
define amdgpu_ps void @insertelement_s_v2i16_s_v(ptr addrspace(4) inreg %ptr, i16 inreg %val, i32 %idx) {
; GFX9-LABEL: insertelement_s_v2i16_s_v:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GFX9-NEXT: s_and_b32 s1, s4, 0xffff
; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff
; GFX9-NEXT: v_lshlrev_b32_e64 v2, v0, s1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, v0, v1
; GFX9-NEXT: v_not_b32_e32 v3, v0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_and_or_b32 v2, s0, v3, v2
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
;
; GFX8-LABEL: insertelement_s_v2i16_s_v:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GFX8-NEXT: s_and_b32 s1, s4, 0xffff
; GFX8-NEXT: v_mov_b32_e32 v1, 0xffff
; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s1
; GFX8-NEXT: v_lshlrev_b32_e32 v0, v0, v1
; GFX8-NEXT: v_not_b32_e32 v0, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v3, s0, v0
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX7-LABEL: insertelement_s_v2i16_s_v:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GFX7-NEXT: s_and_b32 s1, s4, 0xffff
; GFX7-NEXT: v_lshl_b32_e32 v1, s1, v0
; GFX7-NEXT: v_lshl_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_not_b32_e32 v0, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v0, s0, v0
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX10-LABEL: insertelement_s_v2i16_s_v:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: s_and_b32 s1, s4, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GFX10-NEXT: v_lshlrev_b32_e64 v1, v0, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, s1
; GFX10-NEXT: v_not_b32_e32 v3, v1
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_and_or_b32 v2, s0, v3, v2
; GFX10-NEXT: global_store_dword v[0:1], v2, off
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: insertelement_s_v2i16_s_v:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11-NEXT: s_and_b32 s1, s4, 0xffff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GFX11-NEXT: v_lshlrev_b32_e64 v1, v0, 0xffff
; GFX11-NEXT: v_lshlrev_b32_e64 v2, v0, s1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_not_b32_e32 v3, v1
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_and_or_b32 v2, s0, v3, v2
; GFX11-NEXT: global_store_b32 v[0:1], v2, off
; GFX11-NEXT: s_endpgm
  %vec = load <2 x i16>, ptr addrspace(4) %ptr
  %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx
  store <2 x i16> %insert, ptr addrspace(1) null
  ret void
}

; Pointer is inreg; value and index are not.
define amdgpu_ps void @insertelement_s_v2i16_v_v(ptr addrspace(4) inreg %ptr, i16 %val, i32 %idx) {
; GFX9-LABEL: insertelement_s_v2i16_v_v:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1
; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, v1, v2
; GFX9-NEXT: v_not_b32_e32 v2, v0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_and_or_b32 v2, s0, v2, v3
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
;
; GFX8-LABEL: insertelement_s_v2i16_v_v:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: v_and_b32_e32 v1, 1, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1
; GFX8-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, v1, v2
; GFX8-NEXT: v_not_b32_e32 v0, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v2, s0, v0
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_or_b32_e32 v2, v2, v3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX7-LABEL: insertelement_s_v2i16_v_v:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX7-NEXT: v_and_b32_e32 v1, 1, v1
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0
; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1
; GFX7-NEXT: v_not_b32_e32 v1, v1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v1, s0, v1
; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX10-LABEL: insertelement_s_v2i16_v_v:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v1
; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: v_not_b32_e32 v2, v2
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_and_or_b32 v2, s0, v2, v3
; GFX10-NEXT: global_store_dword v[0:1], v2, off
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: insertelement_s_v2i16_v_v:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-NEXT: v_and_b32_e32 v1, 1, v1
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 4, v1
; GFX11-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_lshlrev_b32 v3, v1, v0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: v_not_b32_e32 v2, v2
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_and_or_b32 v2, s0, v2, v3
; GFX11-NEXT: global_store_b32 v[0:1], v2, off
; GFX11-NEXT: s_endpgm
  %vec = load <2 x i16>, ptr addrspace(4) %ptr
  %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx
  store <2 x i16> %insert, ptr addrspace(1) null
  ret void
}

; Only the value is inreg; pointer and index are not.
define amdgpu_ps void @insertelement_v_v2i16_s_v(ptr addrspace(1) %ptr, i16 inreg %val, i32 %idx) {
; GFX9-LABEL: insertelement_v_v2i16_s_v:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dword v3, v[0:1], off
; GFX9-NEXT: v_and_b32_e32 v0, 1, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GFX9-NEXT: s_and_b32 s0, s2, 0xffff
; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff
; GFX9-NEXT: v_lshlrev_b32_e64 v2, v0, s0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, v0, v1
; GFX9-NEXT: v_not_b32_e32 v4, v0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_or_b32 v2, v3, v4, v2
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
;
; GFX8-LABEL: insertelement_v_v2i16_s_v:
; GFX8: ; %bb.0:
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: v_and_b32_e32 v2, 1, v2
; GFX8-NEXT: v_mov_b32_e32 v1, 0xffff
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 4, v2
; GFX8-NEXT: v_lshlrev_b32_e32 v1, v2, v1
; GFX8-NEXT: s_and_b32 s0, s2, 0xffff
; GFX8-NEXT: v_not_b32_e32 v1, v1
; GFX8-NEXT: v_lshlrev_b32_e64 v3, v2, s0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v2, v0, v1
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_or_b32_e32 v2, v2, v3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX7-LABEL: insertelement_v_v2i16_s_v:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_and_b32_e32 v1, 1, v2
; GFX7-NEXT: s_and_b32 s0, s2, 0xffff
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1
; GFX7-NEXT: v_lshl_b32_e32 v2, s0, v1
; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1
; GFX7-NEXT: v_not_b32_e32 v1, v1
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v0, v0, v1
; GFX7-NEXT: v_or_b32_e32 v0, v0, v2
; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
;
; GFX10-LABEL: insertelement_v_v2i16_s_v:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dword v3, v[0:1], off
; GFX10-NEXT: v_and_b32_e32 v0, 1, v2
; GFX10-NEXT: s_and_b32 s0, s2, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GFX10-NEXT: v_lshlrev_b32_e64 v1, v0, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, s0
; GFX10-NEXT: v_not_b32_e32 v4, v1
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_and_or_b32 v2, v3, v4, v2
; GFX10-NEXT: global_store_dword v[0:1], v2, off
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: insertelement_v_v2i16_s_v:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b32 v3, v[0:1], off
; GFX11-NEXT: v_and_b32_e32 v0, 1, v2
; GFX11-NEXT: s_and_b32 s0, s2, 0xffff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GFX11-NEXT: v_lshlrev_b32_e64 v1, v0, 0xffff
; GFX11-NEXT: v_lshlrev_b32_e64 v2, v0, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_not_b32_e32 v4, v1
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_and_or_b32 v2, v3, v4, v2
; GFX11-NEXT: global_store_b32 v[0:1], v2, off
; GFX11-NEXT: s_endpgm
  %vec = load <2 x i16>, ptr addrspace(1) %ptr
  %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx
  store <2 x i16> %insert, ptr addrspace(1) null
  ret void
}

; Only the index is inreg; pointer and value are not.
define amdgpu_ps void @insertelement_v_v2i16_v_s(ptr addrspace(1) %ptr, i16 %val, i32 inreg %idx) {
; GFX9-LABEL: insertelement_v_v2i16_v_s:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dword v3, v[0:1], off
; GFX9-NEXT: s_and_b32 s0, s2, 1
; GFX9-NEXT: s_lshl_b32 s0, s0, 4
; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX9-NEXT: s_lshl_b32 s0, 0xffff, s0
; GFX9-NEXT: s_not_b32 s0, s0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_or_b32 v2, v3, s0, v2
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
;
; GFX8-LABEL: insertelement_v_v2i16_v_s:
; GFX8: ; %bb.0:
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: s_and_b32 s0, s2, 1
; GFX8-NEXT: s_lshl_b32 s0, s0, 4
; GFX8-NEXT: v_mov_b32_e32 v1, s0
; GFX8-NEXT: s_lshl_b32 s0, 0xffff, s0
; GFX8-NEXT: s_not_b32 s0, s0
; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v3, s0, v0
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX7-LABEL: insertelement_v_v2i16_v_s:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_and_b32 s0, s2, 1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX7-NEXT: s_lshl_b32 s0, s0, 4
; GFX7-NEXT: v_lshlrev_b32_e32 v1, s0, v1
; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0
; GFX7-NEXT: s_not_b32 s0, s0
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v0, s0, v0
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
;
; GFX10-LABEL: insertelement_v_v2i16_v_s:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dword v3, v[0:1], off
; GFX10-NEXT: s_and_b32 s0, s2, 1
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_lshl_b32 s0, s0, 4
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX10-NEXT: s_lshl_b32 s0, 0xffff, s0
; GFX10-NEXT: s_not_b32 s0, s0
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_and_or_b32 v2, v3, s0, v2
; GFX10-NEXT: global_store_dword v[0:1], v2, off
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: insertelement_v_v2i16_v_s:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b32 v3, v[0:1], off
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v2
; GFX11-NEXT: s_and_b32 s0, s2, 1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_lshl_b32 s0, s0, 4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v2, s0, v0
; GFX11-NEXT: s_lshl_b32 s0, 0xffff, s0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_not_b32 s0, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_and_or_b32 v2, v3, s0, v2
; GFX11-NEXT: global_store_b32 v[0:1], v2, off
; GFX11-NEXT: s_endpgm
  %vec = load <2 x i16>, ptr addrspace(1) %ptr
  %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx
  store <2 x i16> %insert, ptr addrspace(1) null
  ret void
}

; No argument is inreg.
define amdgpu_ps void @insertelement_v_v2i16_v_v(ptr addrspace(1) %ptr, i16 %val, i32 %idx) {
; GFX9-LABEL: insertelement_v_v2i16_v_v:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dword v4, v[0:1], off
; GFX9-NEXT: v_and_b32_e32 v0, 1, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff
; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, v0, v1
; GFX9-NEXT: v_not_b32_e32 v3, v0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_or_b32 v2, v4, v3, v2
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
;
; GFX8-LABEL: insertelement_v_v2i16_v_v:
; GFX8: ; %bb.0:
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: v_and_b32_e32 v3, 1, v3
; GFX8-NEXT: v_mov_b32_e32 v1, 0xffff
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 4, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v1, v3, v1
; GFX8-NEXT: v_not_b32_e32 v1, v1
; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v3, v0, v1
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX7-LABEL: insertelement_v_v2i16_v_v:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: v_and_b32_e32 v1, 1, v3
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1
; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2
; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1
; GFX7-NEXT: v_not_b32_e32 v1, v1
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v0, v0, v1
; GFX7-NEXT: v_or_b32_e32 v0, v0, v2
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX10-LABEL: insertelement_v_v2i16_v_v:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dword v4, v[0:1], off
; GFX10-NEXT: v_and_b32_e32 v0, 1, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GFX10-NEXT: v_lshlrev_b32_e64 v1, v0, 0xffff
; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX10-NEXT: v_not_b32_e32 v3, v1
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_and_or_b32 v2, v4, v3, v2
; GFX10-NEXT: global_store_dword v[0:1], v2, off
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: insertelement_v_v2i16_v_v:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b32 v4, v[0:1], off
; GFX11-NEXT: v_and_b32_e32 v0, 1, v3
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GFX11-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_lshlrev_b32 v3, v0, v1
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: v_not_b32_e32 v2, v2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_and_or_b32 v2, v4, v2, v3
; GFX11-NEXT: global_store_b32 v[0:1], v2, off
; GFX11-NEXT: s_endpgm
  %vec = load <2 x i16>, ptr addrspace(1) %ptr
  %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx
  store <2 x i16> %insert, ptr addrspace(1) null
  ret void
}

; FIXME: 3 element load/store legalization
; define amdgpu_ps void @insertelement_s_v3i16_s_s(ptr addrspace(4) inreg %ptr, i16 inreg %val, i32 inreg %idx) {
; %vec = load <3 x i16>, ptr addrspace(4) %ptr
; %insert = insertelement <3 x i16> %vec, i16 %val, i32 %idx
; store <3 x i16> %insert, ptr addrspace(1) null
; ret void
; }

; define amdgpu_ps void @insertelement_v_v3i16_s_s(ptr addrspace(1) %ptr, i16 inreg %val, i32 inreg %idx) {
748; %vec = load <3 x i16>, ptr addrspace(1 ) %ptr 749; %insert = insertelement <3 x i16> %vec, i16 %val, i32 %idx 750; store <3 x i16> %insert, ptr addrspace(1) null 751; ret void 752; } 753 754; define amdgpu_ps void @insertelement_s_v3i16_v_s(ptr addrspace(4) inreg %ptr, i16 %val, i32 inreg %idx) { 755; %vec = load <3 x i16>, ptr addrspace(4) %ptr 756; %insert = insertelement <3 x i16> %vec, i16 %val, i32 %idx 757; store <3 x i16> %insert, ptr addrspace(1) null 758; ret void 759; } 760 761; define amdgpu_ps void @insertelement_s_v3i16_s_v(ptr addrspace(4) inreg %ptr, i16 inreg %val, i32 %idx) { 762; %vec = load <3 x i16>, ptr addrspace(4) %ptr 763; %insert = insertelement <3 x i16> %vec, i16 %val, i32 %idx 764; store <3 x i16> %insert, ptr addrspace(1) null 765; ret void 766; } 767 768; define amdgpu_ps void @insertelement_s_v3i16_v_v(ptr addrspace(4) inreg %ptr, i16 %val, i32 %idx) { 769; %vec = load <3 x i16>, ptr addrspace(4) %ptr 770; %insert = insertelement <3 x i16> %vec, i16 %val, i32 %idx 771; store <3 x i16> %insert, ptr addrspace(1) null 772; ret void 773; } 774 775; define amdgpu_ps void @insertelement_v_v3i16_s_v(ptr addrspace(1) %ptr, i16 inreg %val, i32 %idx) { 776; %vec = load <3 x i16>, ptr addrspace(1) %ptr 777; %insert = insertelement <3 x i16> %vec, i16 %val, i32 %idx 778; store <3 x i16> %insert, ptr addrspace(1) null 779; ret void 780; } 781 782; define amdgpu_ps void @insertelement_v_v3i16_v_s(ptr addrspace(1) %ptr, i16 %val, i32 inreg %idx) { 783; %vec = load <3 x i16>, ptr addrspace(1) %ptr 784; %insert = insertelement <3 x i16> %vec, i16 %val, i32 %idx 785; store <3 x i16> %insert, ptr addrspace(1) null 786; ret void 787; } 788 789; define amdgpu_ps void @insertelement_v_v3i16_v_v(ptr addrspace(1) %ptr, i16 %val, i32 %idx) { 790; %vec = load <3 x i16>, ptr addrspace(1) %ptr 791; %insert = insertelement <3 x i16> %vec, i16 %val, i32 %idx 792; store <3 x i16> %insert, ptr addrspace(1) null 793; ret void 794; } 795 796define amdgpu_ps void 
@insertelement_v_v4i16_s_s(ptr addrspace(1) %ptr, i16 inreg %val, i32 inreg %idx) { 797; GFX9-LABEL: insertelement_v_v4i16_s_s: 798; GFX9: ; %bb.0: 799; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 800; GFX9-NEXT: s_and_b32 s1, s3, 1 801; GFX9-NEXT: s_lshr_b32 s0, s3, 1 802; GFX9-NEXT: s_and_b32 s2, s2, 0xffff 803; GFX9-NEXT: s_lshl_b32 s1, s1, 4 804; GFX9-NEXT: s_lshl_b32 s2, s2, s1 805; GFX9-NEXT: s_lshl_b32 s1, 0xffff, s1 806; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 807; GFX9-NEXT: s_not_b32 s1, s1 808; GFX9-NEXT: v_mov_b32_e32 v4, s2 809; GFX9-NEXT: v_mov_b32_e32 v2, 0 810; GFX9-NEXT: v_mov_b32_e32 v3, 0 811; GFX9-NEXT: s_waitcnt vmcnt(0) 812; GFX9-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc 813; GFX9-NEXT: v_and_or_b32 v4, v5, s1, v4 814; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 815; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] 816; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 817; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off 818; GFX9-NEXT: s_endpgm 819; 820; GFX8-LABEL: insertelement_v_v4i16_s_s: 821; GFX8: ; %bb.0: 822; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 823; GFX8-NEXT: s_and_b32 s1, s3, 1 824; GFX8-NEXT: s_lshr_b32 s0, s3, 1 825; GFX8-NEXT: s_and_b32 s2, s2, 0xffff 826; GFX8-NEXT: s_lshl_b32 s1, s1, 4 827; GFX8-NEXT: s_lshl_b32 s2, s2, s1 828; GFX8-NEXT: s_lshl_b32 s1, 0xffff, s1 829; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 830; GFX8-NEXT: s_not_b32 s1, s1 831; GFX8-NEXT: v_mov_b32_e32 v2, 0 832; GFX8-NEXT: v_mov_b32_e32 v3, 0 833; GFX8-NEXT: s_waitcnt vmcnt(0) 834; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc 835; GFX8-NEXT: v_and_b32_e32 v4, s1, v4 836; GFX8-NEXT: v_or_b32_e32 v4, s2, v4 837; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 838; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] 839; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 840; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 841; GFX8-NEXT: s_endpgm 842; 843; GFX7-LABEL: insertelement_v_v4i16_s_s: 844; GFX7: ; %bb.0: 845; GFX7-NEXT: s_mov_b32 s6, 0 846; GFX7-NEXT: s_mov_b32 
s7, 0xf000 847; GFX7-NEXT: s_mov_b64 s[4:5], 0 848; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 849; GFX7-NEXT: s_and_b32 s1, s3, 1 850; GFX7-NEXT: s_lshr_b32 s0, s3, 1 851; GFX7-NEXT: s_and_b32 s2, s2, 0xffff 852; GFX7-NEXT: s_lshl_b32 s1, s1, 4 853; GFX7-NEXT: s_lshl_b32 s2, s2, s1 854; GFX7-NEXT: s_lshl_b32 s1, 0xffff, s1 855; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 856; GFX7-NEXT: s_not_b32 s1, s1 857; GFX7-NEXT: s_mov_b64 s[4:5], 0 858; GFX7-NEXT: s_mov_b32 s6, -1 859; GFX7-NEXT: s_waitcnt vmcnt(0) 860; GFX7-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc 861; GFX7-NEXT: v_and_b32_e32 v2, s1, v2 862; GFX7-NEXT: v_or_b32_e32 v2, s2, v2 863; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 864; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 865; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 866; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 867; GFX7-NEXT: s_endpgm 868; 869; GFX10-LABEL: insertelement_v_v4i16_s_s: 870; GFX10: ; %bb.0: 871; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 872; GFX10-NEXT: s_lshr_b32 s0, s3, 1 873; GFX10-NEXT: s_and_b32 s1, s3, 1 874; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1 875; GFX10-NEXT: s_lshl_b32 s1, s1, 4 876; GFX10-NEXT: s_and_b32 s2, s2, 0xffff 877; GFX10-NEXT: s_lshl_b32 s3, 0xffff, s1 878; GFX10-NEXT: s_lshl_b32 s1, s2, s1 879; GFX10-NEXT: s_not_b32 s2, s3 880; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s0, 0 881; GFX10-NEXT: s_waitcnt vmcnt(0) 882; GFX10-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc_lo 883; GFX10-NEXT: v_and_or_b32 v4, v2, s2, s1 884; GFX10-NEXT: v_mov_b32_e32 v2, 0 885; GFX10-NEXT: v_mov_b32_e32 v3, 0 886; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 887; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo 888; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off 889; GFX10-NEXT: s_endpgm 890; 891; GFX11-LABEL: insertelement_v_v4i16_s_s: 892; GFX11: ; %bb.0: 893; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off 894; GFX11-NEXT: s_lshr_b32 s0, s3, 1 895; GFX11-NEXT: s_and_b32 s1, s3, 1 896; GFX11-NEXT: 
v_cmp_eq_u32_e64 vcc_lo, s0, 1 897; GFX11-NEXT: s_lshl_b32 s1, s1, 4 898; GFX11-NEXT: s_and_b32 s2, s2, 0xffff 899; GFX11-NEXT: s_lshl_b32 s3, 0xffff, s1 900; GFX11-NEXT: s_lshl_b32 s1, s2, s1 901; GFX11-NEXT: s_not_b32 s2, s3 902; GFX11-NEXT: v_cmp_eq_u32_e64 s0, s0, 0 903; GFX11-NEXT: s_waitcnt vmcnt(0) 904; GFX11-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc_lo 905; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 906; GFX11-NEXT: v_and_or_b32 v4, v2, s2, s1 907; GFX11-NEXT: v_mov_b32_e32 v2, 0 908; GFX11-NEXT: v_mov_b32_e32 v3, 0 909; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo 910; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 911; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off 912; GFX11-NEXT: s_endpgm 913 %vec = load <4 x i16>, ptr addrspace(1) %ptr 914 %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx 915 store <4 x i16> %insert, ptr addrspace(1) null 916 ret void 917} 918 919define amdgpu_ps void @insertelement_s_v4i16_v_s(ptr addrspace(4) inreg %ptr, i16 %val, i32 inreg %idx) { 920; GFX9-LABEL: insertelement_s_v4i16_v_s: 921; GFX9: ; %bb.0: 922; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 923; GFX9-NEXT: s_lshr_b32 s2, s4, 1 924; GFX9-NEXT: s_cmp_eq_u32 s2, 1 925; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 926; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 927; GFX9-NEXT: s_waitcnt lgkmcnt(0) 928; GFX9-NEXT: s_cselect_b32 s3, s1, s0 929; GFX9-NEXT: s_and_b32 s4, s4, 1 930; GFX9-NEXT: s_lshl_b32 s4, s4, 4 931; GFX9-NEXT: s_lshl_b32 s5, 0xffff, s4 932; GFX9-NEXT: s_andn2_b32 s3, s3, s5 933; GFX9-NEXT: v_mov_b32_e32 v1, s3 934; GFX9-NEXT: v_lshl_or_b32 v4, v0, s4, v1 935; GFX9-NEXT: v_mov_b32_e32 v0, s0 936; GFX9-NEXT: v_mov_b32_e32 v1, s1 937; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 938; GFX9-NEXT: v_mov_b32_e32 v2, 0 939; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 940; GFX9-NEXT: v_mov_b32_e32 v3, 0 941; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 942; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off 943; 
GFX9-NEXT: s_endpgm 944; 945; GFX8-LABEL: insertelement_s_v4i16_v_s: 946; GFX8: ; %bb.0: 947; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 948; GFX8-NEXT: s_lshr_b32 s2, s4, 1 949; GFX8-NEXT: s_cmp_eq_u32 s2, 1 950; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 951; GFX8-NEXT: v_mov_b32_e32 v2, 0 952; GFX8-NEXT: s_waitcnt lgkmcnt(0) 953; GFX8-NEXT: s_cselect_b32 s3, s1, s0 954; GFX8-NEXT: s_and_b32 s4, s4, 1 955; GFX8-NEXT: s_lshl_b32 s4, s4, 4 956; GFX8-NEXT: v_mov_b32_e32 v1, s4 957; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s4 958; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 959; GFX8-NEXT: s_andn2_b32 s3, s3, s4 960; GFX8-NEXT: v_or_b32_e32 v4, s3, v0 961; GFX8-NEXT: v_mov_b32_e32 v0, s0 962; GFX8-NEXT: v_mov_b32_e32 v1, s1 963; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 964; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 965; GFX8-NEXT: v_mov_b32_e32 v3, 0 966; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 967; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 968; GFX8-NEXT: s_endpgm 969; 970; GFX7-LABEL: insertelement_s_v4i16_v_s: 971; GFX7: ; %bb.0: 972; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 973; GFX7-NEXT: s_lshr_b32 s2, s4, 1 974; GFX7-NEXT: s_cmp_eq_u32 s2, 1 975; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 976; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 977; GFX7-NEXT: s_waitcnt lgkmcnt(0) 978; GFX7-NEXT: s_cselect_b32 s3, s1, s0 979; GFX7-NEXT: s_and_b32 s4, s4, 1 980; GFX7-NEXT: s_lshl_b32 s4, s4, 4 981; GFX7-NEXT: v_lshlrev_b32_e32 v0, s4, v0 982; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s4 983; GFX7-NEXT: s_andn2_b32 s3, s3, s4 984; GFX7-NEXT: v_or_b32_e32 v2, s3, v0 985; GFX7-NEXT: v_mov_b32_e32 v0, s0 986; GFX7-NEXT: v_mov_b32_e32 v1, s1 987; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 988; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 989; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 990; GFX7-NEXT: s_mov_b64 s[0:1], 0 991; GFX7-NEXT: s_mov_b32 s2, -1 992; GFX7-NEXT: s_mov_b32 s3, 0xf000 993; GFX7-NEXT: 
buffer_store_dwordx2 v[0:1], off, s[0:3], 0 994; GFX7-NEXT: s_endpgm 995; 996; GFX10-LABEL: insertelement_s_v4i16_v_s: 997; GFX10: ; %bb.0: 998; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 999; GFX10-NEXT: s_lshr_b32 s2, s4, 1 1000; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v0 1001; GFX10-NEXT: s_cmp_eq_u32 s2, 1 1002; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0 1003; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1004; GFX10-NEXT: s_cselect_b32 s3, s1, s0 1005; GFX10-NEXT: s_and_b32 s4, s4, 1 1006; GFX10-NEXT: v_mov_b32_e32 v0, s0 1007; GFX10-NEXT: s_lshl_b32 s4, s4, 4 1008; GFX10-NEXT: v_mov_b32_e32 v1, s1 1009; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 1010; GFX10-NEXT: s_andn2_b32 s3, s3, s5 1011; GFX10-NEXT: v_lshl_or_b32 v4, v2, s4, s3 1012; GFX10-NEXT: v_mov_b32_e32 v2, 0 1013; GFX10-NEXT: v_mov_b32_e32 v3, 0 1014; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo 1015; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1 1016; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo 1017; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off 1018; GFX10-NEXT: s_endpgm 1019; 1020; GFX11-LABEL: insertelement_s_v4i16_v_s: 1021; GFX11: ; %bb.0: 1022; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 1023; GFX11-NEXT: s_lshr_b32 s2, s4, 1 1024; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v0 1025; GFX11-NEXT: s_cmp_eq_u32 s2, 1 1026; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0 1027; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1028; GFX11-NEXT: s_cselect_b32 s3, s1, s0 1029; GFX11-NEXT: s_and_b32 s4, s4, 1 1030; GFX11-NEXT: v_mov_b32_e32 v0, s0 1031; GFX11-NEXT: s_lshl_b32 s4, s4, 4 1032; GFX11-NEXT: v_mov_b32_e32 v1, s1 1033; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 1034; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 1035; GFX11-NEXT: s_and_not1_b32 s3, s3, s5 1036; GFX11-NEXT: v_lshl_or_b32 v4, v2, s4, s3 1037; GFX11-NEXT: v_mov_b32_e32 v2, 0 1038; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 1039; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_cndmask_b32 v0, v0, v4 1040; GFX11-NEXT: 
v_cmp_eq_u32_e64 vcc_lo, s2, 1 1041; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo 1042; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off 1043; GFX11-NEXT: s_endpgm 1044 %vec = load <4 x i16>, ptr addrspace(4) %ptr 1045 %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx 1046 store <4 x i16> %insert, ptr addrspace(1) null 1047 ret void 1048} 1049 1050define amdgpu_ps void @insertelement_s_v4i16_s_v(ptr addrspace(4) inreg %ptr, i16 inreg %val, i32 %idx) { 1051; GFX9-LABEL: insertelement_s_v4i16_s_v: 1052; GFX9: ; %bb.0: 1053; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 1054; GFX9-NEXT: v_lshrrev_b32_e32 v2, 1, v0 1055; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 1056; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 1057; GFX9-NEXT: v_lshlrev_b32_e32 v0, 4, v0 1058; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1059; GFX9-NEXT: v_mov_b32_e32 v1, s0 1060; GFX9-NEXT: v_mov_b32_e32 v3, s1 1061; GFX9-NEXT: s_and_b32 s2, s4, 0xffff 1062; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff 1063; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 1064; GFX9-NEXT: v_lshlrev_b32_e64 v3, v0, s2 1065; GFX9-NEXT: v_lshlrev_b32_e32 v0, v0, v4 1066; GFX9-NEXT: v_not_b32_e32 v0, v0 1067; GFX9-NEXT: v_and_or_b32 v4, v1, v0, v3 1068; GFX9-NEXT: v_mov_b32_e32 v0, s0 1069; GFX9-NEXT: v_mov_b32_e32 v1, s1 1070; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 1071; GFX9-NEXT: v_mov_b32_e32 v2, 0 1072; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] 1073; GFX9-NEXT: v_mov_b32_e32 v3, 0 1074; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 1075; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off 1076; GFX9-NEXT: s_endpgm 1077; 1078; GFX8-LABEL: insertelement_s_v4i16_s_v: 1079; GFX8: ; %bb.0: 1080; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 1081; GFX8-NEXT: v_lshrrev_b32_e32 v2, 1, v0 1082; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 1083; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 1084; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0 1085; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1086; GFX8-NEXT: v_mov_b32_e32 v1, s0 1087; GFX8-NEXT: v_mov_b32_e32 v3, s1 1088; 
GFX8-NEXT: s_and_b32 s2, s4, 0xffff 1089; GFX8-NEXT: v_mov_b32_e32 v4, 0xffff 1090; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 1091; GFX8-NEXT: v_lshlrev_b32_e64 v3, v0, s2 1092; GFX8-NEXT: v_lshlrev_b32_e32 v0, v0, v4 1093; GFX8-NEXT: v_not_b32_e32 v0, v0 1094; GFX8-NEXT: v_and_b32_e32 v0, v1, v0 1095; GFX8-NEXT: v_or_b32_e32 v4, v0, v3 1096; GFX8-NEXT: v_mov_b32_e32 v0, s0 1097; GFX8-NEXT: v_mov_b32_e32 v1, s1 1098; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 1099; GFX8-NEXT: v_mov_b32_e32 v2, 0 1100; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] 1101; GFX8-NEXT: v_mov_b32_e32 v3, 0 1102; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 1103; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1104; GFX8-NEXT: s_endpgm 1105; 1106; GFX7-LABEL: insertelement_s_v4i16_s_v: 1107; GFX7: ; %bb.0: 1108; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 1109; GFX7-NEXT: v_lshrrev_b32_e32 v2, 1, v0 1110; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 1111; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 1112; GFX7-NEXT: v_lshlrev_b32_e32 v0, 4, v0 1113; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1114; GFX7-NEXT: v_mov_b32_e32 v1, s0 1115; GFX7-NEXT: v_mov_b32_e32 v3, s1 1116; GFX7-NEXT: s_and_b32 s2, s4, 0xffff 1117; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 1118; GFX7-NEXT: v_lshl_b32_e32 v3, s2, v0 1119; GFX7-NEXT: v_lshl_b32_e32 v0, 0xffff, v0 1120; GFX7-NEXT: v_not_b32_e32 v0, v0 1121; GFX7-NEXT: v_and_b32_e32 v0, v1, v0 1122; GFX7-NEXT: v_or_b32_e32 v3, v0, v3 1123; GFX7-NEXT: v_mov_b32_e32 v0, s0 1124; GFX7-NEXT: v_mov_b32_e32 v1, s1 1125; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 1126; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] 1127; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 1128; GFX7-NEXT: s_mov_b64 s[0:1], 0 1129; GFX7-NEXT: s_mov_b32 s2, -1 1130; GFX7-NEXT: s_mov_b32 s3, 0xf000 1131; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1132; GFX7-NEXT: s_endpgm 1133; 1134; GFX10-LABEL: insertelement_s_v4i16_s_v: 1135; GFX10: ; %bb.0: 1136; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 1137; 
GFX10-NEXT: v_and_b32_e32 v1, 1, v0 1138; GFX10-NEXT: v_lshrrev_b32_e32 v4, 1, v0 1139; GFX10-NEXT: s_and_b32 s2, s4, 0xffff 1140; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v1 1141; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 1142; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff 1143; GFX10-NEXT: v_lshlrev_b32_e64 v3, v1, s2 1144; GFX10-NEXT: v_not_b32_e32 v2, v2 1145; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1146; GFX10-NEXT: v_mov_b32_e32 v0, s1 1147; GFX10-NEXT: v_cndmask_b32_e32 v5, s0, v0, vcc_lo 1148; GFX10-NEXT: v_mov_b32_e32 v0, s0 1149; GFX10-NEXT: v_mov_b32_e32 v1, s1 1150; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v4 1151; GFX10-NEXT: v_and_or_b32 v5, v5, v2, v3 1152; GFX10-NEXT: v_mov_b32_e32 v2, 0 1153; GFX10-NEXT: v_mov_b32_e32 v3, 0 1154; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v5, s0 1155; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo 1156; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off 1157; GFX10-NEXT: s_endpgm 1158; 1159; GFX11-LABEL: insertelement_s_v4i16_s_v: 1160; GFX11: ; %bb.0: 1161; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 1162; GFX11-NEXT: v_and_b32_e32 v1, 1, v0 1163; GFX11-NEXT: v_lshrrev_b32_e32 v4, 1, v0 1164; GFX11-NEXT: s_and_b32 s2, s4, 0xffff 1165; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) 1166; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 1167; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1168; GFX11-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_lshlrev_b32 v1, 4, v1 1169; GFX11-NEXT: v_cndmask_b32_e32 v5, s0, v0, vcc_lo 1170; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) 1171; GFX11-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff 1172; GFX11-NEXT: v_lshlrev_b32_e64 v3, v1, s2 1173; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 1174; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v4 1175; GFX11-NEXT: v_not_b32_e32 v2, v2 1176; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 1177; GFX11-NEXT: v_and_or_b32 v5, v5, v2, v3 1178; GFX11-NEXT: v_mov_b32_e32 v2, 0 
1179; GFX11-NEXT: v_mov_b32_e32 v3, 0 1180; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v5, s0 1181; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo 1182; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off 1183; GFX11-NEXT: s_endpgm 1184 %vec = load <4 x i16>, ptr addrspace(4) %ptr 1185 %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx 1186 store <4 x i16> %insert, ptr addrspace(1) null 1187 ret void 1188} 1189 1190define amdgpu_ps void @insertelement_s_v4i16_v_v(ptr addrspace(4) inreg %ptr, i16 %val, i32 %idx) { 1191; GFX9-LABEL: insertelement_s_v4i16_v_v: 1192; GFX9: ; %bb.0: 1193; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 1194; GFX9-NEXT: v_lshrrev_b32_e32 v2, 1, v1 1195; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 1196; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 1197; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 1198; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1199; GFX9-NEXT: v_mov_b32_e32 v3, s0 1200; GFX9-NEXT: v_mov_b32_e32 v4, s1 1201; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 1202; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff 1203; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 1204; GFX9-NEXT: v_lshlrev_b32_e32 v1, v1, v4 1205; GFX9-NEXT: v_not_b32_e32 v1, v1 1206; GFX9-NEXT: v_and_or_b32 v4, v3, v1, v0 1207; GFX9-NEXT: v_mov_b32_e32 v0, s0 1208; GFX9-NEXT: v_mov_b32_e32 v1, s1 1209; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 1210; GFX9-NEXT: v_mov_b32_e32 v2, 0 1211; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] 1212; GFX9-NEXT: v_mov_b32_e32 v3, 0 1213; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 1214; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off 1215; GFX9-NEXT: s_endpgm 1216; 1217; GFX8-LABEL: insertelement_s_v4i16_v_v: 1218; GFX8: ; %bb.0: 1219; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 1220; GFX8-NEXT: v_lshrrev_b32_e32 v2, 1, v1 1221; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 1222; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 1223; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 1224; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1225; 
GFX8-NEXT: v_mov_b32_e32 v3, s0 1226; GFX8-NEXT: v_mov_b32_e32 v4, s1 1227; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 1228; GFX8-NEXT: v_mov_b32_e32 v4, 0xffff 1229; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 1230; GFX8-NEXT: v_lshlrev_b32_e32 v1, v1, v4 1231; GFX8-NEXT: v_not_b32_e32 v1, v1 1232; GFX8-NEXT: v_and_b32_e32 v1, v3, v1 1233; GFX8-NEXT: v_or_b32_e32 v4, v1, v0 1234; GFX8-NEXT: v_mov_b32_e32 v0, s0 1235; GFX8-NEXT: v_mov_b32_e32 v1, s1 1236; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 1237; GFX8-NEXT: v_mov_b32_e32 v2, 0 1238; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] 1239; GFX8-NEXT: v_mov_b32_e32 v3, 0 1240; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 1241; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1242; GFX8-NEXT: s_endpgm 1243; 1244; GFX7-LABEL: insertelement_s_v4i16_v_v: 1245; GFX7: ; %bb.0: 1246; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 1247; GFX7-NEXT: v_lshrrev_b32_e32 v2, 1, v1 1248; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 1249; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 1250; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 1251; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1252; GFX7-NEXT: v_mov_b32_e32 v3, s0 1253; GFX7-NEXT: v_mov_b32_e32 v4, s1 1254; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 1255; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 1256; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 1257; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 1258; GFX7-NEXT: v_not_b32_e32 v1, v1 1259; GFX7-NEXT: v_and_b32_e32 v1, v3, v1 1260; GFX7-NEXT: v_or_b32_e32 v3, v1, v0 1261; GFX7-NEXT: v_mov_b32_e32 v0, s0 1262; GFX7-NEXT: v_mov_b32_e32 v1, s1 1263; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 1264; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] 1265; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 1266; GFX7-NEXT: s_mov_b64 s[0:1], 0 1267; GFX7-NEXT: s_mov_b32 s2, -1 1268; GFX7-NEXT: s_mov_b32 s3, 0xf000 1269; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1270; GFX7-NEXT: s_endpgm 1271; 1272; GFX10-LABEL: 
insertelement_s_v4i16_v_v: 1273; GFX10: ; %bb.0: 1274; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 1275; GFX10-NEXT: v_and_b32_e32 v2, 1, v1 1276; GFX10-NEXT: v_lshrrev_b32_e32 v4, 1, v1 1277; GFX10-NEXT: v_lshlrev_b32_e32 v2, 4, v2 1278; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 1279; GFX10-NEXT: v_lshlrev_b32_e64 v3, v2, 0xffff 1280; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 1281; GFX10-NEXT: v_not_b32_e32 v3, v3 1282; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1283; GFX10-NEXT: v_mov_b32_e32 v1, s1 1284; GFX10-NEXT: v_cndmask_b32_e32 v5, s0, v1, vcc_lo 1285; GFX10-NEXT: v_mov_b32_e32 v0, s0 1286; GFX10-NEXT: v_mov_b32_e32 v1, s1 1287; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v4 1288; GFX10-NEXT: v_and_or_b32 v5, v5, v3, v2 1289; GFX10-NEXT: v_mov_b32_e32 v2, 0 1290; GFX10-NEXT: v_mov_b32_e32 v3, 0 1291; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v5, s0 1292; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo 1293; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off 1294; GFX10-NEXT: s_endpgm 1295; 1296; GFX11-LABEL: insertelement_s_v4i16_v_v: 1297; GFX11: ; %bb.0: 1298; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 1299; GFX11-NEXT: v_and_b32_e32 v2, 1, v1 1300; GFX11-NEXT: v_lshrrev_b32_e32 v4, 1, v1 1301; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 1302; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) 1303; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 1304; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1305; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_lshlrev_b32 v2, 4, v2 1306; GFX11-NEXT: v_cndmask_b32_e32 v5, s0, v1, vcc_lo 1307; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) 1308; GFX11-NEXT: v_lshlrev_b32_e64 v3, v2, 0xffff 1309; GFX11-NEXT: v_lshlrev_b32_e32 v2, v2, v0 1310; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 1311; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v4 1312; GFX11-NEXT: v_not_b32_e32 v3, v3 1313; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 1314; GFX11-NEXT: v_and_or_b32 v5, v5, v3, v2 1315; GFX11-NEXT: v_mov_b32_e32 v2, 0 1316; GFX11-NEXT: v_mov_b32_e32 v3, 0 1317; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v5, s0 1318; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo 1319; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off 1320; GFX11-NEXT: s_endpgm 1321 %vec = load <4 x i16>, ptr addrspace(4) %ptr 1322 %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx 1323 store <4 x i16> %insert, ptr addrspace(1) null 1324 ret void 1325} 1326 1327define amdgpu_ps void @insertelement_v_v4i16_s_v(ptr addrspace(1) %ptr, i16 inreg %val, i32 %idx) { 1328; GFX9-LABEL: insertelement_v_v4i16_s_v: 1329; GFX9: ; %bb.0: 1330; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 1331; GFX9-NEXT: v_lshrrev_b32_e32 v6, 1, v2 1332; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 1333; GFX9-NEXT: v_mov_b32_e32 v5, 0xffff 1334; GFX9-NEXT: s_and_b32 s0, s2, 0xffff 1335; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v2 1336; GFX9-NEXT: v_lshlrev_b32_e64 v7, v2, s0 1337; GFX9-NEXT: v_lshlrev_b32_e32 v2, v2, v5 1338; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 1339; GFX9-NEXT: v_not_b32_e32 v2, v2 1340; GFX9-NEXT: v_mov_b32_e32 v3, 0 1341; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6 1342; GFX9-NEXT: v_mov_b32_e32 v4, 0 1343; GFX9-NEXT: s_waitcnt vmcnt(0) 1344; GFX9-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc 1345; GFX9-NEXT: v_and_or_b32 v2, v5, v2, v7 1346; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 1347; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 1348; GFX9-NEXT: global_store_dwordx2 v[3:4], v[0:1], off 1349; GFX9-NEXT: s_endpgm 1350; 1351; GFX8-LABEL: insertelement_v_v4i16_s_v: 1352; GFX8: ; %bb.0: 1353; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1354; GFX8-NEXT: v_lshrrev_b32_e32 v6, 1, v2 1355; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 1356; GFX8-NEXT: v_mov_b32_e32 v5, 0xffff 1357; GFX8-NEXT: s_and_b32 s0, s2, 0xffff 1358; GFX8-NEXT: v_lshlrev_b32_e32 v2, 4, v2 1359; GFX8-NEXT: v_lshlrev_b32_e64 v7, 
v2, s0 1360; GFX8-NEXT: v_lshlrev_b32_e32 v2, v2, v5 1361; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 1362; GFX8-NEXT: v_not_b32_e32 v2, v2 1363; GFX8-NEXT: v_mov_b32_e32 v3, 0 1364; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6 1365; GFX8-NEXT: v_mov_b32_e32 v4, 0 1366; GFX8-NEXT: s_waitcnt vmcnt(0) 1367; GFX8-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc 1368; GFX8-NEXT: v_and_b32_e32 v2, v5, v2 1369; GFX8-NEXT: v_or_b32_e32 v2, v2, v7 1370; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 1371; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 1372; GFX8-NEXT: flat_store_dwordx2 v[3:4], v[0:1] 1373; GFX8-NEXT: s_endpgm 1374; 1375; GFX7-LABEL: insertelement_v_v4i16_s_v: 1376; GFX7: ; %bb.0: 1377; GFX7-NEXT: s_mov_b32 s6, 0 1378; GFX7-NEXT: s_mov_b32 s7, 0xf000 1379; GFX7-NEXT: s_mov_b64 s[4:5], 0 1380; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 1381; GFX7-NEXT: v_lshrrev_b32_e32 v3, 1, v2 1382; GFX7-NEXT: v_and_b32_e32 v2, 1, v2 1383; GFX7-NEXT: s_and_b32 s0, s2, 0xffff 1384; GFX7-NEXT: v_lshlrev_b32_e32 v2, 4, v2 1385; GFX7-NEXT: v_lshl_b32_e32 v4, s0, v2 1386; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v2 1387; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 1388; GFX7-NEXT: v_not_b32_e32 v2, v2 1389; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v3 1390; GFX7-NEXT: s_mov_b64 s[4:5], 0 1391; GFX7-NEXT: s_mov_b32 s6, -1 1392; GFX7-NEXT: s_waitcnt vmcnt(0) 1393; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc 1394; GFX7-NEXT: v_and_b32_e32 v2, v5, v2 1395; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 1396; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 1397; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 1398; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1399; GFX7-NEXT: s_endpgm 1400; 1401; GFX10-LABEL: insertelement_v_v4i16_s_v: 1402; GFX10: ; %bb.0: 1403; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 1404; GFX10-NEXT: v_and_b32_e32 v3, 1, v2 1405; GFX10-NEXT: v_lshrrev_b32_e32 v5, 1, v2 1406; GFX10-NEXT: s_and_b32 s0, s2, 0xffff 1407; GFX10-NEXT: v_lshlrev_b32_e32 v3, 4, v3 
1408; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 1409; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff 1410; GFX10-NEXT: v_lshlrev_b32_e64 v2, v3, s0 1411; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v5 1412; GFX10-NEXT: v_not_b32_e32 v3, v4 1413; GFX10-NEXT: s_waitcnt vmcnt(0) 1414; GFX10-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc_lo 1415; GFX10-NEXT: v_and_or_b32 v4, v4, v3, v2 1416; GFX10-NEXT: v_mov_b32_e32 v2, 0 1417; GFX10-NEXT: v_mov_b32_e32 v3, 0 1418; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 1419; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo 1420; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off 1421; GFX10-NEXT: s_endpgm 1422; 1423; GFX11-LABEL: insertelement_v_v4i16_s_v: 1424; GFX11: ; %bb.0: 1425; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off 1426; GFX11-NEXT: v_lshrrev_b32_e32 v5, 1, v2 1427; GFX11-NEXT: s_and_b32 s0, s2, 0xffff 1428; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) 1429; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 1430; GFX11-NEXT: v_and_b32_e32 v3, 1, v2 1431; GFX11-NEXT: v_lshlrev_b32_e32 v3, 4, v3 1432; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 1433; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff 1434; GFX11-NEXT: v_lshlrev_b32_e64 v2, v3, s0 1435; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v5 1436; GFX11-NEXT: v_not_b32_e32 v3, v4 1437; GFX11-NEXT: s_waitcnt vmcnt(0) 1438; GFX11-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc_lo 1439; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 1440; GFX11-NEXT: v_and_or_b32 v4, v4, v3, v2 1441; GFX11-NEXT: v_mov_b32_e32 v2, 0 1442; GFX11-NEXT: v_mov_b32_e32 v3, 0 1443; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo 1444; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 1445; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off 1446; GFX11-NEXT: s_endpgm 1447 %vec = load <4 x i16>, ptr addrspace(1) %ptr 1448 %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx 1449 store <4 x i16> %insert, ptr 
addrspace(1) null 1450 ret void 1451} 1452 1453define amdgpu_ps void @insertelement_v_v4i16_v_s(ptr addrspace(1) %ptr, i16 %val, i32 inreg %idx) { 1454; GFX9-LABEL: insertelement_v_v4i16_v_s: 1455; GFX9: ; %bb.0: 1456; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 1457; GFX9-NEXT: s_and_b32 s1, s2, 1 1458; GFX9-NEXT: s_lshr_b32 s0, s2, 1 1459; GFX9-NEXT: s_lshl_b32 s1, s1, 4 1460; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 1461; GFX9-NEXT: s_lshl_b32 s1, 0xffff, s1 1462; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 1463; GFX9-NEXT: s_not_b32 s1, s1 1464; GFX9-NEXT: v_mov_b32_e32 v3, 0 1465; GFX9-NEXT: v_mov_b32_e32 v4, 0 1466; GFX9-NEXT: s_waitcnt vmcnt(0) 1467; GFX9-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc 1468; GFX9-NEXT: v_and_or_b32 v2, v5, s1, v2 1469; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 1470; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 1471; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 1472; GFX9-NEXT: global_store_dwordx2 v[3:4], v[0:1], off 1473; GFX9-NEXT: s_endpgm 1474; 1475; GFX8-LABEL: insertelement_v_v4i16_v_s: 1476; GFX8: ; %bb.0: 1477; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1478; GFX8-NEXT: s_and_b32 s1, s2, 1 1479; GFX8-NEXT: s_lshr_b32 s0, s2, 1 1480; GFX8-NEXT: s_lshl_b32 s1, s1, 4 1481; GFX8-NEXT: v_mov_b32_e32 v5, s1 1482; GFX8-NEXT: s_lshl_b32 s1, 0xffff, s1 1483; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 1484; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 1485; GFX8-NEXT: s_not_b32 s1, s1 1486; GFX8-NEXT: v_mov_b32_e32 v3, 0 1487; GFX8-NEXT: v_mov_b32_e32 v4, 0 1488; GFX8-NEXT: s_waitcnt vmcnt(0) 1489; GFX8-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc 1490; GFX8-NEXT: v_and_b32_e32 v5, s1, v5 1491; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 1492; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 1493; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 1494; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 1495; GFX8-NEXT: 
flat_store_dwordx2 v[3:4], v[0:1] 1496; GFX8-NEXT: s_endpgm 1497; 1498; GFX7-LABEL: insertelement_v_v4i16_v_s: 1499; GFX7: ; %bb.0: 1500; GFX7-NEXT: s_mov_b32 s6, 0 1501; GFX7-NEXT: s_mov_b32 s7, 0xf000 1502; GFX7-NEXT: s_mov_b64 s[4:5], 0 1503; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 1504; GFX7-NEXT: s_and_b32 s1, s2, 1 1505; GFX7-NEXT: s_lshr_b32 s0, s2, 1 1506; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 1507; GFX7-NEXT: s_lshl_b32 s1, s1, 4 1508; GFX7-NEXT: v_lshlrev_b32_e32 v2, s1, v2 1509; GFX7-NEXT: s_lshl_b32 s1, 0xffff, s1 1510; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 1511; GFX7-NEXT: s_not_b32 s1, s1 1512; GFX7-NEXT: s_mov_b64 s[4:5], 0 1513; GFX7-NEXT: s_mov_b32 s6, -1 1514; GFX7-NEXT: s_waitcnt vmcnt(0) 1515; GFX7-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc 1516; GFX7-NEXT: v_and_b32_e32 v3, s1, v3 1517; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 1518; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 1519; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 1520; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 1521; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1522; GFX7-NEXT: s_endpgm 1523; 1524; GFX10-LABEL: insertelement_v_v4i16_v_s: 1525; GFX10: ; %bb.0: 1526; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 1527; GFX10-NEXT: s_lshr_b32 s1, s2, 1 1528; GFX10-NEXT: s_and_b32 s0, s2, 1 1529; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s1, 1 1530; GFX10-NEXT: s_lshl_b32 s0, s0, 4 1531; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 1532; GFX10-NEXT: s_lshl_b32 s0, 0xffff, s0 1533; GFX10-NEXT: s_not_b32 s0, s0 1534; GFX10-NEXT: s_waitcnt vmcnt(0) 1535; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc_lo 1536; GFX10-NEXT: v_and_or_b32 v4, v3, s0, v2 1537; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s1, 0 1538; GFX10-NEXT: v_mov_b32_e32 v2, 0 1539; GFX10-NEXT: v_mov_b32_e32 v3, 0 1540; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo 1541; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 1542; GFX10-NEXT: 
global_store_dwordx2 v[2:3], v[0:1], off 1543; GFX10-NEXT: s_endpgm 1544; 1545; GFX11-LABEL: insertelement_v_v4i16_v_s: 1546; GFX11: ; %bb.0: 1547; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off 1548; GFX11-NEXT: s_lshr_b32 s1, s2, 1 1549; GFX11-NEXT: s_and_b32 s0, s2, 1 1550; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s1, 1 1551; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 1552; GFX11-NEXT: s_lshl_b32 s0, s0, 4 1553; GFX11-NEXT: s_waitcnt vmcnt(0) 1554; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 1555; GFX11-NEXT: v_dual_cndmask_b32 v3, v0, v1 :: v_dual_lshlrev_b32 v2, s0, v2 1556; GFX11-NEXT: s_lshl_b32 s0, 0xffff, s0 1557; GFX11-NEXT: s_not_b32 s0, s0 1558; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 1559; GFX11-NEXT: v_and_or_b32 v4, v3, s0, v2 1560; GFX11-NEXT: v_cmp_eq_u32_e64 s0, s1, 0 1561; GFX11-NEXT: v_mov_b32_e32 v2, 0 1562; GFX11-NEXT: v_mov_b32_e32 v3, 0 1563; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 1564; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo 1565; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 1566; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off 1567; GFX11-NEXT: s_endpgm 1568 %vec = load <4 x i16>, ptr addrspace(1) %ptr 1569 %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx 1570 store <4 x i16> %insert, ptr addrspace(1) null 1571 ret void 1572} 1573 1574define amdgpu_ps void @insertelement_v_v4i16_v_v(ptr addrspace(1) %ptr, i16 %val, i32 %idx) { 1575; GFX9-LABEL: insertelement_v_v4i16_v_v: 1576; GFX9: ; %bb.0: 1577; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 1578; GFX9-NEXT: v_lshrrev_b32_e32 v7, 1, v3 1579; GFX9-NEXT: v_and_b32_e32 v3, 1, v3 1580; GFX9-NEXT: v_mov_b32_e32 v6, 0xffff 1581; GFX9-NEXT: v_lshlrev_b32_e32 v3, 4, v3 1582; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 1583; GFX9-NEXT: v_lshlrev_b32_e32 v3, v3, v6 1584; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 
1, v7 1585; GFX9-NEXT: v_not_b32_e32 v3, v3 1586; GFX9-NEXT: v_mov_b32_e32 v4, 0 1587; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7 1588; GFX9-NEXT: v_mov_b32_e32 v5, 0 1589; GFX9-NEXT: s_waitcnt vmcnt(0) 1590; GFX9-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc 1591; GFX9-NEXT: v_and_or_b32 v2, v6, v3, v2 1592; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 1593; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 1594; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off 1595; GFX9-NEXT: s_endpgm 1596; 1597; GFX8-LABEL: insertelement_v_v4i16_v_v: 1598; GFX8: ; %bb.0: 1599; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1600; GFX8-NEXT: v_lshrrev_b32_e32 v7, 1, v3 1601; GFX8-NEXT: v_and_b32_e32 v3, 1, v3 1602; GFX8-NEXT: v_mov_b32_e32 v6, 0xffff 1603; GFX8-NEXT: v_lshlrev_b32_e32 v3, 4, v3 1604; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 1605; GFX8-NEXT: v_lshlrev_b32_e32 v3, v3, v6 1606; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 1607; GFX8-NEXT: v_not_b32_e32 v3, v3 1608; GFX8-NEXT: v_mov_b32_e32 v4, 0 1609; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7 1610; GFX8-NEXT: v_mov_b32_e32 v5, 0 1611; GFX8-NEXT: s_waitcnt vmcnt(0) 1612; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc 1613; GFX8-NEXT: v_and_b32_e32 v3, v6, v3 1614; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 1615; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 1616; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 1617; GFX8-NEXT: flat_store_dwordx2 v[4:5], v[0:1] 1618; GFX8-NEXT: s_endpgm 1619; 1620; GFX7-LABEL: insertelement_v_v4i16_v_v: 1621; GFX7: ; %bb.0: 1622; GFX7-NEXT: s_mov_b32 s6, 0 1623; GFX7-NEXT: s_mov_b32 s7, 0xf000 1624; GFX7-NEXT: s_mov_b64 s[4:5], 0 1625; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 1626; GFX7-NEXT: v_lshrrev_b32_e32 v4, 1, v3 1627; GFX7-NEXT: v_and_b32_e32 v3, 1, v3 1628; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 1629; GFX7-NEXT: v_lshlrev_b32_e32 v3, 4, v3 1630; GFX7-NEXT: v_lshlrev_b32_e32 v2, v3, v2 1631; GFX7-NEXT: 
v_lshl_b32_e32 v3, 0xffff, v3 1632; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 1633; GFX7-NEXT: v_not_b32_e32 v3, v3 1634; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 1635; GFX7-NEXT: s_mov_b64 s[4:5], 0 1636; GFX7-NEXT: s_mov_b32 s6, -1 1637; GFX7-NEXT: s_waitcnt vmcnt(0) 1638; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc 1639; GFX7-NEXT: v_and_b32_e32 v3, v5, v3 1640; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 1641; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 1642; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 1643; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1644; GFX7-NEXT: s_endpgm 1645; 1646; GFX10-LABEL: insertelement_v_v4i16_v_v: 1647; GFX10: ; %bb.0: 1648; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 1649; GFX10-NEXT: v_and_b32_e32 v4, 1, v3 1650; GFX10-NEXT: v_lshrrev_b32_e32 v6, 1, v3 1651; GFX10-NEXT: v_lshlrev_b32_e32 v4, 4, v4 1652; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 1653; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v6 1654; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff 1655; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 1656; GFX10-NEXT: v_not_b32_e32 v3, v5 1657; GFX10-NEXT: s_waitcnt vmcnt(0) 1658; GFX10-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc_lo 1659; GFX10-NEXT: v_and_or_b32 v4, v4, v3, v2 1660; GFX10-NEXT: v_mov_b32_e32 v2, 0 1661; GFX10-NEXT: v_mov_b32_e32 v3, 0 1662; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 1663; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo 1664; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off 1665; GFX10-NEXT: s_endpgm 1666; 1667; GFX11-LABEL: insertelement_v_v4i16_v_v: 1668; GFX11: ; %bb.0: 1669; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off 1670; GFX11-NEXT: v_and_b32_e32 v4, 1, v3 1671; GFX11-NEXT: v_lshrrev_b32_e32 v6, 1, v3 1672; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 1673; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v4 1674; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 1675; GFX11-NEXT: v_and_b32_e32 
v2, 0xffff, v2 1676; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v6 1677; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) 1678; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff 1679; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 1680; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) 1681; GFX11-NEXT: v_not_b32_e32 v3, v5 1682; GFX11-NEXT: s_waitcnt vmcnt(0) 1683; GFX11-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc_lo 1684; GFX11-NEXT: v_and_or_b32 v4, v4, v3, v2 1685; GFX11-NEXT: v_mov_b32_e32 v2, 0 1686; GFX11-NEXT: v_mov_b32_e32 v3, 0 1687; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) 1688; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo 1689; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 1690; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off 1691; GFX11-NEXT: s_endpgm 1692 %vec = load <4 x i16>, ptr addrspace(1) %ptr 1693 %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx 1694 store <4 x i16> %insert, ptr addrspace(1) null 1695 ret void 1696} 1697 1698define amdgpu_ps void @insertelement_s_v8i16_s_s(ptr addrspace(4) inreg %ptr, i16 inreg %val, i32 inreg %idx) { 1699; GFX9-LABEL: insertelement_s_v8i16_s_s: 1700; GFX9: ; %bb.0: 1701; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 1702; GFX9-NEXT: s_lshr_b32 s6, s5, 1 1703; GFX9-NEXT: s_cmp_eq_u32 s6, 1 1704; GFX9-NEXT: v_mov_b32_e32 v4, 0 1705; GFX9-NEXT: v_mov_b32_e32 v5, 0 1706; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1707; GFX9-NEXT: s_cselect_b32 s7, s1, s0 1708; GFX9-NEXT: s_cmp_eq_u32 s6, 2 1709; GFX9-NEXT: s_cselect_b32 s7, s2, s7 1710; GFX9-NEXT: s_cmp_eq_u32 s6, 3 1711; GFX9-NEXT: s_cselect_b32 s7, s3, s7 1712; GFX9-NEXT: s_and_b32 s5, s5, 1 1713; GFX9-NEXT: s_lshl_b32 s5, s5, 4 1714; GFX9-NEXT: s_and_b32 s4, s4, 0xffff 1715; GFX9-NEXT: s_lshl_b32 s4, s4, s5 1716; GFX9-NEXT: s_lshl_b32 s5, 0xffff, s5 1717; GFX9-NEXT: s_andn2_b32 s5, s7, s5 1718; GFX9-NEXT: s_or_b32 s4, s5, s4 1719; GFX9-NEXT: s_cmp_eq_u32 s6, 0 1720; GFX9-NEXT: s_cselect_b32 s0, s4, s0 1721; 
GFX9-NEXT: s_cmp_eq_u32 s6, 1 1722; GFX9-NEXT: s_cselect_b32 s1, s4, s1 1723; GFX9-NEXT: s_cmp_eq_u32 s6, 2 1724; GFX9-NEXT: s_cselect_b32 s2, s4, s2 1725; GFX9-NEXT: s_cmp_eq_u32 s6, 3 1726; GFX9-NEXT: s_cselect_b32 s3, s4, s3 1727; GFX9-NEXT: v_mov_b32_e32 v0, s0 1728; GFX9-NEXT: v_mov_b32_e32 v1, s1 1729; GFX9-NEXT: v_mov_b32_e32 v2, s2 1730; GFX9-NEXT: v_mov_b32_e32 v3, s3 1731; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off 1732; GFX9-NEXT: s_endpgm 1733; 1734; GFX8-LABEL: insertelement_s_v8i16_s_s: 1735; GFX8: ; %bb.0: 1736; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 1737; GFX8-NEXT: s_lshr_b32 s6, s5, 1 1738; GFX8-NEXT: s_cmp_eq_u32 s6, 1 1739; GFX8-NEXT: v_mov_b32_e32 v4, 0 1740; GFX8-NEXT: v_mov_b32_e32 v5, 0 1741; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1742; GFX8-NEXT: s_cselect_b32 s7, s1, s0 1743; GFX8-NEXT: s_cmp_eq_u32 s6, 2 1744; GFX8-NEXT: s_cselect_b32 s7, s2, s7 1745; GFX8-NEXT: s_cmp_eq_u32 s6, 3 1746; GFX8-NEXT: s_cselect_b32 s7, s3, s7 1747; GFX8-NEXT: s_and_b32 s5, s5, 1 1748; GFX8-NEXT: s_lshl_b32 s5, s5, 4 1749; GFX8-NEXT: s_and_b32 s4, s4, 0xffff 1750; GFX8-NEXT: s_lshl_b32 s4, s4, s5 1751; GFX8-NEXT: s_lshl_b32 s5, 0xffff, s5 1752; GFX8-NEXT: s_andn2_b32 s5, s7, s5 1753; GFX8-NEXT: s_or_b32 s4, s5, s4 1754; GFX8-NEXT: s_cmp_eq_u32 s6, 0 1755; GFX8-NEXT: s_cselect_b32 s0, s4, s0 1756; GFX8-NEXT: s_cmp_eq_u32 s6, 1 1757; GFX8-NEXT: s_cselect_b32 s1, s4, s1 1758; GFX8-NEXT: s_cmp_eq_u32 s6, 2 1759; GFX8-NEXT: s_cselect_b32 s2, s4, s2 1760; GFX8-NEXT: s_cmp_eq_u32 s6, 3 1761; GFX8-NEXT: s_cselect_b32 s3, s4, s3 1762; GFX8-NEXT: v_mov_b32_e32 v0, s0 1763; GFX8-NEXT: v_mov_b32_e32 v1, s1 1764; GFX8-NEXT: v_mov_b32_e32 v2, s2 1765; GFX8-NEXT: v_mov_b32_e32 v3, s3 1766; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 1767; GFX8-NEXT: s_endpgm 1768; 1769; GFX7-LABEL: insertelement_s_v8i16_s_s: 1770; GFX7: ; %bb.0: 1771; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 1772; GFX7-NEXT: s_lshr_b32 s6, s5, 1 1773; GFX7-NEXT: s_cmp_eq_u32 s6, 1 1774; 
GFX7-NEXT: s_waitcnt lgkmcnt(0) 1775; GFX7-NEXT: s_cselect_b32 s7, s1, s0 1776; GFX7-NEXT: s_cmp_eq_u32 s6, 2 1777; GFX7-NEXT: s_cselect_b32 s7, s2, s7 1778; GFX7-NEXT: s_cmp_eq_u32 s6, 3 1779; GFX7-NEXT: s_cselect_b32 s7, s3, s7 1780; GFX7-NEXT: s_and_b32 s5, s5, 1 1781; GFX7-NEXT: s_lshl_b32 s5, s5, 4 1782; GFX7-NEXT: s_and_b32 s4, s4, 0xffff 1783; GFX7-NEXT: s_lshl_b32 s4, s4, s5 1784; GFX7-NEXT: s_lshl_b32 s5, 0xffff, s5 1785; GFX7-NEXT: s_andn2_b32 s5, s7, s5 1786; GFX7-NEXT: s_or_b32 s4, s5, s4 1787; GFX7-NEXT: s_cmp_eq_u32 s6, 0 1788; GFX7-NEXT: s_cselect_b32 s0, s4, s0 1789; GFX7-NEXT: s_cmp_eq_u32 s6, 1 1790; GFX7-NEXT: s_cselect_b32 s1, s4, s1 1791; GFX7-NEXT: s_cmp_eq_u32 s6, 2 1792; GFX7-NEXT: s_cselect_b32 s2, s4, s2 1793; GFX7-NEXT: s_cmp_eq_u32 s6, 3 1794; GFX7-NEXT: s_cselect_b32 s3, s4, s3 1795; GFX7-NEXT: v_mov_b32_e32 v0, s0 1796; GFX7-NEXT: s_mov_b64 s[4:5], 0 1797; GFX7-NEXT: v_mov_b32_e32 v1, s1 1798; GFX7-NEXT: v_mov_b32_e32 v2, s2 1799; GFX7-NEXT: v_mov_b32_e32 v3, s3 1800; GFX7-NEXT: s_mov_b32 s6, -1 1801; GFX7-NEXT: s_mov_b32 s7, 0xf000 1802; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 1803; GFX7-NEXT: s_endpgm 1804; 1805; GFX10-LABEL: insertelement_s_v8i16_s_s: 1806; GFX10: ; %bb.0: 1807; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 1808; GFX10-NEXT: s_lshr_b32 s6, s5, 1 1809; GFX10-NEXT: v_mov_b32_e32 v4, 0 1810; GFX10-NEXT: s_cmp_eq_u32 s6, 1 1811; GFX10-NEXT: v_mov_b32_e32 v5, 0 1812; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1813; GFX10-NEXT: s_cselect_b32 s7, s1, s0 1814; GFX10-NEXT: s_cmp_eq_u32 s6, 2 1815; GFX10-NEXT: s_cselect_b32 s7, s2, s7 1816; GFX10-NEXT: s_cmp_eq_u32 s6, 3 1817; GFX10-NEXT: s_cselect_b32 s7, s3, s7 1818; GFX10-NEXT: s_and_b32 s5, s5, 1 1819; GFX10-NEXT: s_and_b32 s4, s4, 0xffff 1820; GFX10-NEXT: s_lshl_b32 s5, s5, 4 1821; GFX10-NEXT: s_lshl_b32 s8, 0xffff, s5 1822; GFX10-NEXT: s_lshl_b32 s4, s4, s5 1823; GFX10-NEXT: s_andn2_b32 s5, s7, s8 1824; GFX10-NEXT: s_or_b32 s4, s5, s4 1825; GFX10-NEXT: 
s_cmp_eq_u32 s6, 0 1826; GFX10-NEXT: s_cselect_b32 s0, s4, s0 1827; GFX10-NEXT: s_cmp_eq_u32 s6, 1 1828; GFX10-NEXT: s_cselect_b32 s1, s4, s1 1829; GFX10-NEXT: s_cmp_eq_u32 s6, 2 1830; GFX10-NEXT: s_cselect_b32 s2, s4, s2 1831; GFX10-NEXT: s_cmp_eq_u32 s6, 3 1832; GFX10-NEXT: s_cselect_b32 s3, s4, s3 1833; GFX10-NEXT: v_mov_b32_e32 v0, s0 1834; GFX10-NEXT: v_mov_b32_e32 v1, s1 1835; GFX10-NEXT: v_mov_b32_e32 v2, s2 1836; GFX10-NEXT: v_mov_b32_e32 v3, s3 1837; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off 1838; GFX10-NEXT: s_endpgm 1839; 1840; GFX11-LABEL: insertelement_s_v8i16_s_s: 1841; GFX11: ; %bb.0: 1842; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 1843; GFX11-NEXT: s_lshr_b32 s6, s5, 1 1844; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1845; GFX11-NEXT: s_cmp_eq_u32 s6, 1 1846; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1847; GFX11-NEXT: s_cselect_b32 s7, s1, s0 1848; GFX11-NEXT: s_cmp_eq_u32 s6, 2 1849; GFX11-NEXT: s_cselect_b32 s7, s2, s7 1850; GFX11-NEXT: s_cmp_eq_u32 s6, 3 1851; GFX11-NEXT: s_cselect_b32 s7, s3, s7 1852; GFX11-NEXT: s_and_b32 s5, s5, 1 1853; GFX11-NEXT: s_and_b32 s4, s4, 0xffff 1854; GFX11-NEXT: s_lshl_b32 s5, s5, 4 1855; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) 1856; GFX11-NEXT: s_lshl_b32 s8, 0xffff, s5 1857; GFX11-NEXT: s_lshl_b32 s4, s4, s5 1858; GFX11-NEXT: s_and_not1_b32 s5, s7, s8 1859; GFX11-NEXT: s_or_b32 s4, s5, s4 1860; GFX11-NEXT: s_cmp_eq_u32 s6, 0 1861; GFX11-NEXT: s_cselect_b32 s0, s4, s0 1862; GFX11-NEXT: s_cmp_eq_u32 s6, 1 1863; GFX11-NEXT: s_cselect_b32 s1, s4, s1 1864; GFX11-NEXT: s_cmp_eq_u32 s6, 2 1865; GFX11-NEXT: s_cselect_b32 s2, s4, s2 1866; GFX11-NEXT: s_cmp_eq_u32 s6, 3 1867; GFX11-NEXT: s_cselect_b32 s3, s4, s3 1868; GFX11-NEXT: v_mov_b32_e32 v4, 0 1869; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v0, s0 1870; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 1871; GFX11-NEXT: v_mov_b32_e32 v3, s3 1872; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], 
off 1873; GFX11-NEXT: s_endpgm 1874 %vec = load <8 x i16>, ptr addrspace(4) %ptr 1875 %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx 1876 store <8 x i16> %insert, ptr addrspace(1) null 1877 ret void 1878} 1879 1880define amdgpu_ps void @insertelement_v_v8i16_s_s(ptr addrspace(1) %ptr, i16 inreg %val, i32 inreg %idx) { 1881; GFX9-LABEL: insertelement_v_v8i16_s_s: 1882; GFX9: ; %bb.0: 1883; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off 1884; GFX9-NEXT: s_and_b32 s0, s3, 1 1885; GFX9-NEXT: s_lshr_b32 s4, s3, 1 1886; GFX9-NEXT: s_and_b32 s1, s2, 0xffff 1887; GFX9-NEXT: s_lshl_b32 s0, s0, 4 1888; GFX9-NEXT: s_lshl_b32 s1, s1, s0 1889; GFX9-NEXT: s_lshl_b32 s0, 0xffff, s0 1890; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 1891; GFX9-NEXT: s_not_b32 s5, s0 1892; GFX9-NEXT: v_mov_b32_e32 v6, s1 1893; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 1894; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 1895; GFX9-NEXT: v_mov_b32_e32 v4, 0 1896; GFX9-NEXT: v_mov_b32_e32 v5, 0 1897; GFX9-NEXT: s_waitcnt vmcnt(0) 1898; GFX9-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc 1899; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v2, s[0:1] 1900; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[2:3] 1901; GFX9-NEXT: v_and_or_b32 v6, v7, s5, v6 1902; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 1903; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] 1904; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc 1905; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] 1906; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[2:3] 1907; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off 1908; GFX9-NEXT: s_endpgm 1909; 1910; GFX8-LABEL: insertelement_v_v8i16_s_s: 1911; GFX8: ; %bb.0: 1912; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 1913; GFX8-NEXT: s_and_b32 s0, s3, 1 1914; GFX8-NEXT: s_lshr_b32 s4, s3, 1 1915; GFX8-NEXT: s_and_b32 s1, s2, 0xffff 1916; GFX8-NEXT: s_lshl_b32 s0, s0, 4 1917; GFX8-NEXT: s_lshl_b32 s5, s1, s0 1918; GFX8-NEXT: s_lshl_b32 s0, 0xffff, s0 1919; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 1920; GFX8-NEXT: s_not_b32 s6, 
s0 1921; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 1922; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 1923; GFX8-NEXT: v_mov_b32_e32 v4, 0 1924; GFX8-NEXT: v_mov_b32_e32 v5, 0 1925; GFX8-NEXT: s_waitcnt vmcnt(0) 1926; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc 1927; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v2, s[0:1] 1928; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[2:3] 1929; GFX8-NEXT: v_and_b32_e32 v6, s6, v6 1930; GFX8-NEXT: v_or_b32_e32 v6, s5, v6 1931; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 1932; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] 1933; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc 1934; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] 1935; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[2:3] 1936; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 1937; GFX8-NEXT: s_endpgm 1938; 1939; GFX7-LABEL: insertelement_v_v8i16_s_s: 1940; GFX7: ; %bb.0: 1941; GFX7-NEXT: s_mov_b32 s10, 0 1942; GFX7-NEXT: s_mov_b32 s11, 0xf000 1943; GFX7-NEXT: s_mov_b64 s[8:9], 0 1944; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 1945; GFX7-NEXT: s_and_b32 s0, s3, 1 1946; GFX7-NEXT: s_lshr_b32 s4, s3, 1 1947; GFX7-NEXT: s_and_b32 s1, s2, 0xffff 1948; GFX7-NEXT: s_lshl_b32 s0, s0, 4 1949; GFX7-NEXT: s_lshl_b32 s5, s1, s0 1950; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0 1951; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 1952; GFX7-NEXT: s_not_b32 s6, s0 1953; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 1954; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 1955; GFX7-NEXT: s_mov_b64 s[8:9], 0 1956; GFX7-NEXT: s_mov_b32 s10, -1 1957; GFX7-NEXT: s_waitcnt vmcnt(0) 1958; GFX7-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc 1959; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v2, s[0:1] 1960; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v3, s[2:3] 1961; GFX7-NEXT: v_and_b32_e32 v4, s6, v4 1962; GFX7-NEXT: v_or_b32_e32 v4, s5, v4 1963; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 1964; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5] 1965; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 1966; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, 
v4, s[0:1] 1967; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[2:3] 1968; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 1969; GFX7-NEXT: s_endpgm 1970; 1971; GFX10-LABEL: insertelement_v_v8i16_s_s: 1972; GFX10: ; %bb.0: 1973; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off 1974; GFX10-NEXT: s_lshr_b32 s4, s3, 1 1975; GFX10-NEXT: s_and_b32 s1, s3, 1 1976; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s4, 1 1977; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s4, 2 1978; GFX10-NEXT: s_lshl_b32 s3, s1, 4 1979; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s4, 3 1980; GFX10-NEXT: s_and_b32 s2, s2, 0xffff 1981; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s3 1982; GFX10-NEXT: s_lshl_b32 s2, s2, s3 1983; GFX10-NEXT: s_not_b32 s3, s5 1984; GFX10-NEXT: s_waitcnt vmcnt(0) 1985; GFX10-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc_lo 1986; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v2, s0 1987; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v3, s1 1988; GFX10-NEXT: v_and_or_b32 v6, v4, s3, s2 1989; GFX10-NEXT: v_cmp_eq_u32_e64 s2, s4, 0 1990; GFX10-NEXT: v_mov_b32_e32 v4, 0 1991; GFX10-NEXT: v_mov_b32_e32 v5, 0 1992; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo 1993; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s2 1994; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 1995; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v6, s1 1996; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off 1997; GFX10-NEXT: s_endpgm 1998; 1999; GFX11-LABEL: insertelement_v_v8i16_s_s: 2000; GFX11: ; %bb.0: 2001; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off 2002; GFX11-NEXT: s_lshr_b32 s4, s3, 1 2003; GFX11-NEXT: s_and_b32 s1, s3, 1 2004; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s4, 1 2005; GFX11-NEXT: v_cmp_eq_u32_e64 s0, s4, 2 2006; GFX11-NEXT: s_lshl_b32 s3, s1, 4 2007; GFX11-NEXT: v_cmp_eq_u32_e64 s1, s4, 3 2008; GFX11-NEXT: s_and_b32 s2, s2, 0xffff 2009; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s3 2010; GFX11-NEXT: s_lshl_b32 s2, s2, s3 2011; GFX11-NEXT: s_not_b32 s3, s5 2012; GFX11-NEXT: s_waitcnt vmcnt(0) 2013; GFX11-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc_lo 
2014; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2015; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v2, s0 2016; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v3, s1 2017; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) 2018; GFX11-NEXT: v_and_or_b32 v6, v4, s3, s2 2019; GFX11-NEXT: v_cmp_eq_u32_e64 s2, s4, 0 2020; GFX11-NEXT: v_mov_b32_e32 v4, 0 2021; GFX11-NEXT: v_mov_b32_e32 v5, 0 2022; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo 2023; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) 2024; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v6, s2 2025; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 2026; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v6, s1 2027; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off 2028; GFX11-NEXT: s_endpgm 2029 %vec = load <8 x i16>, ptr addrspace(1) %ptr 2030 %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx 2031 store <8 x i16> %insert, ptr addrspace(1) null 2032 ret void 2033} 2034 2035define amdgpu_ps void @insertelement_s_v8i16_v_s(ptr addrspace(4) inreg %ptr, i16 %val, i32 inreg %idx) { 2036; GFX9-LABEL: insertelement_s_v8i16_v_s: 2037; GFX9: ; %bb.0: 2038; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 2039; GFX9-NEXT: s_lshr_b32 s5, s4, 1 2040; GFX9-NEXT: s_cmp_eq_u32 s5, 1 2041; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 2042; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 2043; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2044; GFX9-NEXT: s_cselect_b32 s6, s1, s0 2045; GFX9-NEXT: s_cmp_eq_u32 s5, 2 2046; GFX9-NEXT: s_cselect_b32 s6, s2, s6 2047; GFX9-NEXT: s_cmp_eq_u32 s5, 3 2048; GFX9-NEXT: s_cselect_b32 s6, s3, s6 2049; GFX9-NEXT: s_and_b32 s4, s4, 1 2050; GFX9-NEXT: s_lshl_b32 s4, s4, 4 2051; GFX9-NEXT: s_lshl_b32 s7, 0xffff, s4 2052; GFX9-NEXT: s_andn2_b32 s6, s6, s7 2053; GFX9-NEXT: v_mov_b32_e32 v1, s6 2054; GFX9-NEXT: v_lshl_or_b32 v6, v0, s4, v1 2055; GFX9-NEXT: v_mov_b32_e32 v0, s0 2056; GFX9-NEXT: v_mov_b32_e32 v1, s1 2057; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc 2058; GFX9-NEXT: 
v_cmp_eq_u32_e64 vcc, s5, 1 2059; GFX9-NEXT: v_mov_b32_e32 v2, s2 2060; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc 2061; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 2 2062; GFX9-NEXT: v_mov_b32_e32 v3, s3 2063; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc 2064; GFX9-NEXT: v_mov_b32_e32 v4, 0 2065; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 3 2066; GFX9-NEXT: v_mov_b32_e32 v5, 0 2067; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 2068; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off 2069; GFX9-NEXT: s_endpgm 2070; 2071; GFX8-LABEL: insertelement_s_v8i16_v_s: 2072; GFX8: ; %bb.0: 2073; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 2074; GFX8-NEXT: s_lshr_b32 s5, s4, 1 2075; GFX8-NEXT: s_cmp_eq_u32 s5, 1 2076; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 2077; GFX8-NEXT: v_mov_b32_e32 v4, 0 2078; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2079; GFX8-NEXT: s_cselect_b32 s6, s1, s0 2080; GFX8-NEXT: s_cmp_eq_u32 s5, 2 2081; GFX8-NEXT: s_cselect_b32 s6, s2, s6 2082; GFX8-NEXT: s_cmp_eq_u32 s5, 3 2083; GFX8-NEXT: s_cselect_b32 s6, s3, s6 2084; GFX8-NEXT: s_and_b32 s4, s4, 1 2085; GFX8-NEXT: s_lshl_b32 s4, s4, 4 2086; GFX8-NEXT: v_mov_b32_e32 v1, s4 2087; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s4 2088; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 2089; GFX8-NEXT: s_andn2_b32 s4, s6, s4 2090; GFX8-NEXT: v_or_b32_e32 v6, s4, v0 2091; GFX8-NEXT: v_mov_b32_e32 v0, s0 2092; GFX8-NEXT: v_mov_b32_e32 v1, s1 2093; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc 2094; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1 2095; GFX8-NEXT: v_mov_b32_e32 v2, s2 2096; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc 2097; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 2 2098; GFX8-NEXT: v_mov_b32_e32 v3, s3 2099; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc 2100; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 3 2101; GFX8-NEXT: v_mov_b32_e32 v5, 0 2102; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 2103; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 2104; GFX8-NEXT: s_endpgm 2105; 2106; 
GFX7-LABEL: insertelement_s_v8i16_v_s: 2107; GFX7: ; %bb.0: 2108; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 2109; GFX7-NEXT: s_lshr_b32 s5, s4, 1 2110; GFX7-NEXT: s_cmp_eq_u32 s5, 1 2111; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 2112; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 2113; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2114; GFX7-NEXT: s_cselect_b32 s6, s1, s0 2115; GFX7-NEXT: s_cmp_eq_u32 s5, 2 2116; GFX7-NEXT: s_cselect_b32 s6, s2, s6 2117; GFX7-NEXT: s_cmp_eq_u32 s5, 3 2118; GFX7-NEXT: s_cselect_b32 s6, s3, s6 2119; GFX7-NEXT: s_and_b32 s4, s4, 1 2120; GFX7-NEXT: s_lshl_b32 s4, s4, 4 2121; GFX7-NEXT: v_lshlrev_b32_e32 v0, s4, v0 2122; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s4 2123; GFX7-NEXT: s_andn2_b32 s4, s6, s4 2124; GFX7-NEXT: v_or_b32_e32 v4, s4, v0 2125; GFX7-NEXT: v_mov_b32_e32 v0, s0 2126; GFX7-NEXT: v_mov_b32_e32 v1, s1 2127; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 2128; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1 2129; GFX7-NEXT: v_mov_b32_e32 v2, s2 2130; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 2131; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 2 2132; GFX7-NEXT: v_mov_b32_e32 v3, s3 2133; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc 2134; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 3 2135; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 2136; GFX7-NEXT: s_mov_b64 s[0:1], 0 2137; GFX7-NEXT: s_mov_b32 s2, -1 2138; GFX7-NEXT: s_mov_b32 s3, 0xf000 2139; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 2140; GFX7-NEXT: s_endpgm 2141; 2142; GFX10-LABEL: insertelement_s_v8i16_v_s: 2143; GFX10: ; %bb.0: 2144; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 2145; GFX10-NEXT: s_lshr_b32 s5, s4, 1 2146; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v0 2147; GFX10-NEXT: s_cmp_eq_u32 s5, 1 2148; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 0 2149; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2150; GFX10-NEXT: s_cselect_b32 s6, s1, s0 2151; GFX10-NEXT: s_cmp_eq_u32 s5, 2 2152; GFX10-NEXT: v_mov_b32_e32 v0, s0 2153; GFX10-NEXT: s_cselect_b32 s6, s2, s6 2154; GFX10-NEXT: s_cmp_eq_u32 s5, 3 2155; GFX10-NEXT: 
v_mov_b32_e32 v1, s1 2156; GFX10-NEXT: s_cselect_b32 s6, s3, s6 2157; GFX10-NEXT: s_and_b32 s4, s4, 1 2158; GFX10-NEXT: v_mov_b32_e32 v2, s2 2159; GFX10-NEXT: s_lshl_b32 s4, s4, 4 2160; GFX10-NEXT: v_mov_b32_e32 v3, s3 2161; GFX10-NEXT: s_lshl_b32 s7, 0xffff, s4 2162; GFX10-NEXT: s_andn2_b32 s6, s6, s7 2163; GFX10-NEXT: v_lshl_or_b32 v6, v4, s4, s6 2164; GFX10-NEXT: v_mov_b32_e32 v4, 0 2165; GFX10-NEXT: v_mov_b32_e32 v5, 0 2166; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo 2167; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 1 2168; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo 2169; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 2 2170; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc_lo 2171; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 3 2172; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo 2173; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off 2174; GFX10-NEXT: s_endpgm 2175; 2176; GFX11-LABEL: insertelement_s_v8i16_v_s: 2177; GFX11: ; %bb.0: 2178; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 2179; GFX11-NEXT: s_lshr_b32 s5, s4, 1 2180; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v0 2181; GFX11-NEXT: s_cmp_eq_u32 s5, 1 2182; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 0 2183; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2184; GFX11-NEXT: s_cselect_b32 s6, s1, s0 2185; GFX11-NEXT: s_cmp_eq_u32 s5, 2 2186; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 2187; GFX11-NEXT: s_cselect_b32 s6, s2, s6 2188; GFX11-NEXT: s_cmp_eq_u32 s5, 3 2189; GFX11-NEXT: v_mov_b32_e32 v1, s1 2190; GFX11-NEXT: s_cselect_b32 s6, s3, s6 2191; GFX11-NEXT: s_and_b32 s4, s4, 1 2192; GFX11-NEXT: v_mov_b32_e32 v2, s2 2193; GFX11-NEXT: s_lshl_b32 s4, s4, 4 2194; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 2195; GFX11-NEXT: s_lshl_b32 s7, 0xffff, s4 2196; GFX11-NEXT: s_and_not1_b32 s6, s6, s7 2197; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2198; GFX11-NEXT: v_lshl_or_b32 v6, v4, s4, s6 2199; GFX11-NEXT: v_cndmask_b32_e32 v0, 
v0, v6, vcc_lo 2200; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 1 2201; GFX11-NEXT: v_mov_b32_e32 v4, 0 2202; GFX11-NEXT: v_mov_b32_e32 v5, 0 2203; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo 2204; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 2 2205; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc_lo 2206; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 3 2207; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo 2208; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off 2209; GFX11-NEXT: s_endpgm 2210 %vec = load <8 x i16>, ptr addrspace(4) %ptr 2211 %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx 2212 store <8 x i16> %insert, ptr addrspace(1) null 2213 ret void 2214} 2215 2216define amdgpu_ps void @insertelement_s_v8i16_s_v(ptr addrspace(4) inreg %ptr, i16 inreg %val, i32 %idx) { 2217; GFX9-LABEL: insertelement_s_v8i16_s_v: 2218; GFX9: ; %bb.0: 2219; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 2220; GFX9-NEXT: v_lshrrev_b32_e32 v4, 1, v0 2221; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 2222; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 2223; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 2224; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2225; GFX9-NEXT: v_mov_b32_e32 v1, s8 2226; GFX9-NEXT: v_mov_b32_e32 v2, s9 2227; GFX9-NEXT: v_mov_b32_e32 v3, s10 2228; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 2229; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 2230; GFX9-NEXT: v_lshlrev_b32_e32 v0, 4, v0 2231; GFX9-NEXT: s_and_b32 s4, s4, 0xffff 2232; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff 2233; GFX9-NEXT: v_mov_b32_e32 v5, s11 2234; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 2235; GFX9-NEXT: v_lshlrev_b32_e64 v2, v0, s4 2236; GFX9-NEXT: v_lshlrev_b32_e32 v0, v0, v3 2237; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] 2238; GFX9-NEXT: v_not_b32_e32 v0, v0 2239; GFX9-NEXT: v_and_or_b32 v6, v1, v0, v2 2240; GFX9-NEXT: v_mov_b32_e32 v0, s8 2241; GFX9-NEXT: v_mov_b32_e32 v1, s9 2242; GFX9-NEXT: v_mov_b32_e32 v2, s10 2243; GFX9-NEXT: v_mov_b32_e32 v3, s11 2244; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 2245; 
GFX9-NEXT: v_mov_b32_e32 v4, 0 2246; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] 2247; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc 2248; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] 2249; GFX9-NEXT: v_mov_b32_e32 v5, 0 2250; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[2:3] 2251; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off 2252; GFX9-NEXT: s_endpgm 2253; 2254; GFX8-LABEL: insertelement_s_v8i16_s_v: 2255; GFX8: ; %bb.0: 2256; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 2257; GFX8-NEXT: v_lshrrev_b32_e32 v4, 1, v0 2258; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 2259; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 2260; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 2261; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2262; GFX8-NEXT: v_mov_b32_e32 v1, s8 2263; GFX8-NEXT: v_mov_b32_e32 v2, s9 2264; GFX8-NEXT: v_mov_b32_e32 v3, s10 2265; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 2266; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 2267; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0 2268; GFX8-NEXT: s_and_b32 s4, s4, 0xffff 2269; GFX8-NEXT: v_mov_b32_e32 v3, 0xffff 2270; GFX8-NEXT: v_mov_b32_e32 v5, s11 2271; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 2272; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s4 2273; GFX8-NEXT: v_lshlrev_b32_e32 v0, v0, v3 2274; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] 2275; GFX8-NEXT: v_not_b32_e32 v0, v0 2276; GFX8-NEXT: v_and_b32_e32 v0, v1, v0 2277; GFX8-NEXT: v_or_b32_e32 v6, v0, v2 2278; GFX8-NEXT: v_mov_b32_e32 v0, s8 2279; GFX8-NEXT: v_mov_b32_e32 v1, s9 2280; GFX8-NEXT: v_mov_b32_e32 v2, s10 2281; GFX8-NEXT: v_mov_b32_e32 v3, s11 2282; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 2283; GFX8-NEXT: v_mov_b32_e32 v4, 0 2284; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] 2285; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc 2286; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] 2287; GFX8-NEXT: v_mov_b32_e32 v5, 0 2288; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[2:3] 2289; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 2290; GFX8-NEXT: s_endpgm 2291; 2292; GFX7-LABEL: 
insertelement_s_v8i16_s_v: 2293; GFX7: ; %bb.0: 2294; GFX7-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 2295; GFX7-NEXT: v_lshrrev_b32_e32 v4, 1, v0 2296; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 2297; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 2298; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 2299; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2300; GFX7-NEXT: v_mov_b32_e32 v1, s8 2301; GFX7-NEXT: v_mov_b32_e32 v2, s9 2302; GFX7-NEXT: v_mov_b32_e32 v3, s10 2303; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 2304; GFX7-NEXT: v_lshlrev_b32_e32 v0, 4, v0 2305; GFX7-NEXT: s_and_b32 s4, s4, 0xffff 2306; GFX7-NEXT: v_mov_b32_e32 v5, s11 2307; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 2308; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 2309; GFX7-NEXT: v_lshl_b32_e32 v2, s4, v0 2310; GFX7-NEXT: v_lshl_b32_e32 v0, 0xffff, v0 2311; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] 2312; GFX7-NEXT: v_not_b32_e32 v0, v0 2313; GFX7-NEXT: v_and_b32_e32 v0, v1, v0 2314; GFX7-NEXT: v_or_b32_e32 v5, v0, v2 2315; GFX7-NEXT: v_mov_b32_e32 v0, s8 2316; GFX7-NEXT: v_mov_b32_e32 v1, s9 2317; GFX7-NEXT: v_mov_b32_e32 v2, s10 2318; GFX7-NEXT: v_mov_b32_e32 v3, s11 2319; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 2320; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] 2321; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc 2322; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] 2323; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] 2324; GFX7-NEXT: s_mov_b64 s[0:1], 0 2325; GFX7-NEXT: s_mov_b32 s2, -1 2326; GFX7-NEXT: s_mov_b32 s3, 0xf000 2327; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 2328; GFX7-NEXT: s_endpgm 2329; 2330; GFX10-LABEL: insertelement_s_v8i16_s_v: 2331; GFX10: ; %bb.0: 2332; GFX10-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 2333; GFX10-NEXT: v_lshrrev_b32_e32 v6, 1, v0 2334; GFX10-NEXT: v_and_b32_e32 v1, 1, v0 2335; GFX10-NEXT: s_and_b32 s1, s4, 0xffff 2336; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 2337; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v1 2338; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v6 2339; 
GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v6 2340; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff 2341; GFX10-NEXT: v_lshlrev_b32_e64 v4, v1, s1 2342; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v6 2343; GFX10-NEXT: v_not_b32_e32 v5, v2 2344; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2345; GFX10-NEXT: v_mov_b32_e32 v0, s9 2346; GFX10-NEXT: v_cndmask_b32_e32 v0, s8, v0, vcc_lo 2347; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s10, s0 2348; GFX10-NEXT: v_cndmask_b32_e64 v7, v0, s11, s1 2349; GFX10-NEXT: v_mov_b32_e32 v0, s8 2350; GFX10-NEXT: v_mov_b32_e32 v1, s9 2351; GFX10-NEXT: v_mov_b32_e32 v2, s10 2352; GFX10-NEXT: v_mov_b32_e32 v3, s11 2353; GFX10-NEXT: v_and_or_b32 v7, v7, v5, v4 2354; GFX10-NEXT: v_mov_b32_e32 v4, 0 2355; GFX10-NEXT: v_mov_b32_e32 v5, 0 2356; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v7, s2 2357; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo 2358; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v7, s0 2359; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s1 2360; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off 2361; GFX10-NEXT: s_endpgm 2362; 2363; GFX11-LABEL: insertelement_s_v8i16_s_v: 2364; GFX11: ; %bb.0: 2365; GFX11-NEXT: s_load_b128 s[8:11], s[2:3], 0x0 2366; GFX11-NEXT: v_lshrrev_b32_e32 v6, 1, v0 2367; GFX11-NEXT: v_and_b32_e32 v1, 1, v0 2368; GFX11-NEXT: s_and_b32 s1, s4, 0xffff 2369; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1) 2370; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 2371; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v6 2372; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v6 2373; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2374; GFX11-NEXT: v_dual_mov_b32 v0, s9 :: v_dual_lshlrev_b32 v1, 4, v1 2375; GFX11-NEXT: v_cndmask_b32_e32 v0, s8, v0, vcc_lo 2376; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) 2377; GFX11-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff 2378; GFX11-NEXT: v_lshlrev_b32_e64 v4, v1, s1 2379; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 3, v6 2380; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s10, s0 2381; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) 2382; GFX11-NEXT: v_not_b32_e32 v5, v2 2383; GFX11-NEXT: v_cndmask_b32_e64 v7, v0, s11, s1 2384; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 2385; GFX11-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11 2386; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) 2387; GFX11-NEXT: v_and_or_b32 v7, v7, v5, v4 2388; GFX11-NEXT: v_mov_b32_e32 v4, 0 2389; GFX11-NEXT: v_mov_b32_e32 v5, 0 2390; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo 2391; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v7, s2 2392; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v7, s0 2393; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v7, s1 2394; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off 2395; GFX11-NEXT: s_endpgm 2396 %vec = load <8 x i16>, ptr addrspace(4) %ptr 2397 %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx 2398 store <8 x i16> %insert, ptr addrspace(1) null 2399 ret void 2400} 2401 2402define amdgpu_ps void @insertelement_s_v8i16_v_v(ptr addrspace(4) inreg %ptr, i16 %val, i32 %idx) { 2403; GFX9-LABEL: insertelement_s_v8i16_v_v: 2404; GFX9: ; %bb.0: 2405; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 2406; GFX9-NEXT: v_lshrrev_b32_e32 v4, 1, v1 2407; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 2408; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 2409; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 2410; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2411; GFX9-NEXT: v_mov_b32_e32 v2, s4 2412; GFX9-NEXT: v_mov_b32_e32 v3, s5 2413; GFX9-NEXT: v_mov_b32_e32 v5, s6 2414; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc 2415; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 2416; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff 2417; GFX9-NEXT: v_mov_b32_e32 v6, s7 2418; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] 2419; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 2420; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 2421; GFX9-NEXT: v_lshlrev_b32_e32 v1, v1, v3 2422; GFX9-NEXT: 
v_cndmask_b32_e64 v2, v2, v6, s[2:3] 2423; GFX9-NEXT: v_not_b32_e32 v1, v1 2424; GFX9-NEXT: v_and_or_b32 v6, v2, v1, v0 2425; GFX9-NEXT: v_mov_b32_e32 v0, s4 2426; GFX9-NEXT: v_mov_b32_e32 v1, s5 2427; GFX9-NEXT: v_mov_b32_e32 v2, s6 2428; GFX9-NEXT: v_mov_b32_e32 v3, s7 2429; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 2430; GFX9-NEXT: v_mov_b32_e32 v4, 0 2431; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] 2432; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc 2433; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] 2434; GFX9-NEXT: v_mov_b32_e32 v5, 0 2435; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[2:3] 2436; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off 2437; GFX9-NEXT: s_endpgm 2438; 2439; GFX8-LABEL: insertelement_s_v8i16_v_v: 2440; GFX8: ; %bb.0: 2441; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 2442; GFX8-NEXT: v_lshrrev_b32_e32 v4, 1, v1 2443; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 2444; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 2445; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 2446; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2447; GFX8-NEXT: v_mov_b32_e32 v2, s4 2448; GFX8-NEXT: v_mov_b32_e32 v3, s5 2449; GFX8-NEXT: v_mov_b32_e32 v5, s6 2450; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc 2451; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 2452; GFX8-NEXT: v_mov_b32_e32 v3, 0xffff 2453; GFX8-NEXT: v_mov_b32_e32 v6, s7 2454; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] 2455; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 2456; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 2457; GFX8-NEXT: v_lshlrev_b32_e32 v1, v1, v3 2458; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] 2459; GFX8-NEXT: v_not_b32_e32 v1, v1 2460; GFX8-NEXT: v_and_b32_e32 v1, v2, v1 2461; GFX8-NEXT: v_or_b32_e32 v6, v1, v0 2462; GFX8-NEXT: v_mov_b32_e32 v0, s4 2463; GFX8-NEXT: v_mov_b32_e32 v1, s5 2464; GFX8-NEXT: v_mov_b32_e32 v2, s6 2465; GFX8-NEXT: v_mov_b32_e32 v3, s7 2466; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 2467; GFX8-NEXT: v_mov_b32_e32 v4, 0 
2468; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] 2469; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc 2470; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] 2471; GFX8-NEXT: v_mov_b32_e32 v5, 0 2472; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[2:3] 2473; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 2474; GFX8-NEXT: s_endpgm 2475; 2476; GFX7-LABEL: insertelement_s_v8i16_v_v: 2477; GFX7: ; %bb.0: 2478; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 2479; GFX7-NEXT: v_lshrrev_b32_e32 v4, 1, v1 2480; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 2481; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 2482; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 2483; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2484; GFX7-NEXT: v_mov_b32_e32 v2, s4 2485; GFX7-NEXT: v_mov_b32_e32 v3, s5 2486; GFX7-NEXT: v_mov_b32_e32 v5, s6 2487; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc 2488; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 2489; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 2490; GFX7-NEXT: v_mov_b32_e32 v6, s7 2491; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] 2492; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 2493; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 2494; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 2495; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] 2496; GFX7-NEXT: v_not_b32_e32 v1, v1 2497; GFX7-NEXT: v_and_b32_e32 v1, v2, v1 2498; GFX7-NEXT: v_or_b32_e32 v5, v1, v0 2499; GFX7-NEXT: v_mov_b32_e32 v0, s4 2500; GFX7-NEXT: v_mov_b32_e32 v1, s5 2501; GFX7-NEXT: v_mov_b32_e32 v2, s6 2502; GFX7-NEXT: v_mov_b32_e32 v3, s7 2503; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 2504; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] 2505; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc 2506; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] 2507; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] 2508; GFX7-NEXT: s_mov_b64 s[0:1], 0 2509; GFX7-NEXT: s_mov_b32 s2, -1 2510; GFX7-NEXT: s_mov_b32 s3, 0xf000 2511; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 2512; GFX7-NEXT: s_endpgm 2513; 2514; GFX10-LABEL: insertelement_s_v8i16_v_v: 2515; 
GFX10: ; %bb.0: 2516; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 2517; GFX10-NEXT: v_lshrrev_b32_e32 v6, 1, v1 2518; GFX10-NEXT: v_and_b32_e32 v2, 1, v1 2519; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 2520; GFX10-NEXT: v_lshlrev_b32_e32 v2, 4, v2 2521; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v6 2522; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v6 2523; GFX10-NEXT: s_mov_b32 null, 0 2524; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v6 2525; GFX10-NEXT: v_lshlrev_b32_e64 v3, v2, 0xffff 2526; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 2527; GFX10-NEXT: v_not_b32_e32 v5, v3 2528; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2529; GFX10-NEXT: v_mov_b32_e32 v1, s5 2530; GFX10-NEXT: v_cndmask_b32_e32 v1, s4, v1, vcc_lo 2531; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s6, s0 2532; GFX10-NEXT: v_cndmask_b32_e64 v7, v1, s7, s1 2533; GFX10-NEXT: v_mov_b32_e32 v0, s4 2534; GFX10-NEXT: v_mov_b32_e32 v1, s5 2535; GFX10-NEXT: v_mov_b32_e32 v2, s6 2536; GFX10-NEXT: v_mov_b32_e32 v3, s7 2537; GFX10-NEXT: v_and_or_b32 v7, v7, v5, v4 2538; GFX10-NEXT: v_mov_b32_e32 v4, 0 2539; GFX10-NEXT: v_mov_b32_e32 v5, 0 2540; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v7, s2 2541; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo 2542; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v7, s0 2543; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s1 2544; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off 2545; GFX10-NEXT: s_endpgm 2546; 2547; GFX11-LABEL: insertelement_s_v8i16_v_v: 2548; GFX11: ; %bb.0: 2549; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 2550; GFX11-NEXT: v_lshrrev_b32_e32 v6, 1, v1 2551; GFX11-NEXT: v_and_b32_e32 v2, 1, v1 2552; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 2553; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) 2554; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 2555; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v6 2556; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 3, v6 2557; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v6 2558; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2559; GFX11-NEXT: v_dual_mov_b32 
v1, s5 :: v_dual_lshlrev_b32 v2, 4, v2 2560; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) 2561; GFX11-NEXT: v_cndmask_b32_e32 v1, s4, v1, vcc_lo 2562; GFX11-NEXT: v_lshlrev_b32_e64 v3, v2, 0xffff 2563; GFX11-NEXT: v_lshlrev_b32_e32 v4, v2, v0 2564; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 2565; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s6, s0 2566; GFX11-NEXT: v_not_b32_e32 v5, v3 2567; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) 2568; GFX11-NEXT: v_cndmask_b32_e64 v7, v1, s7, s1 2569; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 2570; GFX11-NEXT: v_mov_b32_e32 v3, s7 2571; GFX11-NEXT: v_and_or_b32 v7, v7, v5, v4 2572; GFX11-NEXT: v_mov_b32_e32 v2, s6 2573; GFX11-NEXT: v_mov_b32_e32 v4, 0 2574; GFX11-NEXT: v_mov_b32_e32 v5, 0 2575; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) 2576; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo 2577; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v7, s2 2578; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v7, s0 2579; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v7, s1 2580; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off 2581; GFX11-NEXT: s_endpgm 2582 %vec = load <8 x i16>, ptr addrspace(4) %ptr 2583 %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx 2584 store <8 x i16> %insert, ptr addrspace(1) null 2585 ret void 2586} 2587 2588define amdgpu_ps void @insertelement_v_v8i16_s_v(ptr addrspace(1) %ptr, i16 inreg %val, i32 %idx) { 2589; GFX9-LABEL: insertelement_v_v8i16_s_v: 2590; GFX9: ; %bb.0: 2591; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off 2592; GFX9-NEXT: v_lshrrev_b32_e32 v1, 1, v2 2593; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 2594; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff 2595; GFX9-NEXT: s_and_b32 s0, s2, 0xffff 2596; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v2 2597; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 2598; GFX9-NEXT: v_lshlrev_b32_e64 v9, v2, s0 2599; GFX9-NEXT: v_lshlrev_b32_e32 v0, v2, v0 2600; GFX9-NEXT: 
v_cmp_eq_u32_e64 s[0:1], 2, v1 2601; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v1 2602; GFX9-NEXT: v_not_b32_e32 v0, v0 2603; GFX9-NEXT: v_mov_b32_e32 v7, 0 2604; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 2605; GFX9-NEXT: v_mov_b32_e32 v8, 0 2606; GFX9-NEXT: s_waitcnt vmcnt(0) 2607; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc 2608; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] 2609; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] 2610; GFX9-NEXT: v_and_or_b32 v9, v2, v0, v9 2611; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v9, s[4:5] 2612; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc 2613; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v9, s[0:1] 2614; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v9, s[2:3] 2615; GFX9-NEXT: global_store_dwordx4 v[7:8], v[0:3], off 2616; GFX9-NEXT: s_endpgm 2617; 2618; GFX8-LABEL: insertelement_v_v8i16_s_v: 2619; GFX8: ; %bb.0: 2620; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1] 2621; GFX8-NEXT: v_lshrrev_b32_e32 v1, 1, v2 2622; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 2623; GFX8-NEXT: v_mov_b32_e32 v0, 0xffff 2624; GFX8-NEXT: s_and_b32 s0, s2, 0xffff 2625; GFX8-NEXT: v_lshlrev_b32_e32 v2, 4, v2 2626; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 2627; GFX8-NEXT: v_lshlrev_b32_e64 v9, v2, s0 2628; GFX8-NEXT: v_lshlrev_b32_e32 v0, v2, v0 2629; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v1 2630; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v1 2631; GFX8-NEXT: v_not_b32_e32 v0, v0 2632; GFX8-NEXT: v_mov_b32_e32 v7, 0 2633; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 2634; GFX8-NEXT: v_mov_b32_e32 v8, 0 2635; GFX8-NEXT: s_waitcnt vmcnt(0) 2636; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc 2637; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] 2638; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] 2639; GFX8-NEXT: v_and_b32_e32 v0, v2, v0 2640; GFX8-NEXT: v_or_b32_e32 v9, v0, v9 2641; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v9, s[4:5] 2642; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc 2643; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v9, s[0:1] 2644; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v9, s[2:3] 
2645; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[0:3] 2646; GFX8-NEXT: s_endpgm 2647; 2648; GFX7-LABEL: insertelement_v_v8i16_s_v: 2649; GFX7: ; %bb.0: 2650; GFX7-NEXT: s_mov_b32 s10, 0 2651; GFX7-NEXT: s_mov_b32 s11, 0xf000 2652; GFX7-NEXT: s_mov_b64 s[8:9], 0 2653; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[8:11], 0 addr64 2654; GFX7-NEXT: v_lshrrev_b32_e32 v0, 1, v2 2655; GFX7-NEXT: v_and_b32_e32 v1, 1, v2 2656; GFX7-NEXT: s_and_b32 s0, s2, 0xffff 2657; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 2658; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 2659; GFX7-NEXT: v_lshl_b32_e32 v2, s0, v1 2660; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 2661; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 2662; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 2663; GFX7-NEXT: v_not_b32_e32 v1, v1 2664; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 2665; GFX7-NEXT: s_mov_b64 s[8:9], 0 2666; GFX7-NEXT: s_mov_b32 s10, -1 2667; GFX7-NEXT: s_waitcnt vmcnt(0) 2668; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc 2669; GFX7-NEXT: v_cndmask_b32_e64 v7, v7, v5, s[0:1] 2670; GFX7-NEXT: v_cndmask_b32_e64 v7, v7, v6, s[2:3] 2671; GFX7-NEXT: v_and_b32_e32 v1, v7, v1 2672; GFX7-NEXT: v_or_b32_e32 v7, v1, v2 2673; GFX7-NEXT: v_cndmask_b32_e64 v0, v3, v7, s[4:5] 2674; GFX7-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc 2675; GFX7-NEXT: v_cndmask_b32_e64 v2, v5, v7, s[0:1] 2676; GFX7-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[2:3] 2677; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 2678; GFX7-NEXT: s_endpgm 2679; 2680; GFX10-LABEL: insertelement_v_v8i16_s_v: 2681; GFX10: ; %bb.0: 2682; GFX10-NEXT: global_load_dwordx4 v[3:6], v[0:1], off 2683; GFX10-NEXT: v_lshrrev_b32_e32 v1, 1, v2 2684; GFX10-NEXT: v_and_b32_e32 v0, 1, v2 2685; GFX10-NEXT: s_and_b32 s1, s2, 0xffff 2686; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 2687; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 2688; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v1 2689; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v1 2690; GFX10-NEXT: v_lshlrev_b32_e64 v7, v0, 0xffff 2691; GFX10-NEXT: 
v_lshlrev_b32_e64 v0, v0, s1 2692; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v1 2693; GFX10-NEXT: v_not_b32_e32 v7, v7 2694; GFX10-NEXT: s_waitcnt vmcnt(0) 2695; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo 2696; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v5, s0 2697; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s1 2698; GFX10-NEXT: v_and_or_b32 v9, v2, v7, v0 2699; GFX10-NEXT: v_mov_b32_e32 v7, 0 2700; GFX10-NEXT: v_mov_b32_e32 v8, 0 2701; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v9, s2 2702; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo 2703; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v9, s0 2704; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v9, s1 2705; GFX10-NEXT: global_store_dwordx4 v[7:8], v[0:3], off 2706; GFX10-NEXT: s_endpgm 2707; 2708; GFX11-LABEL: insertelement_v_v8i16_s_v: 2709; GFX11: ; %bb.0: 2710; GFX11-NEXT: global_load_b128 v[3:6], v[0:1], off 2711; GFX11-NEXT: v_lshrrev_b32_e32 v1, 1, v2 2712; GFX11-NEXT: v_and_b32_e32 v0, 1, v2 2713; GFX11-NEXT: s_and_b32 s1, s2, 0xffff 2714; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 2715; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 2716; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0 2717; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v1 2718; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v1 2719; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) 2720; GFX11-NEXT: v_lshlrev_b32_e64 v7, v0, 0xffff 2721; GFX11-NEXT: v_lshlrev_b32_e64 v0, v0, s1 2722; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 3, v1 2723; GFX11-NEXT: v_not_b32_e32 v7, v7 2724; GFX11-NEXT: s_waitcnt vmcnt(0) 2725; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo 2726; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2727; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v5, s0 2728; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v6, s1 2729; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 2730; GFX11-NEXT: v_and_or_b32 v9, v2, v7, v0 2731; GFX11-NEXT: v_mov_b32_e32 v7, 0 2732; 
GFX11-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_cndmask_b32 v1, v4, v9 2733; GFX11-NEXT: v_cndmask_b32_e64 v0, v3, v9, s2 2734; GFX11-NEXT: v_cndmask_b32_e64 v2, v5, v9, s0 2735; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v9, s1 2736; GFX11-NEXT: global_store_b128 v[7:8], v[0:3], off 2737; GFX11-NEXT: s_endpgm 2738 %vec = load <8 x i16>, ptr addrspace(1) %ptr 2739 %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx 2740 store <8 x i16> %insert, ptr addrspace(1) null 2741 ret void 2742} 2743 2744define amdgpu_ps void @insertelement_v_v8i16_v_s(ptr addrspace(1) %ptr, i16 %val, i32 inreg %idx) { 2745; GFX9-LABEL: insertelement_v_v8i16_v_s: 2746; GFX9: ; %bb.0: 2747; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off 2748; GFX9-NEXT: s_and_b32 s0, s2, 1 2749; GFX9-NEXT: s_lshr_b32 s4, s2, 1 2750; GFX9-NEXT: s_lshl_b32 s0, s0, 4 2751; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 2752; GFX9-NEXT: s_lshl_b32 s0, 0xffff, s0 2753; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 2754; GFX9-NEXT: s_not_b32 s5, s0 2755; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 2756; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 2757; GFX9-NEXT: v_mov_b32_e32 v7, 0 2758; GFX9-NEXT: v_mov_b32_e32 v8, 0 2759; GFX9-NEXT: s_waitcnt vmcnt(0) 2760; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc 2761; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1] 2762; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3] 2763; GFX9-NEXT: v_and_or_b32 v9, v1, s5, v0 2764; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 2765; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v9, s[4:5] 2766; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc 2767; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v9, s[0:1] 2768; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v9, s[2:3] 2769; GFX9-NEXT: global_store_dwordx4 v[7:8], v[0:3], off 2770; GFX9-NEXT: s_endpgm 2771; 2772; GFX8-LABEL: insertelement_v_v8i16_v_s: 2773; GFX8: ; %bb.0: 2774; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1] 2775; GFX8-NEXT: s_and_b32 s0, s2, 1 2776; GFX8-NEXT: 
s_lshr_b32 s4, s2, 1 2777; GFX8-NEXT: s_lshl_b32 s0, s0, 4 2778; GFX8-NEXT: v_mov_b32_e32 v0, s0 2779; GFX8-NEXT: s_lshl_b32 s0, 0xffff, s0 2780; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 2781; GFX8-NEXT: s_not_b32 s5, s0 2782; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 2783; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 2784; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 2785; GFX8-NEXT: v_mov_b32_e32 v7, 0 2786; GFX8-NEXT: v_mov_b32_e32 v8, 0 2787; GFX8-NEXT: s_waitcnt vmcnt(0) 2788; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc 2789; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1] 2790; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3] 2791; GFX8-NEXT: v_and_b32_e32 v1, s5, v1 2792; GFX8-NEXT: v_or_b32_e32 v9, v1, v0 2793; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 2794; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v9, s[4:5] 2795; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc 2796; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v9, s[0:1] 2797; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v9, s[2:3] 2798; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[0:3] 2799; GFX8-NEXT: s_endpgm 2800; 2801; GFX7-LABEL: insertelement_v_v8i16_v_s: 2802; GFX7: ; %bb.0: 2803; GFX7-NEXT: s_mov_b32 s10, 0 2804; GFX7-NEXT: s_mov_b32 s11, 0xf000 2805; GFX7-NEXT: s_mov_b64 s[8:9], 0 2806; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[8:11], 0 addr64 2807; GFX7-NEXT: s_and_b32 s0, s2, 1 2808; GFX7-NEXT: s_lshr_b32 s4, s2, 1 2809; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v2 2810; GFX7-NEXT: s_lshl_b32 s0, s0, 4 2811; GFX7-NEXT: v_lshlrev_b32_e32 v0, s0, v0 2812; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0 2813; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 2814; GFX7-NEXT: s_not_b32 s5, s0 2815; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 2816; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 2817; GFX7-NEXT: s_mov_b64 s[8:9], 0 2818; GFX7-NEXT: s_mov_b32 s10, -1 2819; GFX7-NEXT: s_waitcnt vmcnt(0) 2820; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc 2821; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, 
v5, s[0:1] 2822; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3] 2823; GFX7-NEXT: v_and_b32_e32 v1, s5, v1 2824; GFX7-NEXT: v_or_b32_e32 v7, v1, v0 2825; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 2826; GFX7-NEXT: v_cndmask_b32_e64 v0, v3, v7, s[4:5] 2827; GFX7-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc 2828; GFX7-NEXT: v_cndmask_b32_e64 v2, v5, v7, s[0:1] 2829; GFX7-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[2:3] 2830; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 2831; GFX7-NEXT: s_endpgm 2832; 2833; GFX10-LABEL: insertelement_v_v8i16_v_s: 2834; GFX10: ; %bb.0: 2835; GFX10-NEXT: global_load_dwordx4 v[3:6], v[0:1], off 2836; GFX10-NEXT: s_lshr_b32 s3, s2, 1 2837; GFX10-NEXT: s_and_b32 s1, s2, 1 2838; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s3, 1 2839; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, 2 2840; GFX10-NEXT: s_lshl_b32 s2, s1, 4 2841; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s3, 3 2842; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 2843; GFX10-NEXT: s_lshl_b32 s2, 0xffff, s2 2844; GFX10-NEXT: v_mov_b32_e32 v7, 0 2845; GFX10-NEXT: s_not_b32 s2, s2 2846; GFX10-NEXT: v_mov_b32_e32 v8, 0 2847; GFX10-NEXT: s_waitcnt vmcnt(0) 2848; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo 2849; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v5, s0 2850; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s1 2851; GFX10-NEXT: v_and_or_b32 v9, v0, s2, v1 2852; GFX10-NEXT: v_cmp_eq_u32_e64 s2, s3, 0 2853; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo 2854; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v9, s2 2855; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v9, s0 2856; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v9, s1 2857; GFX10-NEXT: global_store_dwordx4 v[7:8], v[0:3], off 2858; GFX10-NEXT: s_endpgm 2859; 2860; GFX11-LABEL: insertelement_v_v8i16_v_s: 2861; GFX11: ; %bb.0: 2862; GFX11-NEXT: global_load_b128 v[3:6], v[0:1], off 2863; GFX11-NEXT: s_lshr_b32 s3, s2, 1 2864; GFX11-NEXT: s_and_b32 s1, s2, 1 2865; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s3, 1 2866; 
GFX11-NEXT: v_cmp_eq_u32_e64 s0, s3, 2 2867; GFX11-NEXT: s_lshl_b32 s2, s1, 4 2868; GFX11-NEXT: v_cmp_eq_u32_e64 s1, s3, 3 2869; GFX11-NEXT: v_mov_b32_e32 v7, 0 2870; GFX11-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_and_b32 v1, 0xffff, v2 2871; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 2872; GFX11-NEXT: v_lshlrev_b32_e32 v1, s2, v1 2873; GFX11-NEXT: s_lshl_b32 s2, 0xffff, s2 2874; GFX11-NEXT: s_not_b32 s2, s2 2875; GFX11-NEXT: s_waitcnt vmcnt(0) 2876; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo 2877; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2878; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v5, s0 2879; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v6, s1 2880; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 2881; GFX11-NEXT: v_and_or_b32 v9, v0, s2, v1 2882; GFX11-NEXT: v_cmp_eq_u32_e64 s2, s3, 0 2883; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo 2884; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 2885; GFX11-NEXT: v_cndmask_b32_e64 v0, v3, v9, s2 2886; GFX11-NEXT: v_cndmask_b32_e64 v2, v5, v9, s0 2887; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v9, s1 2888; GFX11-NEXT: global_store_b128 v[7:8], v[0:3], off 2889; GFX11-NEXT: s_endpgm 2890 %vec = load <8 x i16>, ptr addrspace(1) %ptr 2891 %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx 2892 store <8 x i16> %insert, ptr addrspace(1) null 2893 ret void 2894} 2895 2896define amdgpu_ps void @insertelement_v_v8i16_v_v(ptr addrspace(1) %ptr, i16 %val, i32 %idx) { 2897; GFX9-LABEL: insertelement_v_v8i16_v_v: 2898; GFX9: ; %bb.0: 2899; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off 2900; GFX9-NEXT: v_lshrrev_b32_e32 v1, 1, v3 2901; GFX9-NEXT: v_and_b32_e32 v3, 1, v3 2902; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff 2903; GFX9-NEXT: v_lshlrev_b32_e32 v3, 4, v3 2904; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 2905; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_0 2906; GFX9-NEXT: v_lshlrev_b32_e32 v0, v3, v0 2907; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v1 2908; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v1 2909; GFX9-NEXT: v_not_b32_e32 v0, v0 2910; GFX9-NEXT: v_mov_b32_e32 v8, 0 2911; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 2912; GFX9-NEXT: v_mov_b32_e32 v9, 0 2913; GFX9-NEXT: s_waitcnt vmcnt(0) 2914; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc 2915; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] 2916; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3] 2917; GFX9-NEXT: v_and_or_b32 v3, v3, v0, v2 2918; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, v3, s[4:5] 2919; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc 2920; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v3, s[0:1] 2921; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[2:3] 2922; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off 2923; GFX9-NEXT: s_endpgm 2924; 2925; GFX8-LABEL: insertelement_v_v8i16_v_v: 2926; GFX8: ; %bb.0: 2927; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[0:1] 2928; GFX8-NEXT: v_lshrrev_b32_e32 v1, 1, v3 2929; GFX8-NEXT: v_and_b32_e32 v3, 1, v3 2930; GFX8-NEXT: v_mov_b32_e32 v0, 0xffff 2931; GFX8-NEXT: v_lshlrev_b32_e32 v3, 4, v3 2932; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 2933; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 2934; GFX8-NEXT: v_lshlrev_b32_e32 v0, v3, v0 2935; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v1 2936; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v1 2937; GFX8-NEXT: v_not_b32_e32 v0, v0 2938; GFX8-NEXT: v_mov_b32_e32 v8, 0 2939; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 2940; GFX8-NEXT: v_mov_b32_e32 v9, 0 2941; GFX8-NEXT: s_waitcnt vmcnt(0) 2942; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc 2943; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] 2944; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3] 2945; GFX8-NEXT: v_and_b32_e32 v0, v3, v0 2946; GFX8-NEXT: v_or_b32_e32 v3, v0, v2 2947; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, v3, s[4:5] 2948; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc 2949; 
GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v3, s[0:1] 2950; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[2:3] 2951; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] 2952; GFX8-NEXT: s_endpgm 2953; 2954; GFX7-LABEL: insertelement_v_v8i16_v_v: 2955; GFX7: ; %bb.0: 2956; GFX7-NEXT: s_mov_b32 s10, 0 2957; GFX7-NEXT: s_mov_b32 s11, 0xf000 2958; GFX7-NEXT: s_mov_b64 s[8:9], 0 2959; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 2960; GFX7-NEXT: v_lshrrev_b32_e32 v0, 1, v3 2961; GFX7-NEXT: v_and_b32_e32 v1, 1, v3 2962; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 2963; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 2964; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 2965; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 2966; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2 2967; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 2968; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 2969; GFX7-NEXT: v_not_b32_e32 v1, v1 2970; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 2971; GFX7-NEXT: s_mov_b64 s[8:9], 0 2972; GFX7-NEXT: s_mov_b32 s10, -1 2973; GFX7-NEXT: s_waitcnt vmcnt(0) 2974; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc 2975; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] 2976; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3] 2977; GFX7-NEXT: v_and_b32_e32 v1, v3, v1 2978; GFX7-NEXT: v_or_b32_e32 v3, v1, v2 2979; GFX7-NEXT: v_cndmask_b32_e64 v0, v4, v3, s[4:5] 2980; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc 2981; GFX7-NEXT: v_cndmask_b32_e64 v2, v6, v3, s[0:1] 2982; GFX7-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[2:3] 2983; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 2984; GFX7-NEXT: s_endpgm 2985; 2986; GFX10-LABEL: insertelement_v_v8i16_v_v: 2987; GFX10: ; %bb.0: 2988; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off 2989; GFX10-NEXT: v_lshrrev_b32_e32 v1, 1, v3 2990; GFX10-NEXT: v_and_b32_e32 v0, 1, v3 2991; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 2992; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 2993; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v1 2994; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v1 2995; 
GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v1 2996; GFX10-NEXT: v_lshlrev_b32_e64 v8, v0, 0xffff 2997; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 2998; GFX10-NEXT: v_not_b32_e32 v2, v8 2999; GFX10-NEXT: v_mov_b32_e32 v8, 0 3000; GFX10-NEXT: v_mov_b32_e32 v9, 0 3001; GFX10-NEXT: s_waitcnt vmcnt(0) 3002; GFX10-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo 3003; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v6, s0 3004; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s1 3005; GFX10-NEXT: v_and_or_b32 v3, v3, v2, v0 3006; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, v3, s2 3007; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc_lo 3008; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v3, s0 3009; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v3, s1 3010; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off 3011; GFX10-NEXT: s_endpgm 3012; 3013; GFX11-LABEL: insertelement_v_v8i16_v_v: 3014; GFX11: ; %bb.0: 3015; GFX11-NEXT: global_load_b128 v[4:7], v[0:1], off 3016; GFX11-NEXT: v_lshrrev_b32_e32 v1, 1, v3 3017; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 3018; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 3019; GFX11-NEXT: v_and_b32_e32 v0, 1, v3 3020; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v1 3021; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 3022; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 3, v1 3023; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v1 3024; GFX11-NEXT: s_waitcnt vmcnt(0) 3025; GFX11-NEXT: v_dual_cndmask_b32 v3, v4, v5 :: v_dual_lshlrev_b32 v0, 4, v0 3026; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) 3027; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v6, s0 3028; GFX11-NEXT: v_lshlrev_b32_e64 v8, v0, 0xffff 3029; GFX11-NEXT: v_lshlrev_b32_e32 v0, v0, v2 3030; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 3031; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v7, s1 3032; GFX11-NEXT: v_not_b32_e32 v2, v8 3033; GFX11-NEXT: v_mov_b32_e32 v8, 0 3034; GFX11-NEXT: v_mov_b32_e32 v9, 0 3035; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 3036; GFX11-NEXT: v_and_or_b32 v3, v3, v2, v0 3037; GFX11-NEXT: v_cndmask_b32_e64 v0, v4, v3, s2 3038; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc_lo 3039; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, v3, s0 3040; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v3, s1 3041; GFX11-NEXT: global_store_b128 v[8:9], v[0:3], off 3042; GFX11-NEXT: s_endpgm 3043 %vec = load <8 x i16>, ptr addrspace(1) %ptr 3044 %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx 3045 store <8 x i16> %insert, ptr addrspace(1) null 3046 ret void 3047} 3048 3049define amdgpu_ps void @insertelement_s_v16i16_s_s(ptr addrspace(4) inreg %ptr, i16 inreg %val, i32 inreg %idx) { 3050; GFX9-LABEL: insertelement_s_v16i16_s_s: 3051; GFX9: ; %bb.0: 3052; GFX9-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 3053; GFX9-NEXT: s_lshr_b32 s7, s5, 1 3054; GFX9-NEXT: s_cmp_eq_u32 s7, 1 3055; GFX9-NEXT: v_mov_b32_e32 v4, 0 3056; GFX9-NEXT: v_mov_b32_e32 v5, 0 3057; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3058; GFX9-NEXT: s_cselect_b32 s0, s9, s8 3059; GFX9-NEXT: s_cmp_eq_u32 s7, 2 3060; GFX9-NEXT: s_cselect_b32 s0, s10, s0 3061; GFX9-NEXT: s_cmp_eq_u32 s7, 3 3062; GFX9-NEXT: s_cselect_b32 s0, s11, s0 3063; GFX9-NEXT: s_cmp_eq_u32 s7, 4 3064; GFX9-NEXT: s_cselect_b32 s0, s12, s0 3065; GFX9-NEXT: s_cmp_eq_u32 s7, 5 3066; GFX9-NEXT: s_cselect_b32 s0, s13, s0 3067; GFX9-NEXT: s_cmp_eq_u32 s7, 6 3068; GFX9-NEXT: s_cselect_b32 s0, s14, s0 3069; GFX9-NEXT: s_cmp_eq_u32 s7, 7 3070; GFX9-NEXT: s_cselect_b32 s0, s15, s0 3071; GFX9-NEXT: s_and_b32 s1, s5, 1 3072; GFX9-NEXT: s_lshl_b32 s1, s1, 4 3073; GFX9-NEXT: s_and_b32 s2, s4, 0xffff 3074; GFX9-NEXT: s_lshl_b32 s2, s2, s1 3075; GFX9-NEXT: s_lshl_b32 s1, 0xffff, s1 3076; GFX9-NEXT: s_andn2_b32 s0, s0, s1 3077; GFX9-NEXT: s_or_b32 s16, s0, s2 3078; GFX9-NEXT: s_cmp_eq_u32 s7, 0 3079; GFX9-NEXT: s_cselect_b32 s0, s16, s8 3080; GFX9-NEXT: s_cmp_eq_u32 s7, 1 3081; GFX9-NEXT: s_cselect_b32 s1, s16, s9 3082; GFX9-NEXT: s_cmp_eq_u32 s7, 2 
3083; GFX9-NEXT: s_cselect_b32 s2, s16, s10 3084; GFX9-NEXT: s_cmp_eq_u32 s7, 3 3085; GFX9-NEXT: s_cselect_b32 s3, s16, s11 3086; GFX9-NEXT: s_cmp_eq_u32 s7, 4 3087; GFX9-NEXT: s_cselect_b32 s4, s16, s12 3088; GFX9-NEXT: s_cmp_eq_u32 s7, 5 3089; GFX9-NEXT: s_cselect_b32 s5, s16, s13 3090; GFX9-NEXT: s_cmp_eq_u32 s7, 6 3091; GFX9-NEXT: v_mov_b32_e32 v0, s0 3092; GFX9-NEXT: s_cselect_b32 s6, s16, s14 3093; GFX9-NEXT: s_cmp_eq_u32 s7, 7 3094; GFX9-NEXT: v_mov_b32_e32 v1, s1 3095; GFX9-NEXT: v_mov_b32_e32 v2, s2 3096; GFX9-NEXT: v_mov_b32_e32 v3, s3 3097; GFX9-NEXT: s_cselect_b32 s7, s16, s15 3098; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off 3099; GFX9-NEXT: v_mov_b32_e32 v4, 16 3100; GFX9-NEXT: v_mov_b32_e32 v0, s4 3101; GFX9-NEXT: v_mov_b32_e32 v5, 0 3102; GFX9-NEXT: v_mov_b32_e32 v1, s5 3103; GFX9-NEXT: v_mov_b32_e32 v2, s6 3104; GFX9-NEXT: v_mov_b32_e32 v3, s7 3105; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off 3106; GFX9-NEXT: s_endpgm 3107; 3108; GFX8-LABEL: insertelement_s_v16i16_s_s: 3109; GFX8: ; %bb.0: 3110; GFX8-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 3111; GFX8-NEXT: s_and_b32 s1, s5, 1 3112; GFX8-NEXT: s_lshr_b32 m0, s5, 1 3113; GFX8-NEXT: s_lshl_b32 s1, s1, 4 3114; GFX8-NEXT: s_and_b32 s2, s4, 0xffff 3115; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3116; GFX8-NEXT: s_movrels_b32 s0, s8 3117; GFX8-NEXT: s_lshl_b32 s2, s2, s1 3118; GFX8-NEXT: s_lshl_b32 s1, 0xffff, s1 3119; GFX8-NEXT: s_andn2_b32 s0, s0, s1 3120; GFX8-NEXT: s_or_b32 s0, s0, s2 3121; GFX8-NEXT: s_movreld_b32 s8, s0 3122; GFX8-NEXT: v_mov_b32_e32 v4, 0 3123; GFX8-NEXT: v_mov_b32_e32 v0, s8 3124; GFX8-NEXT: v_mov_b32_e32 v5, 0 3125; GFX8-NEXT: v_mov_b32_e32 v1, s9 3126; GFX8-NEXT: v_mov_b32_e32 v2, s10 3127; GFX8-NEXT: v_mov_b32_e32 v3, s11 3128; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 3129; GFX8-NEXT: v_mov_b32_e32 v4, 16 3130; GFX8-NEXT: v_mov_b32_e32 v0, s12 3131; GFX8-NEXT: v_mov_b32_e32 v5, 0 3132; GFX8-NEXT: v_mov_b32_e32 v1, s13 3133; GFX8-NEXT: v_mov_b32_e32 v2, s14 3134; 
GFX8-NEXT: v_mov_b32_e32 v3, s15 3135; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 3136; GFX8-NEXT: s_endpgm 3137; 3138; GFX7-LABEL: insertelement_s_v16i16_s_s: 3139; GFX7: ; %bb.0: 3140; GFX7-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 3141; GFX7-NEXT: s_and_b32 s1, s5, 1 3142; GFX7-NEXT: s_lshr_b32 m0, s5, 1 3143; GFX7-NEXT: s_lshl_b32 s1, s1, 4 3144; GFX7-NEXT: s_and_b32 s2, s4, 0xffff 3145; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3146; GFX7-NEXT: s_movrels_b32 s0, s8 3147; GFX7-NEXT: s_lshl_b32 s2, s2, s1 3148; GFX7-NEXT: s_lshl_b32 s1, 0xffff, s1 3149; GFX7-NEXT: s_andn2_b32 s0, s0, s1 3150; GFX7-NEXT: s_or_b32 s0, s0, s2 3151; GFX7-NEXT: s_movreld_b32 s8, s0 3152; GFX7-NEXT: v_mov_b32_e32 v0, s8 3153; GFX7-NEXT: s_mov_b64 s[0:1], 0 3154; GFX7-NEXT: v_mov_b32_e32 v1, s9 3155; GFX7-NEXT: v_mov_b32_e32 v2, s10 3156; GFX7-NEXT: v_mov_b32_e32 v3, s11 3157; GFX7-NEXT: s_mov_b32 s2, -1 3158; GFX7-NEXT: s_mov_b32 s3, 0xf000 3159; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 3160; GFX7-NEXT: s_mov_b64 s[0:1], 16 3161; GFX7-NEXT: v_mov_b32_e32 v0, s12 3162; GFX7-NEXT: v_mov_b32_e32 v1, s13 3163; GFX7-NEXT: v_mov_b32_e32 v2, s14 3164; GFX7-NEXT: v_mov_b32_e32 v3, s15 3165; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 3166; GFX7-NEXT: s_endpgm 3167; 3168; GFX10-LABEL: insertelement_s_v16i16_s_s: 3169; GFX10: ; %bb.0: 3170; GFX10-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 3171; GFX10-NEXT: s_and_b32 s0, s5, 1 3172; GFX10-NEXT: s_lshr_b32 m0, s5, 1 3173; GFX10-NEXT: s_lshl_b32 s0, s0, 4 3174; GFX10-NEXT: s_and_b32 s1, s4, 0xffff 3175; GFX10-NEXT: s_lshl_b32 s2, 0xffff, s0 3176; GFX10-NEXT: s_lshl_b32 s0, s1, s0 3177; GFX10-NEXT: v_mov_b32_e32 v8, 0 3178; GFX10-NEXT: v_mov_b32_e32 v9, 0 3179; GFX10-NEXT: v_mov_b32_e32 v10, 16 3180; GFX10-NEXT: v_mov_b32_e32 v11, 0 3181; GFX10-NEXT: s_waitcnt lgkmcnt(0) 3182; GFX10-NEXT: s_movrels_b32 s3, s8 3183; GFX10-NEXT: s_andn2_b32 s1, s3, s2 3184; GFX10-NEXT: s_or_b32 s0, s1, s0 3185; GFX10-NEXT: s_movreld_b32 s8, s0 3186; 
GFX10-NEXT: v_mov_b32_e32 v0, s8 3187; GFX10-NEXT: v_mov_b32_e32 v1, s9 3188; GFX10-NEXT: v_mov_b32_e32 v2, s10 3189; GFX10-NEXT: v_mov_b32_e32 v3, s11 3190; GFX10-NEXT: v_mov_b32_e32 v4, s12 3191; GFX10-NEXT: v_mov_b32_e32 v5, s13 3192; GFX10-NEXT: v_mov_b32_e32 v6, s14 3193; GFX10-NEXT: v_mov_b32_e32 v7, s15 3194; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off 3195; GFX10-NEXT: global_store_dwordx4 v[10:11], v[4:7], off 3196; GFX10-NEXT: s_endpgm 3197; 3198; GFX11-LABEL: insertelement_s_v16i16_s_s: 3199; GFX11: ; %bb.0: 3200; GFX11-NEXT: s_load_b256 s[8:15], s[2:3], 0x0 3201; GFX11-NEXT: s_and_b32 s0, s5, 1 3202; GFX11-NEXT: s_lshr_b32 m0, s5, 1 3203; GFX11-NEXT: s_lshl_b32 s0, s0, 4 3204; GFX11-NEXT: s_and_b32 s1, s4, 0xffff 3205; GFX11-NEXT: s_lshl_b32 s2, 0xffff, s0 3206; GFX11-NEXT: s_lshl_b32 s0, s1, s0 3207; GFX11-NEXT: v_mov_b32_e32 v8, 0 3208; GFX11-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v10, 16 3209; GFX11-NEXT: v_mov_b32_e32 v11, 0 3210; GFX11-NEXT: s_waitcnt lgkmcnt(0) 3211; GFX11-NEXT: s_movrels_b32 s3, s8 3212; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 3213; GFX11-NEXT: s_and_not1_b32 s1, s3, s2 3214; GFX11-NEXT: s_or_b32 s0, s1, s0 3215; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 3216; GFX11-NEXT: s_movreld_b32 s8, s0 3217; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 3218; GFX11-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11 3219; GFX11-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s13 3220; GFX11-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s15 3221; GFX11-NEXT: s_clause 0x1 3222; GFX11-NEXT: global_store_b128 v[8:9], v[0:3], off 3223; GFX11-NEXT: global_store_b128 v[10:11], v[4:7], off 3224; GFX11-NEXT: s_endpgm 3225 %vec = load <16 x i16>, ptr addrspace(4) %ptr 3226 %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx 3227 store <16 x i16> %insert, ptr addrspace(1) null 3228 ret void 3229} 3230 3231define amdgpu_ps void 
@insertelement_v_v16i16_s_s(ptr addrspace(1) %ptr, i16 inreg %val, i32 inreg %idx) { 3232; GFX9-LABEL: insertelement_v_v16i16_s_s: 3233; GFX9: ; %bb.0: 3234; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off 3235; GFX9-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16 3236; GFX9-NEXT: s_and_b32 s0, s3, 1 3237; GFX9-NEXT: s_lshr_b32 s12, s3, 1 3238; GFX9-NEXT: s_and_b32 s1, s2, 0xffff 3239; GFX9-NEXT: s_lshl_b32 s0, s0, 4 3240; GFX9-NEXT: s_lshl_b32 s1, s1, s0 3241; GFX9-NEXT: s_lshl_b32 s0, 0xffff, s0 3242; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s12, 1 3243; GFX9-NEXT: s_not_b32 s13, s0 3244; GFX9-NEXT: v_mov_b32_e32 v0, s1 3245; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s12, 2 3246; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s12, 3 3247; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 4 3248; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], s12, 5 3249; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], s12, 6 3250; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], s12, 7 3251; GFX9-NEXT: v_mov_b32_e32 v10, 0 3252; GFX9-NEXT: v_mov_b32_e32 v11, 0 3253; GFX9-NEXT: v_mov_b32_e32 v12, 16 3254; GFX9-NEXT: v_mov_b32_e32 v13, 0 3255; GFX9-NEXT: s_waitcnt vmcnt(1) 3256; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc 3257; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1] 3258; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] 3259; GFX9-NEXT: s_waitcnt vmcnt(0) 3260; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[4:5] 3261; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[6:7] 3262; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[8:9] 3263; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[10:11] 3264; GFX9-NEXT: v_and_or_b32 v14, v1, s13, v0 3265; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], s12, 0 3266; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v14, s[12:13] 3267; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v14, vcc 3268; GFX9-NEXT: v_cndmask_b32_e64 v2, v4, v14, s[0:1] 3269; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v14, s[2:3] 3270; GFX9-NEXT: v_cndmask_b32_e64 v4, v6, v14, s[4:5] 3271; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v14, s[6:7] 3272; GFX9-NEXT: v_cndmask_b32_e64 v6, 
v8, v14, s[8:9] 3273; GFX9-NEXT: v_cndmask_b32_e64 v7, v9, v14, s[10:11] 3274; GFX9-NEXT: global_store_dwordx4 v[10:11], v[0:3], off 3275; GFX9-NEXT: global_store_dwordx4 v[12:13], v[4:7], off 3276; GFX9-NEXT: s_endpgm 3277; 3278; GFX8-LABEL: insertelement_v_v16i16_s_s: 3279; GFX8: ; %bb.0: 3280; GFX8-NEXT: v_add_u32_e32 v4, vcc, 16, v0 3281; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc 3282; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 3283; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[4:5] 3284; GFX8-NEXT: s_and_b32 s0, s3, 1 3285; GFX8-NEXT: s_and_b32 s1, s2, 0xffff 3286; GFX8-NEXT: s_lshl_b32 s0, s0, 4 3287; GFX8-NEXT: s_lshr_b32 m0, s3, 1 3288; GFX8-NEXT: s_lshl_b32 s1, s1, s0 3289; GFX8-NEXT: s_lshl_b32 s0, 0xffff, s0 3290; GFX8-NEXT: s_not_b32 s0, s0 3291; GFX8-NEXT: v_mov_b32_e32 v8, 0 3292; GFX8-NEXT: v_mov_b32_e32 v9, 0 3293; GFX8-NEXT: v_mov_b32_e32 v10, 16 3294; GFX8-NEXT: v_mov_b32_e32 v11, 0 3295; GFX8-NEXT: s_waitcnt vmcnt(0) 3296; GFX8-NEXT: v_movrels_b32_e32 v12, v0 3297; GFX8-NEXT: v_and_b32_e32 v12, s0, v12 3298; GFX8-NEXT: v_or_b32_e32 v12, s1, v12 3299; GFX8-NEXT: v_movreld_b32_e32 v0, v12 3300; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] 3301; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7] 3302; GFX8-NEXT: s_endpgm 3303; 3304; GFX7-LABEL: insertelement_v_v16i16_s_s: 3305; GFX7: ; %bb.0: 3306; GFX7-NEXT: s_mov_b32 s6, 0 3307; GFX7-NEXT: s_mov_b32 s7, 0xf000 3308; GFX7-NEXT: s_mov_b64 s[4:5], 0 3309; GFX7-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64 3310; GFX7-NEXT: buffer_load_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:16 3311; GFX7-NEXT: s_and_b32 s0, s3, 1 3312; GFX7-NEXT: s_and_b32 s1, s2, 0xffff 3313; GFX7-NEXT: s_lshl_b32 s0, s0, 4 3314; GFX7-NEXT: s_lshr_b32 m0, s3, 1 3315; GFX7-NEXT: s_lshl_b32 s1, s1, s0 3316; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0 3317; GFX7-NEXT: s_not_b32 s0, s0 3318; GFX7-NEXT: s_mov_b64 s[4:5], 0 3319; GFX7-NEXT: s_mov_b32 s6, -1 3320; GFX7-NEXT: s_waitcnt vmcnt(0) 3321; GFX7-NEXT: v_movrels_b32_e32 v0, 
v2 3322; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 3323; GFX7-NEXT: v_or_b32_e32 v0, s1, v0 3324; GFX7-NEXT: v_movreld_b32_e32 v2, v0 3325; GFX7-NEXT: buffer_store_dwordx4 v[2:5], off, s[4:7], 0 3326; GFX7-NEXT: s_mov_b64 s[4:5], 16 3327; GFX7-NEXT: buffer_store_dwordx4 v[6:9], off, s[4:7], 0 3328; GFX7-NEXT: s_endpgm 3329; 3330; GFX10-LABEL: insertelement_v_v16i16_s_s: 3331; GFX10: ; %bb.0: 3332; GFX10-NEXT: s_clause 0x1 3333; GFX10-NEXT: global_load_dwordx4 v[2:5], v[0:1], off 3334; GFX10-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16 3335; GFX10-NEXT: s_and_b32 s0, s3, 1 3336; GFX10-NEXT: s_lshr_b32 m0, s3, 1 3337; GFX10-NEXT: s_lshl_b32 s0, s0, 4 3338; GFX10-NEXT: s_and_b32 s1, s2, 0xffff 3339; GFX10-NEXT: s_lshl_b32 s2, 0xffff, s0 3340; GFX10-NEXT: s_lshl_b32 s0, s1, s0 3341; GFX10-NEXT: s_not_b32 s1, s2 3342; GFX10-NEXT: v_mov_b32_e32 v10, 16 3343; GFX10-NEXT: v_mov_b32_e32 v11, 0 3344; GFX10-NEXT: s_waitcnt vmcnt(0) 3345; GFX10-NEXT: v_movrels_b32_e32 v0, v2 3346; GFX10-NEXT: v_and_or_b32 v12, v0, s1, s0 3347; GFX10-NEXT: v_mov_b32_e32 v0, 0 3348; GFX10-NEXT: v_mov_b32_e32 v1, 0 3349; GFX10-NEXT: v_movreld_b32_e32 v2, v12 3350; GFX10-NEXT: global_store_dwordx4 v[0:1], v[2:5], off 3351; GFX10-NEXT: global_store_dwordx4 v[10:11], v[6:9], off 3352; GFX10-NEXT: s_endpgm 3353; 3354; GFX11-LABEL: insertelement_v_v16i16_s_s: 3355; GFX11: ; %bb.0: 3356; GFX11-NEXT: s_clause 0x1 3357; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off 3358; GFX11-NEXT: global_load_b128 v[6:9], v[0:1], off offset:16 3359; GFX11-NEXT: s_and_b32 s0, s3, 1 3360; GFX11-NEXT: s_lshr_b32 m0, s3, 1 3361; GFX11-NEXT: s_lshl_b32 s0, s0, 4 3362; GFX11-NEXT: s_and_b32 s1, s2, 0xffff 3363; GFX11-NEXT: s_lshl_b32 s2, 0xffff, s0 3364; GFX11-NEXT: s_lshl_b32 s0, s1, s0 3365; GFX11-NEXT: s_not_b32 s1, s2 3366; GFX11-NEXT: v_mov_b32_e32 v10, 16 3367; GFX11-NEXT: v_mov_b32_e32 v11, 0 3368; GFX11-NEXT: s_waitcnt vmcnt(0) 3369; GFX11-NEXT: v_movrels_b32_e32 v0, v2 3370; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 3371; GFX11-NEXT: v_and_or_b32 v12, v0, s1, s0 3372; GFX11-NEXT: v_mov_b32_e32 v0, 0 3373; GFX11-NEXT: v_mov_b32_e32 v1, 0 3374; GFX11-NEXT: v_movreld_b32_e32 v2, v12 3375; GFX11-NEXT: s_clause 0x1 3376; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off 3377; GFX11-NEXT: global_store_b128 v[10:11], v[6:9], off 3378; GFX11-NEXT: s_endpgm 3379 %vec = load <16 x i16>, ptr addrspace(1 ) %ptr 3380 %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx 3381 store <16 x i16> %insert, ptr addrspace(1) null 3382 ret void 3383} 3384 3385define amdgpu_ps void @insertelement_s_v16i16_v_s(ptr addrspace(4) inreg %ptr, i16 %val, i32 inreg %idx) { 3386; GFX9-LABEL: insertelement_s_v16i16_v_s: 3387; GFX9: ; %bb.0: 3388; GFX9-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 3389; GFX9-NEXT: s_lshr_b32 s0, s4, 1 3390; GFX9-NEXT: s_cmp_eq_u32 s0, 1 3391; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 3392; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 0 3393; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3394; GFX9-NEXT: s_cselect_b32 s1, s9, s8 3395; GFX9-NEXT: s_cmp_eq_u32 s0, 2 3396; GFX9-NEXT: s_cselect_b32 s1, s10, s1 3397; GFX9-NEXT: s_cmp_eq_u32 s0, 3 3398; GFX9-NEXT: s_cselect_b32 s1, s11, s1 3399; GFX9-NEXT: s_cmp_eq_u32 s0, 4 3400; GFX9-NEXT: s_cselect_b32 s1, s12, s1 3401; GFX9-NEXT: s_cmp_eq_u32 s0, 5 3402; GFX9-NEXT: s_cselect_b32 s1, s13, s1 3403; GFX9-NEXT: s_cmp_eq_u32 s0, 6 3404; GFX9-NEXT: s_cselect_b32 s1, s14, s1 3405; GFX9-NEXT: s_cmp_eq_u32 s0, 7 3406; GFX9-NEXT: s_cselect_b32 s1, s15, s1 3407; GFX9-NEXT: s_and_b32 s2, s4, 1 3408; GFX9-NEXT: s_lshl_b32 s2, s2, 4 3409; GFX9-NEXT: s_lshl_b32 s3, 0xffff, s2 3410; GFX9-NEXT: s_andn2_b32 s1, s1, s3 3411; GFX9-NEXT: v_mov_b32_e32 v1, s1 3412; GFX9-NEXT: v_lshl_or_b32 v8, v0, s2, v1 3413; GFX9-NEXT: v_mov_b32_e32 v0, s8 3414; GFX9-NEXT: v_mov_b32_e32 v1, s9 3415; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc 3416; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 3417; GFX9-NEXT: v_mov_b32_e32 v2, s10 
3418; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc 3419; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 2 3420; GFX9-NEXT: v_mov_b32_e32 v3, s11 3421; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc 3422; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 3 3423; GFX9-NEXT: v_mov_b32_e32 v4, s12 3424; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc 3425; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 4 3426; GFX9-NEXT: v_mov_b32_e32 v5, s13 3427; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc 3428; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 5 3429; GFX9-NEXT: v_mov_b32_e32 v6, s14 3430; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc 3431; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 6 3432; GFX9-NEXT: v_mov_b32_e32 v7, s15 3433; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc 3434; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 7 3435; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc 3436; GFX9-NEXT: v_mov_b32_e32 v8, 0 3437; GFX9-NEXT: v_mov_b32_e32 v9, 0 3438; GFX9-NEXT: v_mov_b32_e32 v10, 16 3439; GFX9-NEXT: v_mov_b32_e32 v11, 0 3440; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off 3441; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off 3442; GFX9-NEXT: s_endpgm 3443; 3444; GFX8-LABEL: insertelement_s_v16i16_v_s: 3445; GFX8: ; %bb.0: 3446; GFX8-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 3447; GFX8-NEXT: s_and_b32 s1, s4, 1 3448; GFX8-NEXT: s_lshr_b32 m0, s4, 1 3449; GFX8-NEXT: s_lshl_b32 s1, s1, 4 3450; GFX8-NEXT: v_mov_b32_e32 v1, s1 3451; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3452; GFX8-NEXT: s_movrels_b32 s0, s8 3453; GFX8-NEXT: s_lshl_b32 s1, 0xffff, s1 3454; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 3455; GFX8-NEXT: s_andn2_b32 s0, s0, s1 3456; GFX8-NEXT: v_or_b32_e32 v8, s0, v0 3457; GFX8-NEXT: v_mov_b32_e32 v0, s8 3458; GFX8-NEXT: v_mov_b32_e32 v1, s9 3459; GFX8-NEXT: v_mov_b32_e32 v2, s10 3460; GFX8-NEXT: v_mov_b32_e32 v3, s11 3461; GFX8-NEXT: v_mov_b32_e32 v4, s12 3462; GFX8-NEXT: v_mov_b32_e32 v5, s13 3463; GFX8-NEXT: v_mov_b32_e32 v6, s14 3464; GFX8-NEXT: 
v_mov_b32_e32 v7, s15 3465; GFX8-NEXT: v_movreld_b32_e32 v0, v8 3466; GFX8-NEXT: v_mov_b32_e32 v8, 0 3467; GFX8-NEXT: v_mov_b32_e32 v9, 0 3468; GFX8-NEXT: v_mov_b32_e32 v10, 16 3469; GFX8-NEXT: v_mov_b32_e32 v11, 0 3470; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] 3471; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7] 3472; GFX8-NEXT: s_endpgm 3473; 3474; GFX7-LABEL: insertelement_s_v16i16_v_s: 3475; GFX7: ; %bb.0: 3476; GFX7-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 3477; GFX7-NEXT: s_and_b32 s1, s4, 1 3478; GFX7-NEXT: s_lshr_b32 m0, s4, 1 3479; GFX7-NEXT: s_lshl_b32 s1, s1, 4 3480; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 3481; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3482; GFX7-NEXT: s_movrels_b32 s0, s8 3483; GFX7-NEXT: v_lshlrev_b32_e32 v0, s1, v0 3484; GFX7-NEXT: s_lshl_b32 s1, 0xffff, s1 3485; GFX7-NEXT: s_andn2_b32 s0, s0, s1 3486; GFX7-NEXT: v_or_b32_e32 v8, s0, v0 3487; GFX7-NEXT: v_mov_b32_e32 v0, s8 3488; GFX7-NEXT: v_mov_b32_e32 v1, s9 3489; GFX7-NEXT: v_mov_b32_e32 v2, s10 3490; GFX7-NEXT: v_mov_b32_e32 v3, s11 3491; GFX7-NEXT: v_mov_b32_e32 v4, s12 3492; GFX7-NEXT: v_mov_b32_e32 v5, s13 3493; GFX7-NEXT: v_mov_b32_e32 v6, s14 3494; GFX7-NEXT: v_mov_b32_e32 v7, s15 3495; GFX7-NEXT: s_mov_b64 s[0:1], 0 3496; GFX7-NEXT: s_mov_b32 s2, -1 3497; GFX7-NEXT: s_mov_b32 s3, 0xf000 3498; GFX7-NEXT: v_movreld_b32_e32 v0, v8 3499; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 3500; GFX7-NEXT: s_mov_b64 s[0:1], 16 3501; GFX7-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 3502; GFX7-NEXT: s_endpgm 3503; 3504; GFX10-LABEL: insertelement_s_v16i16_v_s: 3505; GFX10: ; %bb.0: 3506; GFX10-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 3507; GFX10-NEXT: s_and_b32 s0, s4, 1 3508; GFX10-NEXT: s_lshr_b32 m0, s4, 1 3509; GFX10-NEXT: s_lshl_b32 s0, s0, 4 3510; GFX10-NEXT: v_and_b32_e32 v8, 0xffff, v0 3511; GFX10-NEXT: s_lshl_b32 s1, 0xffff, s0 3512; GFX10-NEXT: v_mov_b32_e32 v10, 16 3513; GFX10-NEXT: v_mov_b32_e32 v11, 0 3514; GFX10-NEXT: s_waitcnt lgkmcnt(0) 3515; GFX10-NEXT: 
s_movrels_b32 s2, s8 3516; GFX10-NEXT: v_mov_b32_e32 v0, s8 3517; GFX10-NEXT: s_andn2_b32 s1, s2, s1 3518; GFX10-NEXT: v_mov_b32_e32 v1, s9 3519; GFX10-NEXT: v_lshl_or_b32 v12, v8, s0, s1 3520; GFX10-NEXT: v_mov_b32_e32 v2, s10 3521; GFX10-NEXT: v_mov_b32_e32 v3, s11 3522; GFX10-NEXT: v_mov_b32_e32 v4, s12 3523; GFX10-NEXT: v_mov_b32_e32 v5, s13 3524; GFX10-NEXT: v_mov_b32_e32 v6, s14 3525; GFX10-NEXT: v_mov_b32_e32 v7, s15 3526; GFX10-NEXT: v_mov_b32_e32 v8, 0 3527; GFX10-NEXT: v_mov_b32_e32 v9, 0 3528; GFX10-NEXT: v_movreld_b32_e32 v0, v12 3529; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off 3530; GFX10-NEXT: global_store_dwordx4 v[10:11], v[4:7], off 3531; GFX10-NEXT: s_endpgm 3532; 3533; GFX11-LABEL: insertelement_s_v16i16_v_s: 3534; GFX11: ; %bb.0: 3535; GFX11-NEXT: s_load_b256 s[8:15], s[2:3], 0x0 3536; GFX11-NEXT: s_and_b32 s0, s4, 1 3537; GFX11-NEXT: s_lshr_b32 m0, s4, 1 3538; GFX11-NEXT: s_lshl_b32 s0, s0, 4 3539; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v0 3540; GFX11-NEXT: s_lshl_b32 s1, 0xffff, s0 3541; GFX11-NEXT: v_mov_b32_e32 v10, 16 3542; GFX11-NEXT: v_mov_b32_e32 v11, 0 3543; GFX11-NEXT: s_waitcnt lgkmcnt(0) 3544; GFX11-NEXT: s_movrels_b32 s2, s8 3545; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s11 3546; GFX11-NEXT: s_and_not1_b32 s1, s2, s1 3547; GFX11-NEXT: v_mov_b32_e32 v1, s9 3548; GFX11-NEXT: v_lshl_or_b32 v12, v8, s0, s1 3549; GFX11-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v5, s13 3550; GFX11-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v7, s15 3551; GFX11-NEXT: v_mov_b32_e32 v6, s14 3552; GFX11-NEXT: v_mov_b32_e32 v8, 0 3553; GFX11-NEXT: v_mov_b32_e32 v9, 0 3554; GFX11-NEXT: v_movreld_b32_e32 v0, v12 3555; GFX11-NEXT: s_clause 0x1 3556; GFX11-NEXT: global_store_b128 v[8:9], v[0:3], off 3557; GFX11-NEXT: global_store_b128 v[10:11], v[4:7], off 3558; GFX11-NEXT: s_endpgm 3559 %vec = load <16 x i16>, ptr addrspace(4) %ptr 3560 %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx 3561 store <16 x i16> %insert, ptr 
addrspace(1) null 3562 ret void 3563} 3564 3565define amdgpu_ps void @insertelement_s_v16i16_s_v(ptr addrspace(4) inreg %ptr, i16 inreg %val, i32 %idx) { 3566; GFX9-LABEL: insertelement_s_v16i16_s_v: 3567; GFX9: ; %bb.0: 3568; GFX9-NEXT: s_load_dwordx8 s[16:23], s[2:3], 0x0 3569; GFX9-NEXT: v_lshrrev_b32_e32 v8, 1, v0 3570; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 3571; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v8 3572; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v8 3573; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3574; GFX9-NEXT: v_mov_b32_e32 v1, s16 3575; GFX9-NEXT: v_mov_b32_e32 v2, s17 3576; GFX9-NEXT: v_mov_b32_e32 v3, s18 3577; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3578; GFX9-NEXT: v_mov_b32_e32 v4, s19 3579; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 3580; GFX9-NEXT: v_mov_b32_e32 v5, s20 3581; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[2:3] 3582; GFX9-NEXT: v_cmp_eq_u32_e64 s[14:15], 4, v8 3583; GFX9-NEXT: v_mov_b32_e32 v6, s21 3584; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[14:15] 3585; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v8 3586; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 3587; GFX9-NEXT: v_mov_b32_e32 v7, s22 3588; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[6:7] 3589; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8 3590; GFX9-NEXT: v_lshlrev_b32_e32 v0, 4, v0 3591; GFX9-NEXT: s_and_b32 s4, s4, 0xffff 3592; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff 3593; GFX9-NEXT: v_mov_b32_e32 v9, s23 3594; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[8:9] 3595; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8 3596; GFX9-NEXT: v_lshlrev_b32_e64 v2, v0, s4 3597; GFX9-NEXT: v_lshlrev_b32_e32 v0, v0, v3 3598; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[10:11] 3599; GFX9-NEXT: v_not_b32_e32 v0, v0 3600; GFX9-NEXT: v_and_or_b32 v9, v1, v0, v2 3601; GFX9-NEXT: v_mov_b32_e32 v0, s16 3602; GFX9-NEXT: v_mov_b32_e32 v1, s17 3603; GFX9-NEXT: v_mov_b32_e32 v2, s18 3604; GFX9-NEXT: v_mov_b32_e32 v3, s19 3605; GFX9-NEXT: v_mov_b32_e32 v4, s20 3606; GFX9-NEXT: v_mov_b32_e32 v5, s21 3607; GFX9-NEXT: v_mov_b32_e32 v6, 
s22 3608; GFX9-NEXT: v_mov_b32_e32 v7, s23 3609; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8 3610; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[12:13] 3611; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc 3612; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1] 3613; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[2:3] 3614; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[14:15] 3615; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[6:7] 3616; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] 3617; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[10:11] 3618; GFX9-NEXT: v_mov_b32_e32 v8, 0 3619; GFX9-NEXT: v_mov_b32_e32 v9, 0 3620; GFX9-NEXT: v_mov_b32_e32 v10, 16 3621; GFX9-NEXT: v_mov_b32_e32 v11, 0 3622; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off 3623; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off 3624; GFX9-NEXT: s_endpgm 3625; 3626; GFX8-LABEL: insertelement_s_v16i16_s_v: 3627; GFX8: ; %bb.0: 3628; GFX8-NEXT: s_load_dwordx8 s[16:23], s[2:3], 0x0 3629; GFX8-NEXT: v_lshrrev_b32_e32 v8, 1, v0 3630; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 3631; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v8 3632; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v8 3633; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3634; GFX8-NEXT: v_mov_b32_e32 v1, s16 3635; GFX8-NEXT: v_mov_b32_e32 v2, s17 3636; GFX8-NEXT: v_mov_b32_e32 v3, s18 3637; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3638; GFX8-NEXT: v_mov_b32_e32 v4, s19 3639; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 3640; GFX8-NEXT: v_mov_b32_e32 v5, s20 3641; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[2:3] 3642; GFX8-NEXT: v_cmp_eq_u32_e64 s[14:15], 4, v8 3643; GFX8-NEXT: v_mov_b32_e32 v6, s21 3644; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[14:15] 3645; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v8 3646; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 3647; GFX8-NEXT: v_mov_b32_e32 v7, s22 3648; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[6:7] 3649; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8 3650; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0 3651; GFX8-NEXT: s_and_b32 s4, s4, 0xffff 3652; 
GFX8-NEXT: v_mov_b32_e32 v3, 0xffff 3653; GFX8-NEXT: v_mov_b32_e32 v9, s23 3654; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[8:9] 3655; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8 3656; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s4 3657; GFX8-NEXT: v_lshlrev_b32_e32 v0, v0, v3 3658; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[10:11] 3659; GFX8-NEXT: v_not_b32_e32 v0, v0 3660; GFX8-NEXT: v_and_b32_e32 v0, v1, v0 3661; GFX8-NEXT: v_or_b32_e32 v9, v0, v2 3662; GFX8-NEXT: v_mov_b32_e32 v0, s16 3663; GFX8-NEXT: v_mov_b32_e32 v1, s17 3664; GFX8-NEXT: v_mov_b32_e32 v2, s18 3665; GFX8-NEXT: v_mov_b32_e32 v3, s19 3666; GFX8-NEXT: v_mov_b32_e32 v4, s20 3667; GFX8-NEXT: v_mov_b32_e32 v5, s21 3668; GFX8-NEXT: v_mov_b32_e32 v6, s22 3669; GFX8-NEXT: v_mov_b32_e32 v7, s23 3670; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8 3671; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[12:13] 3672; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc 3673; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1] 3674; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[2:3] 3675; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[14:15] 3676; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[6:7] 3677; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] 3678; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[10:11] 3679; GFX8-NEXT: v_mov_b32_e32 v8, 0 3680; GFX8-NEXT: v_mov_b32_e32 v9, 0 3681; GFX8-NEXT: v_mov_b32_e32 v10, 16 3682; GFX8-NEXT: v_mov_b32_e32 v11, 0 3683; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] 3684; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7] 3685; GFX8-NEXT: s_endpgm 3686; 3687; GFX7-LABEL: insertelement_s_v16i16_s_v: 3688; GFX7: ; %bb.0: 3689; GFX7-NEXT: s_load_dwordx8 s[16:23], s[2:3], 0x0 3690; GFX7-NEXT: v_lshrrev_b32_e32 v8, 1, v0 3691; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 3692; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v8 3693; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v8 3694; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3695; GFX7-NEXT: v_mov_b32_e32 v1, s16 3696; GFX7-NEXT: v_mov_b32_e32 v2, s17 3697; GFX7-NEXT: v_mov_b32_e32 v3, s18 3698; 
GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3699; GFX7-NEXT: v_mov_b32_e32 v4, s19 3700; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 3701; GFX7-NEXT: v_mov_b32_e32 v5, s20 3702; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[2:3] 3703; GFX7-NEXT: v_cmp_eq_u32_e64 s[14:15], 4, v8 3704; GFX7-NEXT: v_mov_b32_e32 v6, s21 3705; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[14:15] 3706; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v8 3707; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 3708; GFX7-NEXT: v_mov_b32_e32 v7, s22 3709; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[6:7] 3710; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8 3711; GFX7-NEXT: v_lshlrev_b32_e32 v0, 4, v0 3712; GFX7-NEXT: s_and_b32 s4, s4, 0xffff 3713; GFX7-NEXT: v_mov_b32_e32 v9, s23 3714; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[8:9] 3715; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8 3716; GFX7-NEXT: v_lshl_b32_e32 v2, s4, v0 3717; GFX7-NEXT: v_lshl_b32_e32 v0, 0xffff, v0 3718; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[10:11] 3719; GFX7-NEXT: v_not_b32_e32 v0, v0 3720; GFX7-NEXT: v_and_b32_e32 v0, v1, v0 3721; GFX7-NEXT: v_or_b32_e32 v9, v0, v2 3722; GFX7-NEXT: v_mov_b32_e32 v0, s16 3723; GFX7-NEXT: v_mov_b32_e32 v1, s17 3724; GFX7-NEXT: v_mov_b32_e32 v2, s18 3725; GFX7-NEXT: v_mov_b32_e32 v3, s19 3726; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8 3727; GFX7-NEXT: v_mov_b32_e32 v4, s20 3728; GFX7-NEXT: v_mov_b32_e32 v5, s21 3729; GFX7-NEXT: v_mov_b32_e32 v6, s22 3730; GFX7-NEXT: v_mov_b32_e32 v7, s23 3731; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[12:13] 3732; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc 3733; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1] 3734; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[2:3] 3735; GFX7-NEXT: s_mov_b64 s[0:1], 0 3736; GFX7-NEXT: s_mov_b32 s2, -1 3737; GFX7-NEXT: s_mov_b32 s3, 0xf000 3738; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[14:15] 3739; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[6:7] 3740; GFX7-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] 3741; GFX7-NEXT: v_cndmask_b32_e64 v7, v7, v9, 
s[10:11] 3742; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 3743; GFX7-NEXT: s_mov_b64 s[0:1], 16 3744; GFX7-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 3745; GFX7-NEXT: s_endpgm 3746; 3747; GFX10-LABEL: insertelement_s_v16i16_s_v: 3748; GFX10: ; %bb.0: 3749; GFX10-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 3750; GFX10-NEXT: v_lshrrev_b32_e32 v12, 1, v0 3751; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 3752; GFX10-NEXT: s_and_b32 s5, s4, 0xffff 3753; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12 3754; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v12 3755; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v12 3756; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 4, v12 3757; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 5, v12 3758; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 3759; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v12 3760; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v12 3761; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff 3762; GFX10-NEXT: v_lshlrev_b32_e64 v8, v0, s5 3763; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v12 3764; GFX10-NEXT: s_waitcnt lgkmcnt(0) 3765; GFX10-NEXT: v_mov_b32_e32 v1, s9 3766; GFX10-NEXT: v_not_b32_e32 v9, v2 3767; GFX10-NEXT: v_cndmask_b32_e32 v1, s8, v1, vcc_lo 3768; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s10, s0 3769; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s11, s1 3770; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s12, s2 3771; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s13, s3 3772; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s14, s4 3773; GFX10-NEXT: v_cndmask_b32_e64 v10, v1, s15, s5 3774; GFX10-NEXT: v_mov_b32_e32 v0, s8 3775; GFX10-NEXT: v_mov_b32_e32 v1, s9 3776; GFX10-NEXT: v_mov_b32_e32 v2, s10 3777; GFX10-NEXT: v_mov_b32_e32 v3, s11 3778; GFX10-NEXT: v_and_or_b32 v13, v10, v9, v8 3779; GFX10-NEXT: v_mov_b32_e32 v4, s12 3780; GFX10-NEXT: v_mov_b32_e32 v5, s13 3781; GFX10-NEXT: v_mov_b32_e32 v6, s14 3782; GFX10-NEXT: v_mov_b32_e32 v7, s15 3783; GFX10-NEXT: v_mov_b32_e32 v8, 0 3784; GFX10-NEXT: v_mov_b32_e32 v9, 0 3785; GFX10-NEXT: v_mov_b32_e32 v10, 16 3786; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v13, s6 3787; 
GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc_lo 3788; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v13, s0 3789; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v13, s1 3790; GFX10-NEXT: v_mov_b32_e32 v11, 0 3791; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v13, s2 3792; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v13, s3 3793; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v13, s4 3794; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v13, s5 3795; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off 3796; GFX10-NEXT: global_store_dwordx4 v[10:11], v[4:7], off 3797; GFX10-NEXT: s_endpgm 3798; 3799; GFX11-LABEL: insertelement_s_v16i16_s_v: 3800; GFX11: ; %bb.0: 3801; GFX11-NEXT: s_load_b256 s[8:15], s[2:3], 0x0 3802; GFX11-NEXT: v_lshrrev_b32_e32 v12, 1, v0 3803; GFX11-NEXT: s_and_b32 s5, s4, 0xffff 3804; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 3805; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 3806; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12 3807; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v12 3808; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 3, v12 3809; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 4, v12 3810; GFX11-NEXT: v_cmp_eq_u32_e64 s3, 5, v12 3811; GFX11-NEXT: v_cmp_eq_u32_e64 s4, 6, v12 3812; GFX11-NEXT: v_cmp_eq_u32_e64 s6, 0, v12 3813; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0 3814; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 3815; GFX11-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff 3816; GFX11-NEXT: v_lshlrev_b32_e64 v8, v0, s5 3817; GFX11-NEXT: s_waitcnt lgkmcnt(0) 3818; GFX11-NEXT: v_mov_b32_e32 v1, s9 3819; GFX11-NEXT: v_cmp_eq_u32_e64 s5, 7, v12 3820; GFX11-NEXT: v_not_b32_e32 v9, v2 3821; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 3822; GFX11-NEXT: v_cndmask_b32_e32 v1, s8, v1, vcc_lo 3823; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s10, s0 3824; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3825; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s11, s1 3826; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s12, s2 3827; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) 3828; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s13, s3 3829; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s14, s4 3830; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 3831; GFX11-NEXT: v_cndmask_b32_e64 v10, v1, s15, s5 3832; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 3833; GFX11-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11 3834; GFX11-NEXT: v_and_or_b32 v13, v10, v9, v8 3835; GFX11-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s13 3836; GFX11-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s15 3837; GFX11-NEXT: v_mov_b32_e32 v8, 0 3838; GFX11-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v10, 16 3839; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v13, s6 3840; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc_lo 3841; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v13, s0 3842; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v13, s1 3843; GFX11-NEXT: v_mov_b32_e32 v11, 0 3844; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v13, s2 3845; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v13, s3 3846; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, v13, s4 3847; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, v13, s5 3848; GFX11-NEXT: s_clause 0x1 3849; GFX11-NEXT: global_store_b128 v[8:9], v[0:3], off 3850; GFX11-NEXT: global_store_b128 v[10:11], v[4:7], off 3851; GFX11-NEXT: s_endpgm 3852 %vec = load <16 x i16>, ptr addrspace(4) %ptr 3853 %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx 3854 store <16 x i16> %insert, ptr addrspace(1) null 3855 ret void 3856} 3857 3858define amdgpu_ps void @insertelement_s_v16i16_v_v(ptr addrspace(4) inreg %ptr, i16 %val, i32 %idx) { 3859; GFX9-LABEL: insertelement_s_v16i16_v_v: 3860; GFX9: ; %bb.0: 3861; GFX9-NEXT: s_load_dwordx8 s[12:19], s[2:3], 0x0 3862; GFX9-NEXT: v_lshrrev_b32_e32 v8, 1, v1 3863; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 3864; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v8 3865; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v8 3866; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3867; GFX9-NEXT: v_mov_b32_e32 v2, s12 
3868; GFX9-NEXT: v_mov_b32_e32 v3, s13 3869; GFX9-NEXT: v_mov_b32_e32 v4, s14 3870; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc 3871; GFX9-NEXT: v_mov_b32_e32 v5, s15 3872; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] 3873; GFX9-NEXT: v_mov_b32_e32 v6, s16 3874; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[2:3] 3875; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v8 3876; GFX9-NEXT: v_mov_b32_e32 v7, s17 3877; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[4:5] 3878; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v8 3879; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 3880; GFX9-NEXT: v_mov_b32_e32 v9, s18 3881; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v7, s[6:7] 3882; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8 3883; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 3884; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff 3885; GFX9-NEXT: v_mov_b32_e32 v10, s19 3886; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[8:9] 3887; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8 3888; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 3889; GFX9-NEXT: v_lshlrev_b32_e32 v1, v1, v3 3890; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11] 3891; GFX9-NEXT: v_not_b32_e32 v1, v1 3892; GFX9-NEXT: v_and_or_b32 v9, v2, v1, v0 3893; GFX9-NEXT: v_mov_b32_e32 v0, s12 3894; GFX9-NEXT: v_mov_b32_e32 v1, s13 3895; GFX9-NEXT: v_mov_b32_e32 v2, s14 3896; GFX9-NEXT: v_mov_b32_e32 v3, s15 3897; GFX9-NEXT: v_mov_b32_e32 v4, s16 3898; GFX9-NEXT: v_mov_b32_e32 v5, s17 3899; GFX9-NEXT: v_mov_b32_e32 v6, s18 3900; GFX9-NEXT: v_mov_b32_e32 v7, s19 3901; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8 3902; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[12:13] 3903; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc 3904; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1] 3905; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[2:3] 3906; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[4:5] 3907; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[6:7] 3908; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] 3909; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, 
v9, s[10:11] 3910; GFX9-NEXT: v_mov_b32_e32 v8, 0 3911; GFX9-NEXT: v_mov_b32_e32 v9, 0 3912; GFX9-NEXT: v_mov_b32_e32 v10, 16 3913; GFX9-NEXT: v_mov_b32_e32 v11, 0 3914; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off 3915; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off 3916; GFX9-NEXT: s_endpgm 3917; 3918; GFX8-LABEL: insertelement_s_v16i16_v_v: 3919; GFX8: ; %bb.0: 3920; GFX8-NEXT: s_load_dwordx8 s[12:19], s[2:3], 0x0 3921; GFX8-NEXT: v_lshrrev_b32_e32 v8, 1, v1 3922; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 3923; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v8 3924; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v8 3925; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3926; GFX8-NEXT: v_mov_b32_e32 v2, s12 3927; GFX8-NEXT: v_mov_b32_e32 v3, s13 3928; GFX8-NEXT: v_mov_b32_e32 v4, s14 3929; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc 3930; GFX8-NEXT: v_mov_b32_e32 v5, s15 3931; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] 3932; GFX8-NEXT: v_mov_b32_e32 v6, s16 3933; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[2:3] 3934; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v8 3935; GFX8-NEXT: v_mov_b32_e32 v7, s17 3936; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[4:5] 3937; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v8 3938; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 3939; GFX8-NEXT: v_mov_b32_e32 v9, s18 3940; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v7, s[6:7] 3941; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8 3942; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 3943; GFX8-NEXT: v_mov_b32_e32 v3, 0xffff 3944; GFX8-NEXT: v_mov_b32_e32 v10, s19 3945; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[8:9] 3946; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8 3947; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 3948; GFX8-NEXT: v_lshlrev_b32_e32 v1, v1, v3 3949; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11] 3950; GFX8-NEXT: v_not_b32_e32 v1, v1 3951; GFX8-NEXT: v_and_b32_e32 v1, v2, v1 3952; GFX8-NEXT: v_or_b32_e32 v9, v1, v0 3953; GFX8-NEXT: v_mov_b32_e32 v0, s12 3954; 
GFX8-NEXT: v_mov_b32_e32 v1, s13 3955; GFX8-NEXT: v_mov_b32_e32 v2, s14 3956; GFX8-NEXT: v_mov_b32_e32 v3, s15 3957; GFX8-NEXT: v_mov_b32_e32 v4, s16 3958; GFX8-NEXT: v_mov_b32_e32 v5, s17 3959; GFX8-NEXT: v_mov_b32_e32 v6, s18 3960; GFX8-NEXT: v_mov_b32_e32 v7, s19 3961; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8 3962; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[12:13] 3963; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc 3964; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1] 3965; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[2:3] 3966; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[4:5] 3967; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[6:7] 3968; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] 3969; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[10:11] 3970; GFX8-NEXT: v_mov_b32_e32 v8, 0 3971; GFX8-NEXT: v_mov_b32_e32 v9, 0 3972; GFX8-NEXT: v_mov_b32_e32 v10, 16 3973; GFX8-NEXT: v_mov_b32_e32 v11, 0 3974; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] 3975; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7] 3976; GFX8-NEXT: s_endpgm 3977; 3978; GFX7-LABEL: insertelement_s_v16i16_v_v: 3979; GFX7: ; %bb.0: 3980; GFX7-NEXT: s_load_dwordx8 s[12:19], s[2:3], 0x0 3981; GFX7-NEXT: v_lshrrev_b32_e32 v8, 1, v1 3982; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 3983; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v8 3984; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v8 3985; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3986; GFX7-NEXT: v_mov_b32_e32 v2, s12 3987; GFX7-NEXT: v_mov_b32_e32 v3, s13 3988; GFX7-NEXT: v_mov_b32_e32 v4, s14 3989; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc 3990; GFX7-NEXT: v_mov_b32_e32 v5, s15 3991; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] 3992; GFX7-NEXT: v_mov_b32_e32 v6, s16 3993; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[2:3] 3994; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v8 3995; GFX7-NEXT: v_mov_b32_e32 v7, s17 3996; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[4:5] 3997; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v8 3998; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 3999; GFX7-NEXT: v_mov_b32_e32 
v9, s18 4000; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v7, s[6:7] 4001; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8 4002; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 4003; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 4004; GFX7-NEXT: v_mov_b32_e32 v10, s19 4005; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[8:9] 4006; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8 4007; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 4008; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 4009; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11] 4010; GFX7-NEXT: v_not_b32_e32 v1, v1 4011; GFX7-NEXT: v_and_b32_e32 v1, v2, v1 4012; GFX7-NEXT: v_or_b32_e32 v9, v1, v0 4013; GFX7-NEXT: v_mov_b32_e32 v0, s12 4014; GFX7-NEXT: v_mov_b32_e32 v1, s13 4015; GFX7-NEXT: v_mov_b32_e32 v2, s14 4016; GFX7-NEXT: v_mov_b32_e32 v3, s15 4017; GFX7-NEXT: v_mov_b32_e32 v4, s16 4018; GFX7-NEXT: v_mov_b32_e32 v5, s17 4019; GFX7-NEXT: v_mov_b32_e32 v6, s18 4020; GFX7-NEXT: v_mov_b32_e32 v7, s19 4021; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8 4022; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[12:13] 4023; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc 4024; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1] 4025; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[2:3] 4026; GFX7-NEXT: s_mov_b64 s[0:1], 0 4027; GFX7-NEXT: s_mov_b32 s2, -1 4028; GFX7-NEXT: s_mov_b32 s3, 0xf000 4029; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[4:5] 4030; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[6:7] 4031; GFX7-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] 4032; GFX7-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[10:11] 4033; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 4034; GFX7-NEXT: s_mov_b64 s[0:1], 16 4035; GFX7-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 4036; GFX7-NEXT: s_endpgm 4037; 4038; GFX10-LABEL: insertelement_s_v16i16_v_v: 4039; GFX10: ; %bb.0: 4040; GFX10-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 4041; GFX10-NEXT: v_lshrrev_b32_e32 v12, 1, v1 4042; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 4043; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12 4044; GFX10-NEXT: 
v_cmp_eq_u32_e64 s0, 2, v12 4045; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v12 4046; GFX10-NEXT: s_mov_b32 null, 0 4047; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 4, v12 4048; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 5, v12 4049; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v1 4050; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v12 4051; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v12 4052; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v12 4053; GFX10-NEXT: v_lshlrev_b32_e64 v3, v1, 0xffff 4054; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 4055; GFX10-NEXT: s_waitcnt lgkmcnt(0) 4056; GFX10-NEXT: v_mov_b32_e32 v2, s9 4057; GFX10-NEXT: v_not_b32_e32 v9, v3 4058; GFX10-NEXT: v_cndmask_b32_e32 v2, s8, v2, vcc_lo 4059; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s10, s0 4060; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s11, s1 4061; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s12, s2 4062; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s13, s3 4063; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s14, s4 4064; GFX10-NEXT: v_cndmask_b32_e64 v10, v2, s15, s5 4065; GFX10-NEXT: v_mov_b32_e32 v0, s8 4066; GFX10-NEXT: v_mov_b32_e32 v1, s9 4067; GFX10-NEXT: v_mov_b32_e32 v2, s10 4068; GFX10-NEXT: v_mov_b32_e32 v3, s11 4069; GFX10-NEXT: v_and_or_b32 v13, v10, v9, v8 4070; GFX10-NEXT: v_mov_b32_e32 v4, s12 4071; GFX10-NEXT: v_mov_b32_e32 v5, s13 4072; GFX10-NEXT: v_mov_b32_e32 v6, s14 4073; GFX10-NEXT: v_mov_b32_e32 v7, s15 4074; GFX10-NEXT: v_mov_b32_e32 v8, 0 4075; GFX10-NEXT: v_mov_b32_e32 v9, 0 4076; GFX10-NEXT: v_mov_b32_e32 v10, 16 4077; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v13, s6 4078; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc_lo 4079; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v13, s0 4080; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v13, s1 4081; GFX10-NEXT: v_mov_b32_e32 v11, 0 4082; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v13, s2 4083; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v13, s3 4084; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v13, s4 4085; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v13, s5 4086; GFX10-NEXT: 
global_store_dwordx4 v[8:9], v[0:3], off 4087; GFX10-NEXT: global_store_dwordx4 v[10:11], v[4:7], off 4088; GFX10-NEXT: s_endpgm 4089; 4090; GFX11-LABEL: insertelement_s_v16i16_v_v: 4091; GFX11: ; %bb.0: 4092; GFX11-NEXT: s_load_b256 s[8:15], s[2:3], 0x0 4093; GFX11-NEXT: v_lshrrev_b32_e32 v12, 1, v1 4094; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 4095; GFX11-NEXT: v_and_b32_e32 v1, 1, v1 4096; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) 4097; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12 4098; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v12 4099; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 3, v12 4100; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 4, v12 4101; GFX11-NEXT: v_cmp_eq_u32_e64 s3, 5, v12 4102; GFX11-NEXT: v_cmp_eq_u32_e64 s4, 6, v12 4103; GFX11-NEXT: v_cmp_eq_u32_e64 s5, 7, v12 4104; GFX11-NEXT: v_cmp_eq_u32_e64 s6, 0, v12 4105; GFX11-NEXT: v_lshlrev_b32_e32 v1, 4, v1 4106; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) 4107; GFX11-NEXT: v_lshlrev_b32_e64 v3, v1, 0xffff 4108; GFX11-NEXT: s_waitcnt lgkmcnt(0) 4109; GFX11-NEXT: v_mov_b32_e32 v2, s9 4110; GFX11-NEXT: v_lshlrev_b32_e32 v8, v1, v0 4111; GFX11-NEXT: v_not_b32_e32 v9, v3 4112; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 4113; GFX11-NEXT: v_cndmask_b32_e32 v2, s8, v2, vcc_lo 4114; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s10, s0 4115; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4116; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s11, s1 4117; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s12, s2 4118; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4119; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s13, s3 4120; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s14, s4 4121; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 4122; GFX11-NEXT: v_cndmask_b32_e64 v10, v2, s15, s5 4123; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v5, s13 4124; GFX11-NEXT: v_dual_mov_b32 v1, s9 :: v_dual_mov_b32 v2, s10 4125; GFX11-NEXT: 
; ---------------------------------------------------------------------------
; Test @insertelement_v_v16i16_s_v. Its `define` is on the next line, right
; after the closing GFX11 checks and IR body of @insertelement_s_v16i16_v_v.
;
;   %ptr : ptr addrspace(1), plain argument - the <16 x i16> vector is loaded from it
;   %val : i16, marked `inreg`              - the element to insert
;   %idx : i32, plain argument              - the dynamic element index
;
; The IR body loads the vector, performs one `insertelement`, and stores the
; result through a null addrspace(1) pointer.
;
; In all five expectation sets (GFX9, GFX8, GFX7, GFX10, GFX11) the expected code
; picks the containing dword with a v_cmp_eq / v_cndmask chain keyed on idx >> 1,
; merges the 16-bit value at bit offset (idx & 1) << 4 with and/or, and then
; writes the merged dword back with a second v_cndmask chain.
;
; NOTE(review): the v/s letters in the function name appear to encode VGPR/SGPR
; placement of ptr/val/idx (`inreg` presumably meaning SGPR) - confirm
; against the naming convention used by the rest of the file.
;
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py, do not edit by hand.
; ---------------------------------------------------------------------------
v_mov_b32_e32 v7, s15 4126; GFX11-NEXT: v_mov_b32_e32 v3, s11 4127; GFX11-NEXT: v_and_or_b32 v13, v10, v9, v8 4128; GFX11-NEXT: v_mov_b32_e32 v4, s12 4129; GFX11-NEXT: v_mov_b32_e32 v6, s14 4130; GFX11-NEXT: v_mov_b32_e32 v8, 0 4131; GFX11-NEXT: v_mov_b32_e32 v9, 0 4132; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v13 :: v_dual_mov_b32 v10, 16 4133; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v13, s6 4134; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v13, s0 4135; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v13, s1 4136; GFX11-NEXT: v_mov_b32_e32 v11, 0 4137; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v13, s2 4138; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v13, s3 4139; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, v13, s4 4140; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, v13, s5 4141; GFX11-NEXT: s_clause 0x1 4142; GFX11-NEXT: global_store_b128 v[8:9], v[0:3], off 4143; GFX11-NEXT: global_store_b128 v[10:11], v[4:7], off 4144; GFX11-NEXT: s_endpgm 4145 %vec = load <16 x i16>, ptr addrspace(4) %ptr 4146 %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx 4147 store <16 x i16> %insert, ptr addrspace(1) null 4148 ret void 4149} 4150 4151define amdgpu_ps void @insertelement_v_v16i16_s_v(ptr addrspace(1) %ptr, i16 inreg %val, i32 %idx) { 4152; GFX9-LABEL: insertelement_v_v16i16_s_v: 4153; GFX9: ; %bb.0: 4154; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off 4155; GFX9-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16 4156; GFX9-NEXT: v_lshrrev_b32_e32 v1, 1, v2 4157; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 4158; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff 4159; GFX9-NEXT: s_and_b32 s0, s2, 0xffff 4160; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v2 4161; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 4162; GFX9-NEXT: v_lshlrev_b32_e64 v15, v2, s0 4163; GFX9-NEXT: v_lshlrev_b32_e32 v0, v2, v0 4164; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v1 4165; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v1 4166; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v1 4167; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v1 4168; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v1 
4169; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v1 4170; GFX9-NEXT: v_not_b32_e32 v0, v0 4171; GFX9-NEXT: v_mov_b32_e32 v11, 0 4172; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v1 4173; GFX9-NEXT: v_mov_b32_e32 v12, 0 4174; GFX9-NEXT: v_mov_b32_e32 v13, 16 4175; GFX9-NEXT: v_mov_b32_e32 v14, 0 4176; GFX9-NEXT: s_waitcnt vmcnt(1) 4177; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc 4178; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] 4179; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] 4180; GFX9-NEXT: s_waitcnt vmcnt(0) 4181; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v7, s[4:5] 4182; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[6:7] 4183; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[8:9] 4184; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11] 4185; GFX9-NEXT: v_and_or_b32 v15, v2, v0, v15 4186; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v15, s[12:13] 4187; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v15, vcc 4188; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v15, s[0:1] 4189; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v15, s[2:3] 4190; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v15, s[4:5] 4191; GFX9-NEXT: v_cndmask_b32_e64 v5, v8, v15, s[6:7] 4192; GFX9-NEXT: v_cndmask_b32_e64 v6, v9, v15, s[8:9] 4193; GFX9-NEXT: v_cndmask_b32_e64 v7, v10, v15, s[10:11] 4194; GFX9-NEXT: global_store_dwordx4 v[11:12], v[0:3], off 4195; GFX9-NEXT: global_store_dwordx4 v[13:14], v[4:7], off 4196; GFX9-NEXT: s_endpgm 4197; 4198; GFX8-LABEL: insertelement_v_v16i16_s_v: 4199; GFX8: ; %bb.0: 4200; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1] 4201; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0 4202; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 4203; GFX8-NEXT: flat_load_dwordx4 v[7:10], v[0:1] 4204; GFX8-NEXT: v_lshrrev_b32_e32 v1, 1, v2 4205; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 4206; GFX8-NEXT: v_mov_b32_e32 v0, 0xffff 4207; GFX8-NEXT: s_and_b32 s0, s2, 0xffff 4208; GFX8-NEXT: v_lshlrev_b32_e32 v2, 4, v2 4209; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 4210; GFX8-NEXT: v_lshlrev_b32_e64 v15, v2, s0 4211; GFX8-NEXT: v_lshlrev_b32_e32 v0, v2, 
v0 4212; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v1 4213; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v1 4214; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v1 4215; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v1 4216; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v1 4217; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v1 4218; GFX8-NEXT: v_not_b32_e32 v0, v0 4219; GFX8-NEXT: v_mov_b32_e32 v11, 0 4220; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v1 4221; GFX8-NEXT: v_mov_b32_e32 v12, 0 4222; GFX8-NEXT: v_mov_b32_e32 v13, 16 4223; GFX8-NEXT: v_mov_b32_e32 v14, 0 4224; GFX8-NEXT: s_waitcnt vmcnt(1) 4225; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc 4226; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] 4227; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] 4228; GFX8-NEXT: s_waitcnt vmcnt(0) 4229; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v7, s[4:5] 4230; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[6:7] 4231; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[8:9] 4232; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11] 4233; GFX8-NEXT: v_and_b32_e32 v0, v2, v0 4234; GFX8-NEXT: v_or_b32_e32 v15, v0, v15 4235; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v15, s[12:13] 4236; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v15, vcc 4237; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v15, s[0:1] 4238; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v15, s[2:3] 4239; GFX8-NEXT: v_cndmask_b32_e64 v4, v7, v15, s[4:5] 4240; GFX8-NEXT: v_cndmask_b32_e64 v5, v8, v15, s[6:7] 4241; GFX8-NEXT: v_cndmask_b32_e64 v6, v9, v15, s[8:9] 4242; GFX8-NEXT: v_cndmask_b32_e64 v7, v10, v15, s[10:11] 4243; GFX8-NEXT: flat_store_dwordx4 v[11:12], v[0:3] 4244; GFX8-NEXT: flat_store_dwordx4 v[13:14], v[4:7] 4245; GFX8-NEXT: s_endpgm 4246; 4247; GFX7-LABEL: insertelement_v_v16i16_s_v: 4248; GFX7: ; %bb.0: 4249; GFX7-NEXT: s_mov_b32 s18, 0 4250; GFX7-NEXT: s_mov_b32 s19, 0xf000 4251; GFX7-NEXT: s_mov_b64 s[16:17], 0 4252; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[16:19], 0 addr64 4253; GFX7-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[16:19], 0 addr64 offset:16 4254; GFX7-NEXT: 
v_lshrrev_b32_e32 v0, 1, v2 4255; GFX7-NEXT: v_and_b32_e32 v1, 1, v2 4256; GFX7-NEXT: s_and_b32 s0, s2, 0xffff 4257; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 4258; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 4259; GFX7-NEXT: v_lshl_b32_e32 v2, s0, v1 4260; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 4261; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 4262; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v0 4263; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v0 4264; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v0 4265; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 4266; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0 4267; GFX7-NEXT: v_not_b32_e32 v1, v1 4268; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0 4269; GFX7-NEXT: s_mov_b64 s[16:17], 0 4270; GFX7-NEXT: s_mov_b32 s18, -1 4271; GFX7-NEXT: s_waitcnt vmcnt(1) 4272; GFX7-NEXT: v_cndmask_b32_e32 v11, v3, v4, vcc 4273; GFX7-NEXT: v_cndmask_b32_e64 v11, v11, v5, s[0:1] 4274; GFX7-NEXT: v_cndmask_b32_e64 v11, v11, v6, s[2:3] 4275; GFX7-NEXT: s_waitcnt vmcnt(0) 4276; GFX7-NEXT: v_cndmask_b32_e64 v11, v11, v7, s[4:5] 4277; GFX7-NEXT: v_cndmask_b32_e64 v11, v11, v8, s[6:7] 4278; GFX7-NEXT: v_cndmask_b32_e64 v11, v11, v9, s[8:9] 4279; GFX7-NEXT: v_cndmask_b32_e64 v11, v11, v10, s[10:11] 4280; GFX7-NEXT: v_and_b32_e32 v1, v11, v1 4281; GFX7-NEXT: v_or_b32_e32 v11, v1, v2 4282; GFX7-NEXT: v_cndmask_b32_e64 v0, v3, v11, s[12:13] 4283; GFX7-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc 4284; GFX7-NEXT: v_cndmask_b32_e64 v2, v5, v11, s[0:1] 4285; GFX7-NEXT: v_cndmask_b32_e64 v3, v6, v11, s[2:3] 4286; GFX7-NEXT: v_cndmask_b32_e64 v4, v7, v11, s[4:5] 4287; GFX7-NEXT: v_cndmask_b32_e64 v5, v8, v11, s[6:7] 4288; GFX7-NEXT: v_cndmask_b32_e64 v6, v9, v11, s[8:9] 4289; GFX7-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[10:11] 4290; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 4291; GFX7-NEXT: s_mov_b64 s[16:17], 16 4292; GFX7-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 4293; GFX7-NEXT: s_endpgm 4294; 4295; GFX10-LABEL: insertelement_v_v16i16_s_v: 4296; GFX10: ; %bb.0: 
4297; GFX10-NEXT: s_clause 0x1 4298; GFX10-NEXT: global_load_dwordx4 v[3:6], v[0:1], off 4299; GFX10-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16 4300; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v2 4301; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 4302; GFX10-NEXT: s_and_b32 s5, s2, 0xffff 4303; GFX10-NEXT: v_mov_b32_e32 v13, 16 4304; GFX10-NEXT: v_mov_b32_e32 v14, 0 4305; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 4306; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v0 4307; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v0 4308; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 4, v0 4309; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 5, v0 4310; GFX10-NEXT: v_lshlrev_b32_e32 v2, 4, v2 4311; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 6, v0 4312; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v0 4313; GFX10-NEXT: v_lshlrev_b32_e64 v11, v2, 0xffff 4314; GFX10-NEXT: v_lshlrev_b32_e64 v2, v2, s5 4315; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v0 4316; GFX10-NEXT: v_not_b32_e32 v11, v11 4317; GFX10-NEXT: s_waitcnt vmcnt(1) 4318; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo 4319; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v5, s0 4320; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s1 4321; GFX10-NEXT: s_waitcnt vmcnt(0) 4322; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v7, s3 4323; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v8, s4 4324; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v9, s2 4325; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v10, s5 4326; GFX10-NEXT: v_and_or_b32 v15, v1, v11, v2 4327; GFX10-NEXT: v_mov_b32_e32 v11, 0 4328; GFX10-NEXT: v_mov_b32_e32 v12, 0 4329; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v15, s6 4330; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v15, vcc_lo 4331; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v15, s0 4332; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v15, s1 4333; GFX10-NEXT: v_cndmask_b32_e64 v4, v7, v15, s3 4334; GFX10-NEXT: v_cndmask_b32_e64 v5, v8, v15, s4 4335; GFX10-NEXT: v_cndmask_b32_e64 v6, v9, v15, s2 4336; GFX10-NEXT: v_cndmask_b32_e64 v7, v10, v15, s5 4337; GFX10-NEXT: global_store_dwordx4 v[11:12], v[0:3], off 4338; GFX10-NEXT: 
global_store_dwordx4 v[13:14], v[4:7], off 4339; GFX10-NEXT: s_endpgm 4340; 4341; GFX11-LABEL: insertelement_v_v16i16_s_v: 4342; GFX11: ; %bb.0: 4343; GFX11-NEXT: s_clause 0x1 4344; GFX11-NEXT: global_load_b128 v[3:6], v[0:1], off 4345; GFX11-NEXT: global_load_b128 v[7:10], v[0:1], off offset:16 4346; GFX11-NEXT: v_lshrrev_b32_e32 v0, 1, v2 4347; GFX11-NEXT: s_and_b32 s5, s2, 0xffff 4348; GFX11-NEXT: v_dual_mov_b32 v13, 16 :: v_dual_and_b32 v2, 1, v2 4349; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 4350; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 4351; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v0 4352; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 3, v0 4353; GFX11-NEXT: v_cmp_eq_u32_e64 s3, 4, v0 4354; GFX11-NEXT: v_cmp_eq_u32_e64 s4, 5, v0 4355; GFX11-NEXT: v_lshlrev_b32_e32 v2, 4, v2 4356; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 6, v0 4357; GFX11-NEXT: v_cmp_eq_u32_e64 s6, 0, v0 4358; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) 4359; GFX11-NEXT: v_lshlrev_b32_e64 v11, v2, 0xffff 4360; GFX11-NEXT: v_lshlrev_b32_e64 v2, v2, s5 4361; GFX11-NEXT: v_cmp_eq_u32_e64 s5, 7, v0 4362; GFX11-NEXT: v_not_b32_e32 v11, v11 4363; GFX11-NEXT: s_waitcnt vmcnt(1) 4364; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo 4365; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4366; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v5, s0 4367; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v6, s1 4368; GFX11-NEXT: s_waitcnt vmcnt(0) 4369; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4370; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v7, s3 4371; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v8, s4 4372; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4373; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v9, s2 4374; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v10, s5 4375; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 4376; GFX11-NEXT: v_and_or_b32 v15, v1, v11, v2 4377; 
; ---------------------------------------------------------------------------
; Test @insertelement_v_v16i16_v_s. Its `define` is on the next line, right
; after the closing GFX11 checks and IR body of @insertelement_v_v16i16_s_v.
;
;   %ptr : ptr addrspace(1), plain argument - the <16 x i16> vector is loaded from it
;   %val : i16, plain argument              - the element to insert
;   %idx : i32, marked `inreg`              - the dynamic element index
;
; The IR body loads the vector, performs one `insertelement`, and stores the
; result through a null addrspace(1) pointer.
;
; Expected code differs by target. The GFX9 expectations compare the scalar
; idx >> 1 against 1..7 and 0 and use v_cndmask chains to read and write the
; containing dword. The GFX8, GFX7, GFX10 and GFX11 expectations instead put
; idx >> 1 in m0 and use v_movrels_b32 / v_movreld_b32 on the loaded registers.
; In every case the 16-bit value is merged at bit offset (idx & 1) << 4.
;
; NOTE(review): the v/s letters in the function name appear to encode VGPR/SGPR
; placement of ptr/val/idx (`inreg` presumably meaning SGPR) - confirm
; against the naming convention used by the rest of the file.
;
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py, do not edit by hand.
; ---------------------------------------------------------------------------
GFX11-NEXT: v_mov_b32_e32 v11, 0 4378; GFX11-NEXT: v_mov_b32_e32 v12, 0 4379; GFX11-NEXT: v_dual_mov_b32 v14, 0 :: v_dual_cndmask_b32 v1, v4, v15 4380; GFX11-NEXT: v_cndmask_b32_e64 v0, v3, v15, s6 4381; GFX11-NEXT: v_cndmask_b32_e64 v2, v5, v15, s0 4382; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v15, s1 4383; GFX11-NEXT: v_cndmask_b32_e64 v4, v7, v15, s3 4384; GFX11-NEXT: v_cndmask_b32_e64 v5, v8, v15, s4 4385; GFX11-NEXT: v_cndmask_b32_e64 v6, v9, v15, s2 4386; GFX11-NEXT: v_cndmask_b32_e64 v7, v10, v15, s5 4387; GFX11-NEXT: s_clause 0x1 4388; GFX11-NEXT: global_store_b128 v[11:12], v[0:3], off 4389; GFX11-NEXT: global_store_b128 v[13:14], v[4:7], off 4390; GFX11-NEXT: s_endpgm 4391 %vec = load <16 x i16>, ptr addrspace(1) %ptr 4392 %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx 4393 store <16 x i16> %insert, ptr addrspace(1) null 4394 ret void 4395} 4396 4397define amdgpu_ps void @insertelement_v_v16i16_v_s(ptr addrspace(1) %ptr, i16 %val, i32 inreg %idx) { 4398; GFX9-LABEL: insertelement_v_v16i16_v_s: 4399; GFX9: ; %bb.0: 4400; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off 4401; GFX9-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16 4402; GFX9-NEXT: s_and_b32 s0, s2, 1 4403; GFX9-NEXT: s_lshr_b32 s12, s2, 1 4404; GFX9-NEXT: s_lshl_b32 s0, s0, 4 4405; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 4406; GFX9-NEXT: s_lshl_b32 s0, 0xffff, s0 4407; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s12, 1 4408; GFX9-NEXT: s_not_b32 s13, s0 4409; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s12, 2 4410; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s12, 3 4411; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 4 4412; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], s12, 5 4413; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], s12, 6 4414; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], s12, 7 4415; GFX9-NEXT: v_mov_b32_e32 v11, 0 4416; GFX9-NEXT: v_mov_b32_e32 v12, 0 4417; GFX9-NEXT: v_mov_b32_e32 v13, 16 4418; GFX9-NEXT: v_mov_b32_e32 v14, 0 4419; GFX9-NEXT: 
s_waitcnt vmcnt(1) 4420; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc 4421; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1] 4422; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3] 4423; GFX9-NEXT: s_waitcnt vmcnt(0) 4424; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[4:5] 4425; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[6:7] 4426; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[8:9] 4427; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v10, s[10:11] 4428; GFX9-NEXT: v_and_or_b32 v15, v1, s13, v0 4429; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], s12, 0 4430; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v15, s[12:13] 4431; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v15, vcc 4432; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v15, s[0:1] 4433; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v15, s[2:3] 4434; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v15, s[4:5] 4435; GFX9-NEXT: v_cndmask_b32_e64 v5, v8, v15, s[6:7] 4436; GFX9-NEXT: v_cndmask_b32_e64 v6, v9, v15, s[8:9] 4437; GFX9-NEXT: v_cndmask_b32_e64 v7, v10, v15, s[10:11] 4438; GFX9-NEXT: global_store_dwordx4 v[11:12], v[0:3], off 4439; GFX9-NEXT: global_store_dwordx4 v[13:14], v[4:7], off 4440; GFX9-NEXT: s_endpgm 4441; 4442; GFX8-LABEL: insertelement_v_v16i16_v_s: 4443; GFX8: ; %bb.0: 4444; GFX8-NEXT: v_add_u32_e32 v7, vcc, 16, v0 4445; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc 4446; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1] 4447; GFX8-NEXT: flat_load_dwordx4 v[7:10], v[7:8] 4448; GFX8-NEXT: s_and_b32 s0, s2, 1 4449; GFX8-NEXT: s_lshl_b32 s0, s0, 4 4450; GFX8-NEXT: s_lshr_b32 m0, s2, 1 4451; GFX8-NEXT: v_mov_b32_e32 v13, s0 4452; GFX8-NEXT: s_lshl_b32 s0, 0xffff, s0 4453; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 4454; GFX8-NEXT: s_not_b32 s0, s0 4455; GFX8-NEXT: v_mov_b32_e32 v0, 0 4456; GFX8-NEXT: v_mov_b32_e32 v1, 0 4457; GFX8-NEXT: v_mov_b32_e32 v11, 16 4458; GFX8-NEXT: v_mov_b32_e32 v12, 0 4459; GFX8-NEXT: s_waitcnt vmcnt(0) 4460; GFX8-NEXT: v_movrels_b32_e32 v13, v3 4461; GFX8-NEXT: v_and_b32_e32 
v13, s0, v13 4462; GFX8-NEXT: v_or_b32_e32 v2, v13, v2 4463; GFX8-NEXT: v_movreld_b32_e32 v3, v2 4464; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[3:6] 4465; GFX8-NEXT: flat_store_dwordx4 v[11:12], v[7:10] 4466; GFX8-NEXT: s_endpgm 4467; 4468; GFX7-LABEL: insertelement_v_v16i16_v_s: 4469; GFX7: ; %bb.0: 4470; GFX7-NEXT: s_mov_b32 s6, 0 4471; GFX7-NEXT: s_mov_b32 s7, 0xf000 4472; GFX7-NEXT: s_mov_b64 s[4:5], 0 4473; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[4:7], 0 addr64 4474; GFX7-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[4:7], 0 addr64 offset:16 4475; GFX7-NEXT: s_and_b32 s0, s2, 1 4476; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v2 4477; GFX7-NEXT: s_lshl_b32 s0, s0, 4 4478; GFX7-NEXT: s_lshr_b32 m0, s2, 1 4479; GFX7-NEXT: v_lshlrev_b32_e32 v0, s0, v0 4480; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0 4481; GFX7-NEXT: s_not_b32 s0, s0 4482; GFX7-NEXT: s_mov_b64 s[4:5], 0 4483; GFX7-NEXT: s_mov_b32 s6, -1 4484; GFX7-NEXT: s_waitcnt vmcnt(0) 4485; GFX7-NEXT: v_movrels_b32_e32 v1, v3 4486; GFX7-NEXT: v_and_b32_e32 v1, s0, v1 4487; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 4488; GFX7-NEXT: v_movreld_b32_e32 v3, v0 4489; GFX7-NEXT: buffer_store_dwordx4 v[3:6], off, s[4:7], 0 4490; GFX7-NEXT: s_mov_b64 s[4:5], 16 4491; GFX7-NEXT: buffer_store_dwordx4 v[7:10], off, s[4:7], 0 4492; GFX7-NEXT: s_endpgm 4493; 4494; GFX10-LABEL: insertelement_v_v16i16_v_s: 4495; GFX10: ; %bb.0: 4496; GFX10-NEXT: s_clause 0x1 4497; GFX10-NEXT: global_load_dwordx4 v[3:6], v[0:1], off 4498; GFX10-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16 4499; GFX10-NEXT: s_and_b32 s0, s2, 1 4500; GFX10-NEXT: s_lshr_b32 m0, s2, 1 4501; GFX10-NEXT: s_lshl_b32 s0, s0, 4 4502; GFX10-NEXT: v_mov_b32_e32 v11, 16 4503; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 4504; GFX10-NEXT: s_lshl_b32 s0, 0xffff, s0 4505; GFX10-NEXT: v_mov_b32_e32 v12, 0 4506; GFX10-NEXT: s_not_b32 s0, s0 4507; GFX10-NEXT: s_waitcnt vmcnt(0) 4508; GFX10-NEXT: v_movrels_b32_e32 
v1, v3 4509; GFX10-NEXT: v_and_or_b32 v2, v1, s0, v0 4510; GFX10-NEXT: v_mov_b32_e32 v0, 0 4511; GFX10-NEXT: v_mov_b32_e32 v1, 0 4512; GFX10-NEXT: v_movreld_b32_e32 v3, v2 4513; GFX10-NEXT: global_store_dwordx4 v[0:1], v[3:6], off 4514; GFX10-NEXT: global_store_dwordx4 v[11:12], v[7:10], off 4515; GFX10-NEXT: s_endpgm 4516; 4517; GFX11-LABEL: insertelement_v_v16i16_v_s: 4518; GFX11: ; %bb.0: 4519; GFX11-NEXT: s_clause 0x1 4520; GFX11-NEXT: global_load_b128 v[3:6], v[0:1], off 4521; GFX11-NEXT: global_load_b128 v[7:10], v[0:1], off offset:16 4522; GFX11-NEXT: v_dual_mov_b32 v11, 16 :: v_dual_and_b32 v0, 0xffff, v2 4523; GFX11-NEXT: s_and_b32 s0, s2, 1 4524; GFX11-NEXT: s_lshr_b32 m0, s2, 1 4525; GFX11-NEXT: s_lshl_b32 s0, s0, 4 4526; GFX11-NEXT: v_mov_b32_e32 v12, 0 4527; GFX11-NEXT: v_lshlrev_b32_e32 v0, s0, v0 4528; GFX11-NEXT: s_lshl_b32 s0, 0xffff, s0 4529; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) 4530; GFX11-NEXT: s_not_b32 s0, s0 4531; GFX11-NEXT: s_waitcnt vmcnt(0) 4532; GFX11-NEXT: v_movrels_b32_e32 v1, v3 4533; GFX11-NEXT: v_and_or_b32 v2, v1, s0, v0 4534; GFX11-NEXT: v_mov_b32_e32 v0, 0 4535; GFX11-NEXT: v_mov_b32_e32 v1, 0 4536; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) 4537; GFX11-NEXT: v_movreld_b32_e32 v3, v2 4538; GFX11-NEXT: s_clause 0x1 4539; GFX11-NEXT: global_store_b128 v[0:1], v[3:6], off 4540; GFX11-NEXT: global_store_b128 v[11:12], v[7:10], off 4541; GFX11-NEXT: s_endpgm 4542 %vec = load <16 x i16>, ptr addrspace(1) %ptr 4543 %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx 4544 store <16 x i16> %insert, ptr addrspace(1) null 4545 ret void 4546} 4547 4548define amdgpu_ps void @insertelement_v_v16i16_v_v(ptr addrspace(1) %ptr, i16 %val, i32 %idx) { 4549; GFX9-LABEL: insertelement_v_v16i16_v_v: 4550; GFX9: ; %bb.0: 4551; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off 4552; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:16 4553; GFX9-NEXT: v_lshrrev_b32_e32 v1, 1, v3 4554; 
GFX9-NEXT: v_and_b32_e32 v3, 1, v3 4555; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff 4556; GFX9-NEXT: v_lshlrev_b32_e32 v3, 4, v3 4557; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 4558; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 4559; GFX9-NEXT: v_lshlrev_b32_e32 v0, v3, v0 4560; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v1 4561; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v1 4562; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v1 4563; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v1 4564; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v1 4565; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v1 4566; GFX9-NEXT: v_not_b32_e32 v0, v0 4567; GFX9-NEXT: v_mov_b32_e32 v12, 0 4568; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v1 4569; GFX9-NEXT: v_mov_b32_e32 v13, 0 4570; GFX9-NEXT: v_mov_b32_e32 v14, 16 4571; GFX9-NEXT: v_mov_b32_e32 v15, 0 4572; GFX9-NEXT: s_waitcnt vmcnt(1) 4573; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc 4574; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] 4575; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3] 4576; GFX9-NEXT: s_waitcnt vmcnt(0) 4577; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v8, s[4:5] 4578; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[6:7] 4579; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[8:9] 4580; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[10:11] 4581; GFX9-NEXT: v_and_or_b32 v16, v3, v0, v2 4582; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, v16, s[12:13] 4583; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v16, vcc 4584; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v16, s[0:1] 4585; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, v16, s[2:3] 4586; GFX9-NEXT: v_cndmask_b32_e64 v4, v8, v16, s[4:5] 4587; GFX9-NEXT: v_cndmask_b32_e64 v5, v9, v16, s[6:7] 4588; GFX9-NEXT: v_cndmask_b32_e64 v6, v10, v16, s[8:9] 4589; GFX9-NEXT: v_cndmask_b32_e64 v7, v11, v16, s[10:11] 4590; GFX9-NEXT: global_store_dwordx4 v[12:13], v[0:3], off 4591; GFX9-NEXT: global_store_dwordx4 v[14:15], v[4:7], off 4592; GFX9-NEXT: s_endpgm 4593; 4594; GFX8-LABEL: insertelement_v_v16i16_v_v: 
4595; GFX8: ; %bb.0: 4596; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[0:1] 4597; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0 4598; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 4599; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[0:1] 4600; GFX8-NEXT: v_lshrrev_b32_e32 v1, 1, v3 4601; GFX8-NEXT: v_and_b32_e32 v3, 1, v3 4602; GFX8-NEXT: v_mov_b32_e32 v0, 0xffff 4603; GFX8-NEXT: v_lshlrev_b32_e32 v3, 4, v3 4604; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 4605; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 4606; GFX8-NEXT: v_lshlrev_b32_e32 v0, v3, v0 4607; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v1 4608; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v1 4609; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v1 4610; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v1 4611; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v1 4612; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v1 4613; GFX8-NEXT: v_not_b32_e32 v0, v0 4614; GFX8-NEXT: v_mov_b32_e32 v12, 0 4615; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v1 4616; GFX8-NEXT: v_mov_b32_e32 v13, 0 4617; GFX8-NEXT: v_mov_b32_e32 v14, 16 4618; GFX8-NEXT: v_mov_b32_e32 v15, 0 4619; GFX8-NEXT: s_waitcnt vmcnt(1) 4620; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc 4621; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] 4622; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3] 4623; GFX8-NEXT: s_waitcnt vmcnt(0) 4624; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v8, s[4:5] 4625; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[6:7] 4626; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[8:9] 4627; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[10:11] 4628; GFX8-NEXT: v_and_b32_e32 v0, v3, v0 4629; GFX8-NEXT: v_or_b32_e32 v16, v0, v2 4630; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, v16, s[12:13] 4631; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v16, vcc 4632; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v16, s[0:1] 4633; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v16, s[2:3] 4634; GFX8-NEXT: v_cndmask_b32_e64 v4, v8, v16, s[4:5] 4635; GFX8-NEXT: v_cndmask_b32_e64 v5, v9, v16, s[6:7] 4636; 
GFX8-NEXT: v_cndmask_b32_e64 v6, v10, v16, s[8:9] 4637; GFX8-NEXT: v_cndmask_b32_e64 v7, v11, v16, s[10:11] 4638; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[0:3] 4639; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[4:7] 4640; GFX8-NEXT: s_endpgm 4641; 4642; GFX7-LABEL: insertelement_v_v16i16_v_v: 4643; GFX7: ; %bb.0: 4644; GFX7-NEXT: s_mov_b32 s18, 0 4645; GFX7-NEXT: s_mov_b32 s19, 0xf000 4646; GFX7-NEXT: s_mov_b64 s[16:17], 0 4647; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[16:19], 0 addr64 4648; GFX7-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[16:19], 0 addr64 offset:16 4649; GFX7-NEXT: v_lshrrev_b32_e32 v0, 1, v3 4650; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 4651; GFX7-NEXT: v_and_b32_e32 v1, 1, v3 4652; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 4653; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 4654; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v0 4655; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v0 4656; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 4657; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 4658; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v0 4659; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2 4660; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 4661; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0 4662; GFX7-NEXT: v_not_b32_e32 v1, v1 4663; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0 4664; GFX7-NEXT: s_mov_b64 s[16:17], 0 4665; GFX7-NEXT: s_mov_b32 s18, -1 4666; GFX7-NEXT: s_waitcnt vmcnt(1) 4667; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc 4668; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] 4669; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3] 4670; GFX7-NEXT: s_waitcnt vmcnt(0) 4671; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v8, s[4:5] 4672; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[6:7] 4673; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[8:9] 4674; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[10:11] 4675; GFX7-NEXT: v_and_b32_e32 v1, v3, v1 4676; GFX7-NEXT: v_or_b32_e32 v12, v1, v2 4677; GFX7-NEXT: v_cndmask_b32_e64 v0, v4, v12, s[12:13] 4678; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v12, vcc 4679; 
GFX7-NEXT: v_cndmask_b32_e64 v2, v6, v12, s[0:1] 4680; GFX7-NEXT: v_cndmask_b32_e64 v3, v7, v12, s[2:3] 4681; GFX7-NEXT: v_cndmask_b32_e64 v4, v8, v12, s[4:5] 4682; GFX7-NEXT: v_cndmask_b32_e64 v5, v9, v12, s[6:7] 4683; GFX7-NEXT: v_cndmask_b32_e64 v6, v10, v12, s[8:9] 4684; GFX7-NEXT: v_cndmask_b32_e64 v7, v11, v12, s[10:11] 4685; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 4686; GFX7-NEXT: s_mov_b64 s[16:17], 16 4687; GFX7-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 4688; GFX7-NEXT: s_endpgm 4689; 4690; GFX10-LABEL: insertelement_v_v16i16_v_v: 4691; GFX10: ; %bb.0: 4692; GFX10-NEXT: s_clause 0x1 4693; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off 4694; GFX10-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:16 4695; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v3 4696; GFX10-NEXT: v_and_b32_e32 v3, 1, v3 4697; GFX10-NEXT: v_mov_b32_e32 v14, 16 4698; GFX10-NEXT: v_mov_b32_e32 v15, 0 4699; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 4700; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v0 4701; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v0 4702; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 4, v0 4703; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 5, v0 4704; GFX10-NEXT: v_lshlrev_b32_e32 v3, 4, v3 4705; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v0 4706; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v0 4707; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v0 4708; GFX10-NEXT: v_lshlrev_b32_e64 v12, v3, 0xffff 4709; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 4710; GFX10-NEXT: v_not_b32_e32 v3, v12 4711; GFX10-NEXT: v_mov_b32_e32 v12, 0 4712; GFX10-NEXT: v_mov_b32_e32 v13, 0 4713; GFX10-NEXT: s_waitcnt vmcnt(1) 4714; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc_lo 4715; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s0 4716; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v7, s1 4717; GFX10-NEXT: s_waitcnt vmcnt(0) 4718; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v8, s2 4719; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v9, s3 4720; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v10, s4 
4721; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v11, s5 4722; GFX10-NEXT: v_and_or_b32 v16, v1, v3, v2 4723; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, v16, s6 4724; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v16, vcc_lo 4725; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v16, s0 4726; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v16, s1 4727; GFX10-NEXT: v_cndmask_b32_e64 v4, v8, v16, s2 4728; GFX10-NEXT: v_cndmask_b32_e64 v5, v9, v16, s3 4729; GFX10-NEXT: v_cndmask_b32_e64 v6, v10, v16, s4 4730; GFX10-NEXT: v_cndmask_b32_e64 v7, v11, v16, s5 4731; GFX10-NEXT: global_store_dwordx4 v[12:13], v[0:3], off 4732; GFX10-NEXT: global_store_dwordx4 v[14:15], v[4:7], off 4733; GFX10-NEXT: s_endpgm 4734; 4735; GFX11-LABEL: insertelement_v_v16i16_v_v: 4736; GFX11: ; %bb.0: 4737; GFX11-NEXT: s_clause 0x1 4738; GFX11-NEXT: global_load_b128 v[4:7], v[0:1], off 4739; GFX11-NEXT: global_load_b128 v[8:11], v[0:1], off offset:16 4740; GFX11-NEXT: v_lshrrev_b32_e32 v0, 1, v3 4741; GFX11-NEXT: v_dual_mov_b32 v14, 16 :: v_dual_and_b32 v3, 1, v3 4742; GFX11-NEXT: v_dual_mov_b32 v15, 0 :: v_dual_and_b32 v2, 0xffff, v2 4743; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) 4744; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 4745; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v0 4746; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 3, v0 4747; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 4, v0 4748; GFX11-NEXT: v_cmp_eq_u32_e64 s3, 5, v0 4749; GFX11-NEXT: v_lshlrev_b32_e32 v3, 4, v3 4750; GFX11-NEXT: v_cmp_eq_u32_e64 s4, 6, v0 4751; GFX11-NEXT: v_cmp_eq_u32_e64 s5, 7, v0 4752; GFX11-NEXT: v_cmp_eq_u32_e64 s6, 0, v0 4753; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) 4754; GFX11-NEXT: v_lshlrev_b32_e64 v12, v3, 0xffff 4755; GFX11-NEXT: v_lshlrev_b32_e32 v2, v3, v2 4756; GFX11-NEXT: v_not_b32_e32 v3, v12 4757; GFX11-NEXT: v_mov_b32_e32 v12, 0 4758; GFX11-NEXT: v_mov_b32_e32 v13, 0 4759; GFX11-NEXT: s_waitcnt vmcnt(1) 4760; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc_lo 4761; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) 4762; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v6, s0 4763; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v7, s1 4764; GFX11-NEXT: s_waitcnt vmcnt(0) 4765; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4766; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v8, s2 4767; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v9, s3 4768; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4769; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v10, s4 4770; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v11, s5 4771; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4772; GFX11-NEXT: v_and_or_b32 v16, v1, v3, v2 4773; GFX11-NEXT: v_cndmask_b32_e64 v0, v4, v16, s6 4774; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v16, vcc_lo 4775; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, v16, s0 4776; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v16, s1 4777; GFX11-NEXT: v_cndmask_b32_e64 v4, v8, v16, s2 4778; GFX11-NEXT: v_cndmask_b32_e64 v5, v9, v16, s3 4779; GFX11-NEXT: v_cndmask_b32_e64 v6, v10, v16, s4 4780; GFX11-NEXT: v_cndmask_b32_e64 v7, v11, v16, s5 4781; GFX11-NEXT: s_clause 0x1 4782; GFX11-NEXT: global_store_b128 v[12:13], v[0:3], off 4783; GFX11-NEXT: global_store_b128 v[14:15], v[4:7], off 4784; GFX11-NEXT: s_endpgm 4785 %vec = load <16 x i16>, ptr addrspace(1) %ptr 4786 %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx 4787 store <16 x i16> %insert, ptr addrspace(1) null 4788 ret void 4789} 4790