; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=CIVI,VI %s
; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=CIVI,CI %s
; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s

; Code generation tests for insertelement on <2 x i16> and <2 x half> vectors.
; Kernels named s_* load the vector through an addrspace(4) pointer taken
; directly from a kernel argument; kernels named v_* index their addrspace(1)
; pointers with the workitem id. Do not edit the assertion lines by hand;
; regenerate them with the script named in the NOTE line above.

; Insert the constant 999 (0x3e7) into element 0 of the loaded <2 x i16> and
; store the result to %out.
define amdgpu_kernel void @s_insertelement_v2i16_0(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 {
; GFX9-LABEL: s_insertelement_v2i16_0:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_pack_lh_b32_b16 s2, 0x3e7, s2
; GFX9-NEXT:    v_mov_b32_e32 v1, s2
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; CIVI-LABEL: s_insertelement_v2i16_0:
; CIVI:       ; %bb.0:
; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
; CIVI-NEXT:    s_load_dword s2, s[2:3], 0x0
; CIVI-NEXT:    v_mov_b32_e32 v0, s0
; CIVI-NEXT:    v_mov_b32_e32 v1, s1
; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
; CIVI-NEXT:    s_and_b32 s0, s2, 0xffff0000
; CIVI-NEXT:    s_or_b32 s0, s0, 0x3e7
; CIVI-NEXT:    v_mov_b32_e32 v2, s0
; CIVI-NEXT:    flat_store_dword v[0:1], v2
; CIVI-NEXT:    s_endpgm
;
; GFX11-LABEL: s_insertelement_v2i16_0:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_pack_lh_b32_b16 s2, 0x3e7, s2
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
  %vecins = insertelement <2 x i16> %vec, i16 999, i32 0
  store <2 x i16> %vecins, ptr addrspace(1) %out
  ret void
}


; Insert the i16 kernel argument %elt into element 0. The unnamed [8 x i32]
; argument sits between the two pointers and %elt in the argument list.
define amdgpu_kernel void @s_insertelement_v2i16_0_reg(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, [8 x i32], i16 %elt) #0 {
; GFX9-LABEL: s_insertelement_v2i16_0_reg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT:    s_load_dword s4, s[8:9], 0x30
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_pack_lh_b32_b16 s2, s4, s2
; GFX9-NEXT:    v_mov_b32_e32 v1, s2
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: s_insertelement_v2i16_0_reg:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT:    s_load_dword s4, s[8:9], 0x30
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    s_and_b32 s0, s4, 0xffff
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_and_b32 s1, s2, 0xffff0000
; VI-NEXT:    s_or_b32 s0, s0, s1
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; CI-LABEL: s_insertelement_v2i16_0_reg:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT:    s_load_dword s4, s[8:9], 0xc
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
; CI-NEXT:    v_mov_b32_e32 v0, s0
; CI-NEXT:    v_mov_b32_e32 v1, s1
; CI-NEXT:    s_and_b32 s1, s4, 0xffff
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_and_b32 s0, s2, 0xffff0000
; CI-NEXT:    s_or_b32 s0, s1, s0
; CI-NEXT:    v_mov_b32_e32 v2, s0
; CI-NEXT:    flat_store_dword v[0:1], v2
; CI-NEXT:    s_endpgm
;
; GFX11-LABEL: s_insertelement_v2i16_0_reg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x30
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_pack_lh_b32_b16 s2, s4, s2
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
  store <2 x i16> %vecins, ptr addrspace(1) %out
  ret void
}

; Insert %elt into element 0 while the vector's original element 1 has a second
; use: it is extracted, zero-extended, and passed to an inline asm statement
; through an "s"-constrained operand.
define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, [8 x i32], i16 %elt) #0 {
; GFX9-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT:    s_load_dword s4, s[8:9], 0x30
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_lshr_b32 s2, s2, 16
; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s4, s2
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    ;;#ASMSTART
; GFX9-NEXT:    ; use s2
; GFX9-NEXT:    ;;#ASMEND
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT:    s_load_dword s4, s[8:9], 0x30
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    s_and_b32 s0, s4, 0xffff
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s1, s2, 16
; VI-NEXT:    s_and_b32 s2, s2, 0xffff0000
; VI-NEXT:    s_or_b32 s0, s0, s2
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    ;;#ASMSTART
; VI-NEXT:    ; use s1
; VI-NEXT:    ;;#ASMEND
; VI-NEXT:    s_endpgm
;
; CI-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT:    s_load_dword s4, s[8:9], 0xc
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
; CI-NEXT:    v_mov_b32_e32 v0, s0
; CI-NEXT:    v_mov_b32_e32 v1, s1
; CI-NEXT:    s_and_b32 s0, s4, 0xffff
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_and_b32 s1, s2, 0xffff0000
; CI-NEXT:    s_or_b32 s0, s0, s1
; CI-NEXT:    v_mov_b32_e32 v2, s0
; CI-NEXT:    s_lshr_b32 s2, s2, 16
; CI-NEXT:    flat_store_dword v[0:1], v2
; CI-NEXT:    ;;#ASMSTART
; CI-NEXT:    ; use s2
; CI-NEXT:    ;;#ASMEND
; CI-NEXT:    s_endpgm
;
; GFX11-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x30
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_lshr_b32 s2, s2, 16
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT:    s_pack_ll_b32_b16 s3, s4, s2
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    ;;#ASMSTART
; GFX11-NEXT:    ; use s2
; GFX11-NEXT:    ;;#ASMEND
; GFX11-NEXT:    s_endpgm
  %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
  %elt1 = extractelement <2 x i16> %vec, i32 1
  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
  store <2 x i16> %vecins, ptr addrspace(1) %out
  %use1 = zext i16 %elt1 to i32
  call void asm sideeffect "; use $0", "s"(i32 %use1) #0
  ret void
}

; The element inserted at index 0 is the high 16 bits of the i32 argument
; %elt.arg (lshr by 16 followed by trunc to i16).
define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, [8 x i32], i32 %elt.arg) #0 {
; GFX9-LABEL: s_insertelement_v2i16_0_reghi:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT:    s_load_dword s4, s[8:9], 0x30
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_pack_hh_b32_b16 s2, s4, s2
; GFX9-NEXT:    v_mov_b32_e32 v1, s2
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: s_insertelement_v2i16_0_reghi:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT:    s_load_dword s4, s[8:9], 0x30
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v2, s4
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s0, s2, 16
; VI-NEXT:    v_alignbit_b32 v2, s0, v2, 16
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; CI-LABEL: s_insertelement_v2i16_0_reghi:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT:    s_load_dword s4, s[8:9], 0xc
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
; CI-NEXT:    v_mov_b32_e32 v0, s0
; CI-NEXT:    v_mov_b32_e32 v2, s4
; CI-NEXT:    v_mov_b32_e32 v1, s1
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_lshr_b32 s0, s2, 16
; CI-NEXT:    v_alignbit_b32 v2, s0, v2, 16
; CI-NEXT:    flat_store_dword v[0:1], v2
; CI-NEXT:    s_endpgm
;
; GFX11-LABEL: s_insertelement_v2i16_0_reghi:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x30
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_pack_hh_b32_b16 s2, s4, s2
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
  %elt.hi = lshr i32 %elt.arg, 16
  %elt = trunc i32 %elt.hi to i16
  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
  store <2 x i16> %vecins, ptr addrspace(1) %out
  ret void
}

; High half of %elt.arg inserted at index 0, where the truncated high half has a
; second use: it is zero-extended and passed to inline asm.
define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, i32 %elt.arg) #0 {
; GFX9-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT:    s_load_dword s4, s[8:9], 0x10
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT:    s_lshr_b32 s3, s4, 16
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_pack_lh_b32_b16 s2, s3, s2
; GFX9-NEXT:    v_mov_b32_e32 v1, s2
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    ;;#ASMSTART
; GFX9-NEXT:    ; use s3
; GFX9-NEXT:    ;;#ASMEND
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT:    s_load_dword s4, s[8:9], 0x10
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s4
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    s_lshr_b32 s0, s4, 16
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s1, s2, 16
; VI-NEXT:    v_alignbit_b32 v2, s1, v2, 16
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    ;;#ASMSTART
; VI-NEXT:    ; use s0
; VI-NEXT:    ;;#ASMEND
; VI-NEXT:    s_endpgm
;
; CI-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT:    s_load_dword s4, s[8:9], 0x4
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
; CI-NEXT:    v_mov_b32_e32 v1, s1
; CI-NEXT:    v_mov_b32_e32 v2, s4
; CI-NEXT:    v_mov_b32_e32 v0, s0
; CI-NEXT:    s_lshr_b32 s0, s4, 16
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_lshr_b32 s1, s2, 16
; CI-NEXT:    v_alignbit_b32 v2, s1, v2, 16
; CI-NEXT:    flat_store_dword v[0:1], v2
; CI-NEXT:    ;;#ASMSTART
; CI-NEXT:    ; use s0
; CI-NEXT:    ;;#ASMEND
; CI-NEXT:    s_endpgm
;
; GFX11-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x10
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
; GFX11-NEXT:    s_lshr_b32 s3, s4, 16
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_pack_lh_b32_b16 s2, s3, s2
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    ;;#ASMSTART
; GFX11-NEXT:    ; use s3
; GFX11-NEXT:    ;;#ASMEND
; GFX11-NEXT:    s_endpgm
  %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
  %elt.hi = lshr i32 %elt.arg, 16
  %elt = trunc i32 %elt.hi to i16
  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
  store <2 x i16> %vecins, ptr addrspace(1) %out
  %use1 = zext i16 %elt to i32
  call void asm sideeffect "; use $0", "s"(i32 %use1) #0
  ret void
}

; High half of %elt.arg inserted at index 0, where both the inserted element and
; the vector's original element 1 are additionally zero-extended and passed to
; separate inline asm statements.
define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, i32 %elt.arg) #0 {
; GFX9-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT:    s_load_dword s4, s[8:9], 0x10
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT:    s_lshr_b32 s3, s4, 16
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_lshr_b32 s2, s2, 16
; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s3, s2
; GFX9-NEXT:    v_mov_b32_e32 v1, s4
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    ;;#ASMSTART
; GFX9-NEXT:    ; use s3
; GFX9-NEXT:    ;;#ASMEND
; GFX9-NEXT:    ;;#ASMSTART
; GFX9-NEXT:    ; use s2
; GFX9-NEXT:    ;;#ASMEND
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT:    s_load_dword s4, s[8:9], 0x10
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s4
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    s_lshr_b32 s0, s4, 16
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s1, s2, 16
; VI-NEXT:    v_alignbit_b32 v2, s1, v2, 16
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    ;;#ASMSTART
; VI-NEXT:    ; use s0
; VI-NEXT:    ;;#ASMEND
; VI-NEXT:    ;;#ASMSTART
; VI-NEXT:    ; use s1
; VI-NEXT:    ;;#ASMEND
; VI-NEXT:    s_endpgm
;
; CI-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT:    s_load_dword s4, s[8:9], 0x4
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
; CI-NEXT:    v_mov_b32_e32 v1, s1
; CI-NEXT:    v_mov_b32_e32 v2, s4
; CI-NEXT:    v_mov_b32_e32 v0, s0
; CI-NEXT:    s_lshr_b32 s0, s4, 16
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_lshr_b32 s1, s2, 16
; CI-NEXT:    v_alignbit_b32 v2, s1, v2, 16
; CI-NEXT:    flat_store_dword v[0:1], v2
; CI-NEXT:    ;;#ASMSTART
; CI-NEXT:    ; use s0
; CI-NEXT:    ;;#ASMEND
; CI-NEXT:    ;;#ASMSTART
; CI-NEXT:    ; use s1
; CI-NEXT:    ;;#ASMEND
; CI-NEXT:    s_endpgm
;
; GFX11-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x10
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
; GFX11-NEXT:    s_lshr_b32 s3, s4, 16
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_lshr_b32 s2, s2, 16
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT:    s_pack_ll_b32_b16 s4, s3, s2
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    ;;#ASMSTART
; GFX11-NEXT:    ; use s3
; GFX11-NEXT:    ;;#ASMEND
; GFX11-NEXT:    ;;#ASMSTART
; GFX11-NEXT:    ; use s2
; GFX11-NEXT:    ;;#ASMEND
; GFX11-NEXT:    s_endpgm
  %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
  %elt.hi = lshr i32 %elt.arg, 16
  %elt = trunc i32 %elt.hi to i16
  %vec.hi = extractelement <2 x i16> %vec, i32 1
  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
  store <2 x i16> %vecins, ptr addrspace(1) %out
  %use1 = zext i16 %elt to i32
  %vec.hi.use1 = zext i16 %vec.hi to i32

  call void asm sideeffect "; use $0", "s"(i32 %use1) #0
  call void asm sideeffect "; use $0", "s"(i32 %vec.hi.use1) #0
  ret void
}

; Insert the constant 999 (0x3e7) into element 1 of the loaded <2 x i16>.
define amdgpu_kernel void @s_insertelement_v2i16_1(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 {
; GFX9-LABEL: s_insertelement_v2i16_1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, 0x3e7
; GFX9-NEXT:    v_mov_b32_e32 v1, s2
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; CIVI-LABEL: s_insertelement_v2i16_1:
; CIVI:       ; %bb.0:
; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
; CIVI-NEXT:    s_load_dword s2, s[2:3], 0x0
; CIVI-NEXT:    v_mov_b32_e32 v0, s0
; CIVI-NEXT:    v_mov_b32_e32 v1, s1
; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
; CIVI-NEXT:    s_and_b32 s0, s2, 0xffff
; CIVI-NEXT:    s_or_b32 s0, s0, 0x3e70000
; CIVI-NEXT:    v_mov_b32_e32 v2, s0
; CIVI-NEXT:    flat_store_dword v[0:1], v2
; CIVI-NEXT:    s_endpgm
;
; GFX11-LABEL: s_insertelement_v2i16_1:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_pack_ll_b32_b16 s2, s2, 0x3e7
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
  %vecins = insertelement <2 x i16> %vec, i16 999, i32 1
  store <2 x i16> %vecins, ptr addrspace(1) %out
  ret void
}

; Insert the i16 kernel argument %elt into element 1. The unnamed [8 x i32]
; argument sits between the two pointers and %elt in the argument list.
define amdgpu_kernel void @s_insertelement_v2i16_1_reg(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, [8 x i32], i16 %elt) #0 {
; GFX9-LABEL: s_insertelement_v2i16_1_reg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT:    s_load_dword s4, s[8:9], 0x30
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s4
; GFX9-NEXT:    v_mov_b32_e32 v1, s2
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: s_insertelement_v2i16_1_reg:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT:    s_load_dword s4, s[8:9], 0x30
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    s_lshl_b32 s0, s4, 16
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_and_b32 s1, s2, 0xffff
; VI-NEXT:    s_or_b32 s0, s1, s0
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; CI-LABEL: s_insertelement_v2i16_1_reg:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT:    s_load_dword s4, s[8:9], 0xc
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
; CI-NEXT:    v_mov_b32_e32 v0, s0
; CI-NEXT:    v_mov_b32_e32 v1, s1
; CI-NEXT:    s_lshl_b32 s1, s4, 16
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_and_b32 s0, s2, 0xffff
; CI-NEXT:    s_or_b32 s0, s0, s1
; CI-NEXT:    v_mov_b32_e32 v2, s0
; CI-NEXT:    flat_store_dword v[0:1], v2
; CI-NEXT:    s_endpgm
;
; GFX11-LABEL: s_insertelement_v2i16_1_reg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x30
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_pack_ll_b32_b16 s2, s2, s4
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 1
  store <2 x i16> %vecins, ptr addrspace(1) %out
  ret void
}

; Insert the half constant 5.0 (appears as 0x4500 in the assertions) into
; element 0 of a loaded <2 x half>.
define amdgpu_kernel void @s_insertelement_v2f16_0(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 {
; GFX9-LABEL: s_insertelement_v2f16_0:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_lshr_b32 s2, s2, 16
; GFX9-NEXT:    s_pack_ll_b32_b16 s2, 0x4500, s2
; GFX9-NEXT:    v_mov_b32_e32 v1, s2
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; CIVI-LABEL: s_insertelement_v2f16_0:
; CIVI:       ; %bb.0:
; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
; CIVI-NEXT:    s_load_dword s2, s[2:3], 0x0
; CIVI-NEXT:    v_mov_b32_e32 v0, s0
; CIVI-NEXT:    v_mov_b32_e32 v1, s1
; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
; CIVI-NEXT:    s_and_b32 s0, s2, 0xffff0000
; CIVI-NEXT:    s_or_b32 s0, s0, 0x4500
; CIVI-NEXT:    v_mov_b32_e32 v2, s0
; CIVI-NEXT:    flat_store_dword v[0:1], v2
; CIVI-NEXT:    s_endpgm
;
; GFX11-LABEL: s_insertelement_v2f16_0:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_lshr_b32 s2, s2, 16
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT:    s_pack_ll_b32_b16 s2, 0x4500, s2
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %vec = load <2 x half>, ptr addrspace(4) %vec.ptr
  %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 0
  store <2 x half> %vecins, ptr addrspace(1) %out
  ret void
}

; Insert the half constant 5.0 (appears as 0x4500 in the assertions) into
; element 1 of a loaded <2 x half>.
define amdgpu_kernel void @s_insertelement_v2f16_1(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 {
; GFX9-LABEL: s_insertelement_v2f16_1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, 0x4500
; GFX9-NEXT:    v_mov_b32_e32 v1, s2
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; CIVI-LABEL: s_insertelement_v2f16_1:
; CIVI:       ; %bb.0:
; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
; CIVI-NEXT:    s_load_dword s2, s[2:3], 0x0
; CIVI-NEXT:    v_mov_b32_e32 v0, s0
; CIVI-NEXT:    v_mov_b32_e32 v1, s1
; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
; CIVI-NEXT:    s_and_b32 s0, s2, 0xffff
; CIVI-NEXT:    s_or_b32 s0, s0, 0x45000000
; CIVI-NEXT:    v_mov_b32_e32 v2, s0
; CIVI-NEXT:    flat_store_dword v[0:1], v2
; CIVI-NEXT:    s_endpgm
;
; GFX11-LABEL: s_insertelement_v2f16_1:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_pack_ll_b32_b16 s2, s2, 0x4500
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %vec = load <2 x half>, ptr addrspace(4) %vec.ptr
  %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 1
  store <2 x half> %vecins, ptr addrspace(1) %out
  ret void
}

; Per-workitem variant: %in and %out are indexed by the sign-extended result of
; llvm.amdgcn.workitem.id.x, the vector is loaded from addrspace(1), and the
; constant 999 is inserted into element 0.
define amdgpu_kernel void @v_insertelement_v2i16_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX9-LABEL: v_insertelement_v2i16_0:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_mov_b32_e32 v2, 0x3e7
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX9-NEXT:    s_mov_b32 s2, 0xffff
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_bfi_b32 v1, s2, v2, v1
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_insertelement_v2i16_0:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v3, v[0:1]
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
; VI-NEXT:    v_or_b32_e32 v2, 0x3e7, v2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; CI-LABEL: v_insertelement_v2i16_0:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v1, s3
; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT:    flat_load_dword v3, v[0:1]
; CI-NEXT:    v_mov_b32_e32 v1, s1
; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
; CI-NEXT:    v_or_b32_e32 v2, 0x3e7, v2
; CI-NEXT:    flat_store_dword v[0:1], v2
; CI-NEXT:    s_endpgm
;
; GFX11-LABEL: v_insertelement_v2i16_0:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT:    s_movk_i32 s2, 0x3e7
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_bfi_b32 v1, 0xffff, s2, v1
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
  %vec = load <2 x i16>, ptr addrspace(1) %in.gep
  %vecins = insertelement <2 x i16> %vec, i16 999, i32 0
  store <2 x i16> %vecins, ptr addrspace(1) %out.gep
  ret void
}

; Per-workitem variant in which the element inserted at index 0 is the high
; 16 bits of the i32 kernel argument %elt.arg.
define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %elt.arg) #0 {
; GFX9-LABEL: v_insertelement_v2i16_0_reghi:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT:    s_load_dword s4, s[8:9], 0x10
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7060302
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_perm_b32 v1, v1, s4, v2
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_insertelement_v2i16_0_reghi:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT:    s_load_dword s4, s[8:9], 0x10
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v3, v[0:1]
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT:    v_mov_b32_e32 v2, 0x3020706
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_perm_b32 v2, s4, v3, v2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; CI-LABEL: v_insertelement_v2i16_0_reghi:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT:    s_load_dword s4, s[8:9], 0x4
; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v1, s3
; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT:    flat_load_dword v3, v[0:1]
; CI-NEXT:    v_mov_b32_e32 v1, s1
; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
; CI-NEXT:    v_alignbit_b32 v2, v2, s4, 16
; CI-NEXT:    flat_store_dword v[0:1], v2
; CI-NEXT:    s_endpgm
;
; GFX11-LABEL: v_insertelement_v2i16_0_reghi:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x10
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_perm_b32 v1, v1, s4, 0x7060302
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
  %vec = load <2 x i16>, ptr addrspace(1) %in.gep
  %elt.hi = lshr i32 %elt.arg, 16
  %elt = trunc i32 %elt.hi to i16
  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
  store <2 x i16> %vecins, ptr addrspace(1) %out.gep
  ret void
}

; Per-workitem variant inserting the constant 53 into element 0; in the
; assertions the value appears directly as an instruction operand.
define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX9-LABEL: v_insertelement_v2i16_0_inlineimm:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX9-NEXT:    s_mov_b32 s2, 0xffff
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_bfi_b32 v1, s2, 53, v1
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_insertelement_v2i16_0_inlineimm:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v3, v[0:1]
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
; VI-NEXT:    v_or_b32_e32 v2, 53, v2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; CI-LABEL: v_insertelement_v2i16_0_inlineimm:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v1, s3
; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT:    flat_load_dword v3, v[0:1]
; CI-NEXT:    v_mov_b32_e32 v1, s1
; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
; CI-NEXT:    v_or_b32_e32 v2, 53, v2
; CI-NEXT:    flat_store_dword v[0:1], v2
; CI-NEXT:    s_endpgm
;
; GFX11-LABEL: v_insertelement_v2i16_0_inlineimm:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_bfi_b32 v1, 0xffff, 53, v1
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
  %vec = load <2 x i16>, ptr addrspace(1) %in.gep
  %vecins = insertelement <2 x i16> %vec, i16 53, i32 0
  store <2 x i16> %vecins, ptr addrspace(1) %out.gep
  ret void
}

; FIXME: fold lshl_or c0, c1, v0 -> or (c0 << c1), v0
define amdgpu_kernel void @v_insertelement_v2i16_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX9-LABEL: v_insertelement_v2i16_1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_mov_b32_e32 v2, 0x5040100
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX9-NEXT:    s_movk_i32 s2, 0x3e7
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_perm_b32 v1, s2, v1, v2
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_insertelement_v2i16_1:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v3, v[0:1]
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT:    v_mov_b32_e32 v2, 0x3e70000
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; CI-LABEL: v_insertelement_v2i16_1:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v1, s3
; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT:    flat_load_dword v3, v[0:1]
; CI-NEXT:    v_mov_b32_e32 v1, s1
; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_and_b32_e32 v2, 0xffff, v3
; CI-NEXT:    v_or_b32_e32 v2, 0x3e70000, v2
; CI-NEXT:    flat_store_dword v[0:1], v2
; CI-NEXT:    s_endpgm
;
; GFX11-LABEL: v_insertelement_v2i16_1:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:
s_waitcnt lgkmcnt(0) 928; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 929; GFX11-NEXT: s_movk_i32 s2, 0x3e7 930; GFX11-NEXT: s_waitcnt vmcnt(0) 931; GFX11-NEXT: v_perm_b32 v1, s2, v1, 0x5040100 932; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 933; GFX11-NEXT: s_endpgm 934 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 935 %tid.ext = sext i32 %tid to i64 936 %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext 937 %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext 938 %vec = load <2 x i16>, ptr addrspace(1) %in.gep 939 %vecins = insertelement <2 x i16> %vec, i16 999, i32 1 940 store <2 x i16> %vecins, ptr addrspace(1) %out.gep 941 ret void 942} 943 944define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { 945; GFX9-LABEL: v_insertelement_v2i16_1_inlineimm: 946; GFX9: ; %bb.0: 947; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 948; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 949; GFX9-NEXT: v_mov_b32_e32 v2, 0x5040100 950; GFX9-NEXT: s_waitcnt lgkmcnt(0) 951; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 952; GFX9-NEXT: s_waitcnt vmcnt(0) 953; GFX9-NEXT: v_perm_b32 v1, -15, v1, v2 954; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 955; GFX9-NEXT: s_endpgm 956; 957; VI-LABEL: v_insertelement_v2i16_1_inlineimm: 958; VI: ; %bb.0: 959; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 960; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 961; VI-NEXT: s_waitcnt lgkmcnt(0) 962; VI-NEXT: v_mov_b32_e32 v1, s3 963; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 964; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 965; VI-NEXT: flat_load_dword v3, v[0:1] 966; VI-NEXT: v_mov_b32_e32 v1, s1 967; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 968; VI-NEXT: v_mov_b32_e32 v2, 0xfff10000 969; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 970; VI-NEXT: s_waitcnt vmcnt(0) 971; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 972; VI-NEXT: flat_store_dword v[0:1], v2 
973; VI-NEXT: s_endpgm 974; 975; CI-LABEL: v_insertelement_v2i16_1_inlineimm: 976; CI: ; %bb.0: 977; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 978; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 979; CI-NEXT: s_waitcnt lgkmcnt(0) 980; CI-NEXT: v_mov_b32_e32 v1, s3 981; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 982; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 983; CI-NEXT: flat_load_dword v3, v[0:1] 984; CI-NEXT: v_mov_b32_e32 v1, s1 985; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 986; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 987; CI-NEXT: s_waitcnt vmcnt(0) 988; CI-NEXT: v_and_b32_e32 v2, 0xffff, v3 989; CI-NEXT: v_or_b32_e32 v2, 0xfff10000, v2 990; CI-NEXT: flat_store_dword v[0:1], v2 991; CI-NEXT: s_endpgm 992; 993; GFX11-LABEL: v_insertelement_v2i16_1_inlineimm: 994; GFX11: ; %bb.0: 995; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 996; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 997; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 998; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 999; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1000; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 1001; GFX11-NEXT: s_waitcnt vmcnt(0) 1002; GFX11-NEXT: v_perm_b32 v1, -15, v1, 0x5040100 1003; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1004; GFX11-NEXT: s_endpgm 1005 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 1006 %tid.ext = sext i32 %tid to i64 1007 %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext 1008 %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext 1009 %vec = load <2 x i16>, ptr addrspace(1) %in.gep 1010 %vecins = insertelement <2 x i16> %vec, i16 -15, i32 1 1011 store <2 x i16> %vecins, ptr addrspace(1) %out.gep 1012 ret void 1013} 1014 1015define amdgpu_kernel void @v_insertelement_v2f16_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { 1016; GFX9-LABEL: v_insertelement_v2f16_0: 1017; GFX9: ; %bb.0: 1018; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1019; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1020; GFX9-NEXT: v_mov_b32_e32 v2, 0x4500 1021; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) 1022; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 1023; GFX9-NEXT: s_mov_b32 s2, 0xffff 1024; GFX9-NEXT: s_waitcnt vmcnt(0) 1025; GFX9-NEXT: v_bfi_b32 v1, s2, v2, v1 1026; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1027; GFX9-NEXT: s_endpgm 1028; 1029; VI-LABEL: v_insertelement_v2f16_0: 1030; VI: ; %bb.0: 1031; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1032; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1033; VI-NEXT: s_waitcnt lgkmcnt(0) 1034; VI-NEXT: v_mov_b32_e32 v1, s3 1035; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1036; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1037; VI-NEXT: flat_load_dword v3, v[0:1] 1038; VI-NEXT: v_mov_b32_e32 v1, s1 1039; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 1040; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1041; VI-NEXT: s_waitcnt vmcnt(0) 1042; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 1043; VI-NEXT: v_or_b32_e32 v2, 0x4500, v2 1044; VI-NEXT: flat_store_dword v[0:1], v2 1045; VI-NEXT: s_endpgm 1046; 1047; CI-LABEL: v_insertelement_v2f16_0: 1048; CI: ; %bb.0: 1049; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1050; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1051; CI-NEXT: s_waitcnt lgkmcnt(0) 1052; CI-NEXT: v_mov_b32_e32 v1, s3 1053; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 1054; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1055; CI-NEXT: flat_load_dword v3, v[0:1] 1056; CI-NEXT: v_mov_b32_e32 v1, s1 1057; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 1058; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1059; CI-NEXT: s_waitcnt vmcnt(0) 1060; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 1061; CI-NEXT: v_or_b32_e32 v2, 0x4500, v2 1062; CI-NEXT: flat_store_dword v[0:1], v2 1063; CI-NEXT: s_endpgm 1064; 1065; GFX11-LABEL: v_insertelement_v2f16_0: 1066; GFX11: ; %bb.0: 1067; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 1068; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1069; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1070; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1071; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1072; GFX11-NEXT: global_load_b32 v1, 
v0, s[2:3] 1073; GFX11-NEXT: s_movk_i32 s2, 0x4500 1074; GFX11-NEXT: s_waitcnt vmcnt(0) 1075; GFX11-NEXT: v_bfi_b32 v1, 0xffff, s2, v1 1076; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1077; GFX11-NEXT: s_endpgm 1078 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 1079 %tid.ext = sext i32 %tid to i64 1080 %in.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i64 %tid.ext 1081 %out.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %out, i64 %tid.ext 1082 %vec = load <2 x half>, ptr addrspace(1) %in.gep 1083 %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 0 1084 store <2 x half> %vecins, ptr addrspace(1) %out.gep 1085 ret void 1086} 1087 1088define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { 1089; GFX9-LABEL: v_insertelement_v2f16_0_inlineimm: 1090; GFX9: ; %bb.0: 1091; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1092; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1093; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1094; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 1095; GFX9-NEXT: s_mov_b32 s2, 0xffff 1096; GFX9-NEXT: s_waitcnt vmcnt(0) 1097; GFX9-NEXT: v_bfi_b32 v1, s2, 53, v1 1098; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1099; GFX9-NEXT: s_endpgm 1100; 1101; VI-LABEL: v_insertelement_v2f16_0_inlineimm: 1102; VI: ; %bb.0: 1103; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1104; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1105; VI-NEXT: s_waitcnt lgkmcnt(0) 1106; VI-NEXT: v_mov_b32_e32 v1, s3 1107; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1108; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1109; VI-NEXT: flat_load_dword v3, v[0:1] 1110; VI-NEXT: v_mov_b32_e32 v1, s1 1111; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 1112; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1113; VI-NEXT: s_waitcnt vmcnt(0) 1114; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 1115; VI-NEXT: v_or_b32_e32 v2, 53, v2 1116; VI-NEXT: flat_store_dword v[0:1], v2 1117; VI-NEXT: s_endpgm 1118; 1119; CI-LABEL: 
v_insertelement_v2f16_0_inlineimm: 1120; CI: ; %bb.0: 1121; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1122; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1123; CI-NEXT: s_waitcnt lgkmcnt(0) 1124; CI-NEXT: v_mov_b32_e32 v1, s3 1125; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 1126; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1127; CI-NEXT: flat_load_dword v3, v[0:1] 1128; CI-NEXT: v_mov_b32_e32 v1, s1 1129; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 1130; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1131; CI-NEXT: s_waitcnt vmcnt(0) 1132; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 1133; CI-NEXT: v_or_b32_e32 v2, 53, v2 1134; CI-NEXT: flat_store_dword v[0:1], v2 1135; CI-NEXT: s_endpgm 1136; 1137; GFX11-LABEL: v_insertelement_v2f16_0_inlineimm: 1138; GFX11: ; %bb.0: 1139; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 1140; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1141; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1142; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1143; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1144; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 1145; GFX11-NEXT: s_waitcnt vmcnt(0) 1146; GFX11-NEXT: v_bfi_b32 v1, 0xffff, 53, v1 1147; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1148; GFX11-NEXT: s_endpgm 1149 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 1150 %tid.ext = sext i32 %tid to i64 1151 %in.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i64 %tid.ext 1152 %out.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %out, i64 %tid.ext 1153 %vec = load <2 x half>, ptr addrspace(1) %in.gep 1154 %vecins = insertelement <2 x half> %vec, half 0xH0035, i32 0 1155 store <2 x half> %vecins, ptr addrspace(1) %out.gep 1156 ret void 1157} 1158 1159define amdgpu_kernel void @v_insertelement_v2f16_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { 1160; GFX9-LABEL: v_insertelement_v2f16_1: 1161; GFX9: ; %bb.0: 1162; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1163; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1164; GFX9-NEXT: v_mov_b32_e32 v2, 0x5040100 1165; GFX9-NEXT: s_waitcnt 
lgkmcnt(0) 1166; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 1167; GFX9-NEXT: s_movk_i32 s2, 0x4500 1168; GFX9-NEXT: s_waitcnt vmcnt(0) 1169; GFX9-NEXT: v_perm_b32 v1, s2, v1, v2 1170; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1171; GFX9-NEXT: s_endpgm 1172; 1173; VI-LABEL: v_insertelement_v2f16_1: 1174; VI: ; %bb.0: 1175; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1176; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1177; VI-NEXT: s_waitcnt lgkmcnt(0) 1178; VI-NEXT: v_mov_b32_e32 v1, s3 1179; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1180; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1181; VI-NEXT: flat_load_dword v3, v[0:1] 1182; VI-NEXT: v_mov_b32_e32 v1, s1 1183; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 1184; VI-NEXT: v_mov_b32_e32 v2, 0x45000000 1185; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1186; VI-NEXT: s_waitcnt vmcnt(0) 1187; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1188; VI-NEXT: flat_store_dword v[0:1], v2 1189; VI-NEXT: s_endpgm 1190; 1191; CI-LABEL: v_insertelement_v2f16_1: 1192; CI: ; %bb.0: 1193; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1194; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1195; CI-NEXT: s_waitcnt lgkmcnt(0) 1196; CI-NEXT: v_mov_b32_e32 v1, s3 1197; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 1198; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1199; CI-NEXT: flat_load_dword v3, v[0:1] 1200; CI-NEXT: v_mov_b32_e32 v1, s1 1201; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 1202; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1203; CI-NEXT: s_waitcnt vmcnt(0) 1204; CI-NEXT: v_and_b32_e32 v2, 0xffff, v3 1205; CI-NEXT: v_or_b32_e32 v2, 0x45000000, v2 1206; CI-NEXT: flat_store_dword v[0:1], v2 1207; CI-NEXT: s_endpgm 1208; 1209; GFX11-LABEL: v_insertelement_v2f16_1: 1210; GFX11: ; %bb.0: 1211; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 1212; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1213; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1214; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1215; GFX11-NEXT: s_waitcnt 
lgkmcnt(0) 1216; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 1217; GFX11-NEXT: s_movk_i32 s2, 0x4500 1218; GFX11-NEXT: s_waitcnt vmcnt(0) 1219; GFX11-NEXT: v_perm_b32 v1, s2, v1, 0x5040100 1220; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1221; GFX11-NEXT: s_endpgm 1222 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 1223 %tid.ext = sext i32 %tid to i64 1224 %in.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i64 %tid.ext 1225 %out.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %out, i64 %tid.ext 1226 %vec = load <2 x half>, ptr addrspace(1) %in.gep 1227 %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 1 1228 store <2 x half> %vecins, ptr addrspace(1) %out.gep 1229 ret void 1230} 1231 1232define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { 1233; GFX9-LABEL: v_insertelement_v2f16_1_inlineimm: 1234; GFX9: ; %bb.0: 1235; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1236; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1237; GFX9-NEXT: v_mov_b32_e32 v2, 0x5040100 1238; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1239; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 1240; GFX9-NEXT: s_waitcnt vmcnt(0) 1241; GFX9-NEXT: v_perm_b32 v1, 35, v1, v2 1242; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1243; GFX9-NEXT: s_endpgm 1244; 1245; VI-LABEL: v_insertelement_v2f16_1_inlineimm: 1246; VI: ; %bb.0: 1247; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1248; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1249; VI-NEXT: s_waitcnt lgkmcnt(0) 1250; VI-NEXT: v_mov_b32_e32 v1, s3 1251; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1252; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1253; VI-NEXT: flat_load_dword v3, v[0:1] 1254; VI-NEXT: v_mov_b32_e32 v1, s1 1255; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 1256; VI-NEXT: v_mov_b32_e32 v2, 0x230000 1257; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1258; VI-NEXT: s_waitcnt vmcnt(0) 1259; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD 1260; VI-NEXT: flat_store_dword v[0:1], v2 1261; VI-NEXT: s_endpgm 1262; 1263; CI-LABEL: v_insertelement_v2f16_1_inlineimm: 1264; CI: ; %bb.0: 1265; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1266; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1267; CI-NEXT: s_waitcnt lgkmcnt(0) 1268; CI-NEXT: v_mov_b32_e32 v1, s3 1269; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 1270; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1271; CI-NEXT: flat_load_dword v3, v[0:1] 1272; CI-NEXT: v_mov_b32_e32 v1, s1 1273; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 1274; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1275; CI-NEXT: s_waitcnt vmcnt(0) 1276; CI-NEXT: v_and_b32_e32 v2, 0xffff, v3 1277; CI-NEXT: v_or_b32_e32 v2, 0x230000, v2 1278; CI-NEXT: flat_store_dword v[0:1], v2 1279; CI-NEXT: s_endpgm 1280; 1281; GFX11-LABEL: v_insertelement_v2f16_1_inlineimm: 1282; GFX11: ; %bb.0: 1283; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 1284; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1285; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1286; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1287; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1288; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 1289; GFX11-NEXT: s_waitcnt vmcnt(0) 1290; GFX11-NEXT: v_perm_b32 v1, 35, v1, 0x5040100 1291; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1292; GFX11-NEXT: s_endpgm 1293 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 1294 %tid.ext = sext i32 %tid to i64 1295 %in.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i64 %tid.ext 1296 %out.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %out, i64 %tid.ext 1297 %vec = load <2 x half>, ptr addrspace(1) %in.gep 1298 %vecins = insertelement <2 x half> %vec, half 0xH0023, i32 1 1299 store <2 x half> %vecins, ptr addrspace(1) %out.gep 1300 ret void 1301} 1302 1303; FIXME: Enable for others when argument load not split 1304define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, ptr addrspace(4) %idx.ptr) #0 { 1305; GFX9-LABEL: 
s_insertelement_v2i16_dynamic: 1306; GFX9: ; %bb.0: 1307; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 1308; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1309; GFX9-NEXT: v_mov_b32_e32 v0, 0 1310; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1311; GFX9-NEXT: s_load_dword s6, s[4:5], 0x0 1312; GFX9-NEXT: s_load_dword s7, s[2:3], 0x0 1313; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1314; GFX9-NEXT: s_lshl_b32 s2, s6, 4 1315; GFX9-NEXT: s_lshl_b32 s2, 0xffff, s2 1316; GFX9-NEXT: s_andn2_b32 s3, s7, s2 1317; GFX9-NEXT: s_and_b32 s2, s2, 0x3e703e7 1318; GFX9-NEXT: s_or_b32 s2, s2, s3 1319; GFX9-NEXT: v_mov_b32_e32 v1, s2 1320; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1321; GFX9-NEXT: s_endpgm 1322; 1323; VI-LABEL: s_insertelement_v2i16_dynamic: 1324; VI: ; %bb.0: 1325; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 1326; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1327; VI-NEXT: s_waitcnt lgkmcnt(0) 1328; VI-NEXT: s_load_dword s4, s[4:5], 0x0 1329; VI-NEXT: s_load_dword s2, s[2:3], 0x0 1330; VI-NEXT: v_mov_b32_e32 v0, s0 1331; VI-NEXT: v_mov_b32_e32 v1, s1 1332; VI-NEXT: s_waitcnt lgkmcnt(0) 1333; VI-NEXT: s_lshl_b32 s0, s4, 4 1334; VI-NEXT: s_lshl_b32 s0, 0xffff, s0 1335; VI-NEXT: s_andn2_b32 s1, s2, s0 1336; VI-NEXT: s_and_b32 s0, s0, 0x3e703e7 1337; VI-NEXT: s_or_b32 s0, s0, s1 1338; VI-NEXT: v_mov_b32_e32 v2, s0 1339; VI-NEXT: flat_store_dword v[0:1], v2 1340; VI-NEXT: s_endpgm 1341; 1342; CI-LABEL: s_insertelement_v2i16_dynamic: 1343; CI: ; %bb.0: 1344; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 1345; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1346; CI-NEXT: s_waitcnt lgkmcnt(0) 1347; CI-NEXT: s_load_dword s4, s[4:5], 0x0 1348; CI-NEXT: s_load_dword s2, s[2:3], 0x0 1349; CI-NEXT: v_mov_b32_e32 v0, s0 1350; CI-NEXT: v_mov_b32_e32 v1, s1 1351; CI-NEXT: s_waitcnt lgkmcnt(0) 1352; CI-NEXT: s_lshl_b32 s0, s4, 4 1353; CI-NEXT: s_lshl_b32 s0, 0xffff, s0 1354; CI-NEXT: s_andn2_b32 s1, s2, s0 1355; CI-NEXT: s_and_b32 s0, s0, 0x3e703e7 1356; CI-NEXT: s_or_b32 s0, s0, s1 1357; CI-NEXT: 
v_mov_b32_e32 v2, s0 1358; CI-NEXT: flat_store_dword v[0:1], v2 1359; CI-NEXT: s_endpgm 1360; 1361; GFX11-LABEL: s_insertelement_v2i16_dynamic: 1362; GFX11: ; %bb.0: 1363; GFX11-NEXT: s_clause 0x1 1364; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 1365; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 1366; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1367; GFX11-NEXT: s_load_b32 s4, s[6:7], 0x0 1368; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 1369; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1370; GFX11-NEXT: s_lshl_b32 s3, s4, 4 1371; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 1372; GFX11-NEXT: s_lshl_b32 s3, 0xffff, s3 1373; GFX11-NEXT: s_and_not1_b32 s2, s2, s3 1374; GFX11-NEXT: s_and_b32 s3, s3, 0x3e703e7 1375; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 1376; GFX11-NEXT: s_or_b32 s2, s3, s2 1377; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 1378; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1379; GFX11-NEXT: s_endpgm 1380 %idx = load volatile i32, ptr addrspace(4) %idx.ptr 1381 %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr 1382 %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx 1383 store <2 x i16> %vecins, ptr addrspace(1) %out 1384 ret void 1385} 1386 1387define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %idx) #0 { 1388; GFX9-LABEL: v_insertelement_v2i16_dynamic_sgpr: 1389; GFX9: ; %bb.0: 1390; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1391; GFX9-NEXT: s_load_dword s4, s[8:9], 0x10 1392; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1393; GFX9-NEXT: v_mov_b32_e32 v2, 0x3e703e7 1394; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1395; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 1396; GFX9-NEXT: s_lshl_b32 s2, s4, 4 1397; GFX9-NEXT: s_lshl_b32 s2, 0xffff, s2 1398; GFX9-NEXT: s_waitcnt vmcnt(0) 1399; GFX9-NEXT: v_bfi_b32 v1, s2, v2, v1 1400; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1401; GFX9-NEXT: s_endpgm 1402; 1403; VI-LABEL: 
v_insertelement_v2i16_dynamic_sgpr: 1404; VI: ; %bb.0: 1405; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1406; VI-NEXT: s_load_dword s4, s[8:9], 0x10 1407; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1408; VI-NEXT: s_waitcnt lgkmcnt(0) 1409; VI-NEXT: v_mov_b32_e32 v1, s3 1410; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1411; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1412; VI-NEXT: flat_load_dword v3, v[0:1] 1413; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 1414; VI-NEXT: s_lshl_b32 s0, s4, 4 1415; VI-NEXT: v_mov_b32_e32 v1, s1 1416; VI-NEXT: s_lshl_b32 s0, 0xffff, s0 1417; VI-NEXT: v_mov_b32_e32 v2, 0x3e703e7 1418; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1419; VI-NEXT: s_waitcnt vmcnt(0) 1420; VI-NEXT: v_bfi_b32 v2, s0, v2, v3 1421; VI-NEXT: flat_store_dword v[0:1], v2 1422; VI-NEXT: s_endpgm 1423; 1424; CI-LABEL: v_insertelement_v2i16_dynamic_sgpr: 1425; CI: ; %bb.0: 1426; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1427; CI-NEXT: s_load_dword s4, s[8:9], 0x4 1428; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1429; CI-NEXT: s_waitcnt lgkmcnt(0) 1430; CI-NEXT: v_mov_b32_e32 v1, s3 1431; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 1432; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1433; CI-NEXT: flat_load_dword v3, v[0:1] 1434; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 1435; CI-NEXT: s_lshl_b32 s0, s4, 4 1436; CI-NEXT: v_mov_b32_e32 v1, s1 1437; CI-NEXT: s_lshl_b32 s0, 0xffff, s0 1438; CI-NEXT: v_mov_b32_e32 v2, 0x3e703e7 1439; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1440; CI-NEXT: s_waitcnt vmcnt(0) 1441; CI-NEXT: v_bfi_b32 v2, s0, v2, v3 1442; CI-NEXT: flat_store_dword v[0:1], v2 1443; CI-NEXT: s_endpgm 1444; 1445; GFX11-LABEL: v_insertelement_v2i16_dynamic_sgpr: 1446; GFX11: ; %bb.0: 1447; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 1448; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1449; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x10 1450; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) 1451; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1452; GFX11-NEXT: 
s_waitcnt lgkmcnt(0) 1453; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 1454; GFX11-NEXT: s_lshl_b32 s2, s4, 4 1455; GFX11-NEXT: s_lshl_b32 s2, 0xffff, s2 1456; GFX11-NEXT: s_waitcnt vmcnt(0) 1457; GFX11-NEXT: v_bfi_b32 v1, s2, 0x3e703e7, v1 1458; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1459; GFX11-NEXT: s_endpgm 1460 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 1461 %tid.ext = sext i32 %tid to i64 1462 %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext 1463 %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext 1464 %vec = load <2 x i16>, ptr addrspace(1) %in.gep 1465 %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx 1466 store <2 x i16> %vecins, ptr addrspace(1) %out.gep 1467 ret void 1468} 1469 1470define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(1) %idx.ptr) #0 { 1471; GFX9-LABEL: v_insertelement_v2f16_dynamic_vgpr: 1472; GFX9: ; %bb.0: 1473; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 1474; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1475; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1476; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1477; GFX9-NEXT: global_load_dword v1, v0, s[4:5] 1478; GFX9-NEXT: global_load_dword v2, v0, s[2:3] 1479; GFX9-NEXT: s_mov_b32 s2, 0xffff 1480; GFX9-NEXT: s_waitcnt vmcnt(1) 1481; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 1482; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s2 1483; GFX9-NEXT: s_mov_b32 s2, 0x12341234 1484; GFX9-NEXT: s_waitcnt vmcnt(0) 1485; GFX9-NEXT: v_bfi_b32 v1, v1, s2, v2 1486; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1487; GFX9-NEXT: s_endpgm 1488; 1489; VI-LABEL: v_insertelement_v2f16_dynamic_vgpr: 1490; VI: ; %bb.0: 1491; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1492; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 1493; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1494; VI-NEXT: s_waitcnt lgkmcnt(0) 1495; VI-NEXT: v_mov_b32_e32 v3, s3 1496; VI-NEXT: v_mov_b32_e32 v1, s5 1497; VI-NEXT: v_add_u32_e32 
v0, vcc, s4, v2 1498; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1499; VI-NEXT: flat_load_dword v4, v[0:1] 1500; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1501; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc 1502; VI-NEXT: flat_load_dword v3, v[0:1] 1503; VI-NEXT: s_mov_b32 s2, 0xffff 1504; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 1505; VI-NEXT: v_mov_b32_e32 v1, s1 1506; VI-NEXT: s_mov_b32 s0, 0x12341234 1507; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1508; VI-NEXT: s_waitcnt vmcnt(1) 1509; VI-NEXT: v_lshlrev_b32_e32 v2, 4, v4 1510; VI-NEXT: v_lshlrev_b32_e64 v2, v2, s2 1511; VI-NEXT: s_waitcnt vmcnt(0) 1512; VI-NEXT: v_bfi_b32 v2, v2, s0, v3 1513; VI-NEXT: flat_store_dword v[0:1], v2 1514; VI-NEXT: s_endpgm 1515; 1516; CI-LABEL: v_insertelement_v2f16_dynamic_vgpr: 1517; CI: ; %bb.0: 1518; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1519; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 1520; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1521; CI-NEXT: s_waitcnt lgkmcnt(0) 1522; CI-NEXT: v_mov_b32_e32 v3, s3 1523; CI-NEXT: v_mov_b32_e32 v1, s5 1524; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v2 1525; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1526; CI-NEXT: flat_load_dword v4, v[0:1] 1527; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 1528; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc 1529; CI-NEXT: flat_load_dword v3, v[0:1] 1530; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 1531; CI-NEXT: v_mov_b32_e32 v1, s1 1532; CI-NEXT: s_mov_b32 s0, 0x12341234 1533; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1534; CI-NEXT: s_waitcnt vmcnt(1) 1535; CI-NEXT: v_lshlrev_b32_e32 v2, 4, v4 1536; CI-NEXT: v_lshl_b32_e32 v2, 0xffff, v2 1537; CI-NEXT: s_waitcnt vmcnt(0) 1538; CI-NEXT: v_bfi_b32 v2, v2, s0, v3 1539; CI-NEXT: flat_store_dword v[0:1], v2 1540; CI-NEXT: s_endpgm 1541; 1542; GFX11-LABEL: v_insertelement_v2f16_dynamic_vgpr: 1543; GFX11: ; %bb.0: 1544; GFX11-NEXT: s_clause 0x1 1545; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 1546; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 1547; GFX11-NEXT: v_and_b32_e32 v0, 
0x3ff, v0 1548; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1549; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1550; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1551; GFX11-NEXT: s_clause 0x1 1552; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] 1553; GFX11-NEXT: global_load_b32 v2, v0, s[2:3] 1554; GFX11-NEXT: s_waitcnt vmcnt(1) 1555; GFX11-NEXT: v_lshlrev_b32_e32 v1, 4, v1 1556; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) 1557; GFX11-NEXT: v_lshlrev_b32_e64 v1, v1, 0xffff 1558; GFX11-NEXT: s_waitcnt vmcnt(0) 1559; GFX11-NEXT: v_bfi_b32 v1, v1, 0x12341234, v2 1560; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1561; GFX11-NEXT: s_endpgm 1562 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 1563 %tid.ext = sext i32 %tid to i64 1564 %in.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i64 %tid.ext 1565 %idx.gep = getelementptr inbounds i32, ptr addrspace(1) %idx.ptr, i64 %tid.ext 1566 %out.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %out, i64 %tid.ext 1567 %idx = load i32, ptr addrspace(1) %idx.gep 1568 %vec = load <2 x half>, ptr addrspace(1) %in.gep 1569 %vecins = insertelement <2 x half> %vec, half 0xH1234, i32 %idx 1570 store <2 x half> %vecins, ptr addrspace(1) %out.gep 1571 ret void 1572} 1573 1574define amdgpu_kernel void @v_insertelement_v4f16_0(ptr addrspace(1) %out, ptr addrspace(1) %in, [8 x i32], i32 %val) #0 { 1575; GFX9-LABEL: v_insertelement_v4f16_0: 1576; GFX9: ; %bb.0: 1577; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1578; GFX9-NEXT: s_load_dword s4, s[8:9], 0x30 1579; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1580; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1581; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] 1582; GFX9-NEXT: s_mov_b32 s2, 0xffff 1583; GFX9-NEXT: v_mov_b32_e32 v3, s4 1584; GFX9-NEXT: s_waitcnt vmcnt(0) 1585; GFX9-NEXT: v_bfi_b32 v0, s2, v3, v0 1586; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 1587; GFX9-NEXT: s_endpgm 1588; 1589; VI-LABEL: v_insertelement_v4f16_0: 1590; VI: ; %bb.0: 1591; 
VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x30
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: v_mov_b32_e32 v4, 0x3020504
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_perm_b32 v0, s4, v0, v4
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_insertelement_v4f16_0:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0xc
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
; CI-NEXT: s_mov_b32 s0, 0xffff
; CI-NEXT: v_mov_b32_e32 v4, s4
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_bfi_b32 v0, s0, v4, v0
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
;
; GFX11-LABEL: v_insertelement_v4f16_0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x30
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_bfi_b32 v0, 0xffff, s4, v0
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %out, i64 %tid.ext
  %vec = load <4 x half>, ptr addrspace(1) %in.gep
  %val.trunc = trunc i32 %val to i16
  %val.cvt = bitcast i16 %val.trunc to half
  %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 0
  store <4 x half> %vecins, ptr addrspace(1) %out.gep
  ret void
}

; Each work-item loads one <4 x half> from %in (indexed by workitem.id.x),
; overwrites element 1 with the low 16 bits of kernel argument %val
; (reinterpreted as half), and stores the vector to the same index of %out.
; The check lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_kernel void @v_insertelement_v4f16_1(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) #0 {
; GFX9-LABEL: v_insertelement_v4f16_1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT: s_load_dword s4, s[8:9], 0x10
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT: v_mov_b32_e32 v3, 0x5040100
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_perm_b32 v0, s4, v0, v3
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_insertelement_v4f16_1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: v_mov_b32_e32 v4, 0x1000504
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_perm_b32 v0, v0, s4, v4
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_insertelement_v4f16_1:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
; CI-NEXT: s_lshl_b32 s0, s4, 16
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; CI-NEXT: v_or_b32_e32 v0, s0, v0
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
;
; GFX11-LABEL: v_insertelement_v4f16_1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x10
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_perm_b32 v0, s4, v0, 0x5040100
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %out, i64 %tid.ext
  %vec = load <4 x half>, ptr addrspace(1) %in.gep
  %val.trunc = trunc i32 %val to i16
  %val.cvt = bitcast i16 %val.trunc to half
  %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 1
  store <4 x half> %vecins, ptr addrspace(1) %out.gep
  ret void
}

; Same per-work-item load/insert/store pattern on <4 x half>, inserting at
; constant index 2. An unnamed [8 x i32] argument sits between %in and %val.
define amdgpu_kernel void @v_insertelement_v4f16_2(ptr addrspace(1) %out, ptr addrspace(1) %in, [8 x i32], i32 %val) #0 {
; GFX9-LABEL: v_insertelement_v4f16_2:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT: s_load_dword s4, s[8:9], 0x30
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX9-NEXT: s_mov_b32 s2, 0xffff
; GFX9-NEXT: v_mov_b32_e32 v3, s4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_bfi_b32 v1, s2, v3, v1
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_insertelement_v4f16_2:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x30
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: v_mov_b32_e32 v4, 0x3020504
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_perm_b32 v1, s4, v1, v4
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_insertelement_v4f16_2:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0xc
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
; CI-NEXT: s_mov_b32 s0, 0xffff
; CI-NEXT: v_mov_b32_e32 v4, s4
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_bfi_b32 v1, s0, v4, v1
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
;
; GFX11-LABEL: v_insertelement_v4f16_2:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x30
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_bfi_b32 v1, 0xffff, s4, v1
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %out, i64 %tid.ext
  %vec = load <4 x half>, ptr addrspace(1) %in.gep
  %val.trunc = trunc i32 %val to i16
  %val.cvt = bitcast i16 %val.trunc to half
  %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 2
  store <4 x half> %vecins, ptr addrspace(1) %out.gep
  ret void
}

; Same per-work-item load/insert/store pattern on <4 x half>, inserting the
; truncated %val at constant index 3 (the last element of the vector).
define amdgpu_kernel void @v_insertelement_v4f16_3(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) #0 {
; GFX9-LABEL: v_insertelement_v4f16_3:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT: s_load_dword s4, s[8:9], 0x10
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT: v_mov_b32_e32 v3, 0x5040100
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_perm_b32 v1, s4, v1, v3
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_insertelement_v4f16_3:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: v_mov_b32_e32 v4, 0x1000504
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_perm_b32 v1, v1, s4, v4
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_insertelement_v4f16_3:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
; CI-NEXT: s_lshl_b32 s0, s4, 16
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; CI-NEXT: v_or_b32_e32 v1, s0, v1
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
;
; GFX11-LABEL: v_insertelement_v4f16_3:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x10
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_perm_b32 v1, s4, v1, 0x5040100
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %out, i64 %tid.ext
  %vec = load <4 x half>, ptr addrspace(1) %in.gep
  %val.trunc = trunc i32 %val to i16
  %val.cvt = bitcast i16 %val.trunc to half
  %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 3
  store <4 x half> %vecins, ptr addrspace(1) %out.gep
  ret void
}

; Integer counterpart of the <4 x half> index-2 test: the vector type is
; <4 x i16> and the i16 -> i16 bitcast of the truncated %val is a no-op.
define amdgpu_kernel void @v_insertelement_v4i16_2(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) #0 {
; GFX9-LABEL: v_insertelement_v4i16_2:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT: s_load_dword s4, s[8:9], 0x10
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX9-NEXT: s_mov_b32 s2, 0xffff
; GFX9-NEXT: v_mov_b32_e32 v3, s4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_bfi_b32 v1, s2, v3, v1
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_insertelement_v4i16_2:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: v_mov_b32_e32 v4, 0x3020504
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_perm_b32 v1, s4, v1, v4
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_insertelement_v4i16_2:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
; CI-NEXT: s_mov_b32 s0, 0xffff
; CI-NEXT: v_mov_b32_e32 v4, s4
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_bfi_b32 v1, s0, v4, v1
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
;
; GFX11-LABEL: v_insertelement_v4i16_2:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x10
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_bfi_b32 v1, 0xffff, s4, v1
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %out, i64 %tid.ext
  %vec = load <4 x i16>, ptr addrspace(1) %in.gep
  %val.trunc = trunc i32 %val to i16
  %val.cvt = bitcast i16 %val.trunc to i16
  %vecins = insertelement <4 x i16> %vec, i16 %val.cvt, i32 2
  store <4 x i16> %vecins, ptr addrspace(1) %out.gep
  ret void
}

; FIXME: Better code on CI?
1968define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) #0 { 1969; GFX9-LABEL: v_insertelement_v4i16_dynamic_vgpr: 1970; GFX9: ; %bb.0: 1971; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1972; GFX9-NEXT: s_load_dword s4, s[8:9], 0x10 1973; GFX9-NEXT: global_load_dword v2, v[0:1], off glc 1974; GFX9-NEXT: s_waitcnt vmcnt(0) 1975; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 1976; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1977; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] 1978; GFX9-NEXT: s_mov_b64 s[2:3], 0xffff 1979; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v2 1980; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, s[2:3] 1981; GFX9-NEXT: s_pack_ll_b32_b16 s2, s4, s4 1982; GFX9-NEXT: s_waitcnt vmcnt(0) 1983; GFX9-NEXT: v_bfi_b32 v1, v3, s2, v1 1984; GFX9-NEXT: v_bfi_b32 v0, v2, s2, v0 1985; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] 1986; GFX9-NEXT: s_endpgm 1987; 1988; VI-LABEL: v_insertelement_v4i16_dynamic_vgpr: 1989; VI: ; %bb.0: 1990; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1991; VI-NEXT: s_load_dword s4, s[8:9], 0x10 1992; VI-NEXT: flat_load_dword v4, v[0:1] glc 1993; VI-NEXT: s_waitcnt vmcnt(0) 1994; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1995; VI-NEXT: s_waitcnt lgkmcnt(0) 1996; VI-NEXT: v_mov_b32_e32 v1, s3 1997; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1998; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1999; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 2000; VI-NEXT: s_mov_b64 s[2:3], 0xffff 2001; VI-NEXT: v_mov_b32_e32 v3, s1 2002; VI-NEXT: s_lshl_b32 s1, s4, 16 2003; VI-NEXT: s_and_b32 s4, s4, 0xffff 2004; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 2005; VI-NEXT: s_or_b32 s0, s4, s1 2006; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 2007; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v4 2008; VI-NEXT: v_lshlrev_b64 v[4:5], v4, s[2:3] 2009; VI-NEXT: s_waitcnt vmcnt(0) 2010; VI-NEXT: v_bfi_b32 v1, v5, s0, v1 2011; VI-NEXT: v_bfi_b32 v0, v4, s0, v0 2012; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 2013; VI-NEXT: s_endpgm 2014; 2015; 
CI-LABEL: v_insertelement_v4i16_dynamic_vgpr: 2016; CI: ; %bb.0: 2017; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2018; CI-NEXT: s_load_dword s4, s[8:9], 0x4 2019; CI-NEXT: flat_load_dword v4, v[0:1] glc 2020; CI-NEXT: s_waitcnt vmcnt(0) 2021; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 2022; CI-NEXT: s_waitcnt lgkmcnt(0) 2023; CI-NEXT: v_mov_b32_e32 v1, s3 2024; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 2025; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2026; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 2027; CI-NEXT: s_mov_b64 s[2:3], 0xffff 2028; CI-NEXT: v_mov_b32_e32 v3, s1 2029; CI-NEXT: s_lshl_b32 s1, s4, 16 2030; CI-NEXT: s_and_b32 s4, s4, 0xffff 2031; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 2032; CI-NEXT: s_or_b32 s0, s4, s1 2033; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 2034; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v4 2035; CI-NEXT: v_lshl_b64 v[4:5], s[2:3], v4 2036; CI-NEXT: s_waitcnt vmcnt(0) 2037; CI-NEXT: v_bfi_b32 v1, v5, s0, v1 2038; CI-NEXT: v_bfi_b32 v0, v4, s0, v0 2039; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 2040; CI-NEXT: s_endpgm 2041; 2042; GFX11-LABEL: v_insertelement_v4i16_dynamic_vgpr: 2043; GFX11: ; %bb.0: 2044; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 2045; GFX11-NEXT: global_load_b32 v2, v[0:1], off glc dlc 2046; GFX11-NEXT: s_waitcnt vmcnt(0) 2047; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2048; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x10 2049; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) 2050; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 2051; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2052; GFX11-NEXT: global_load_b64 v[0:1], v4, s[2:3] 2053; GFX11-NEXT: s_pack_ll_b32_b16 s2, s4, s4 2054; GFX11-NEXT: v_lshlrev_b32_e32 v2, 4, v2 2055; GFX11-NEXT: v_lshlrev_b64 v[2:3], v2, 0xffff 2056; GFX11-NEXT: s_waitcnt vmcnt(0) 2057; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) 2058; GFX11-NEXT: v_bfi_b32 v1, v3, s2, v1 2059; GFX11-NEXT: v_bfi_b32 v0, v2, s2, v0 2060; GFX11-NEXT: global_store_b64 v4, 
v[0:1], s[0:1] 2061; GFX11-NEXT: s_endpgm 2062 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 2063 %tid.ext = sext i32 %tid to i64 2064 %in.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %in, i64 %tid.ext 2065 %out.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %out, i64 %tid.ext 2066 %idx.val = load volatile i32, ptr addrspace(1) undef 2067 %vec = load <4 x i16>, ptr addrspace(1) %in.gep 2068 %val.trunc = trunc i32 %val to i16 2069 %val.cvt = bitcast i16 %val.trunc to i16 2070 %vecins = insertelement <4 x i16> %vec, i16 %val.cvt, i32 %idx.val 2071 store <4 x i16> %vecins, ptr addrspace(1) %out.gep 2072 ret void 2073} 2074 2075define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %idxval) #0 { 2076; GFX9-LABEL: v_insertelement_v4f16_dynamic_sgpr: 2077; GFX9: ; %bb.0: 2078; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2079; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 2080; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 2081; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2082; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] 2083; GFX9-NEXT: s_lshl_b32 s2, s5, 4 2084; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s4 2085; GFX9-NEXT: s_lshl_b64 s[2:3], 0xffff, s2 2086; GFX9-NEXT: v_mov_b32_e32 v3, s4 2087; GFX9-NEXT: v_mov_b32_e32 v4, s4 2088; GFX9-NEXT: s_waitcnt vmcnt(0) 2089; GFX9-NEXT: v_bfi_b32 v1, s3, v3, v1 2090; GFX9-NEXT: v_bfi_b32 v0, s2, v4, v0 2091; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 2092; GFX9-NEXT: s_endpgm 2093; 2094; VI-LABEL: v_insertelement_v4f16_dynamic_sgpr: 2095; VI: ; %bb.0: 2096; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2097; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 2098; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 2099; VI-NEXT: s_waitcnt lgkmcnt(0) 2100; VI-NEXT: v_mov_b32_e32 v1, s3 2101; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 2102; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2103; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 2104; VI-NEXT: v_mov_b32_e32 v3, s1 2105; 
VI-NEXT: s_lshl_b32 s1, s4, 16 2106; VI-NEXT: s_and_b32 s2, s4, 0xffff 2107; VI-NEXT: s_lshl_b32 s3, s5, 4 2108; VI-NEXT: s_or_b32 s2, s2, s1 2109; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 2110; VI-NEXT: s_lshl_b64 s[0:1], 0xffff, s3 2111; VI-NEXT: v_mov_b32_e32 v4, s2 2112; VI-NEXT: v_mov_b32_e32 v5, s2 2113; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 2114; VI-NEXT: s_waitcnt vmcnt(0) 2115; VI-NEXT: v_bfi_b32 v1, s1, v4, v1 2116; VI-NEXT: v_bfi_b32 v0, s0, v5, v0 2117; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 2118; VI-NEXT: s_endpgm 2119; 2120; CI-LABEL: v_insertelement_v4f16_dynamic_sgpr: 2121; CI: ; %bb.0: 2122; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2123; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 2124; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 2125; CI-NEXT: s_waitcnt lgkmcnt(0) 2126; CI-NEXT: v_mov_b32_e32 v1, s3 2127; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 2128; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2129; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 2130; CI-NEXT: v_mov_b32_e32 v3, s1 2131; CI-NEXT: s_and_b32 s1, s4, 0xffff 2132; CI-NEXT: s_lshl_b32 s2, s4, 16 2133; CI-NEXT: s_lshl_b32 s3, s5, 4 2134; CI-NEXT: s_or_b32 s2, s1, s2 2135; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 2136; CI-NEXT: s_lshl_b64 s[0:1], 0xffff, s3 2137; CI-NEXT: v_mov_b32_e32 v4, s2 2138; CI-NEXT: v_mov_b32_e32 v5, s2 2139; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 2140; CI-NEXT: s_waitcnt vmcnt(0) 2141; CI-NEXT: v_bfi_b32 v1, s1, v4, v1 2142; CI-NEXT: v_bfi_b32 v0, s0, v5, v0 2143; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 2144; CI-NEXT: s_endpgm 2145; 2146; GFX11-LABEL: v_insertelement_v4f16_dynamic_sgpr: 2147; GFX11: ; %bb.0: 2148; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 2149; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2150; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 2151; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 2152; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 2153; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2154; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] 2155; GFX11-NEXT: 
s_lshl_b32 s2, s5, 4 2156; GFX11-NEXT: s_pack_ll_b32_b16 s4, s4, s4 2157; GFX11-NEXT: s_lshl_b64 s[2:3], 0xffff, s2 2158; GFX11-NEXT: s_waitcnt vmcnt(0) 2159; GFX11-NEXT: v_bfi_b32 v1, s3, s4, v1 2160; GFX11-NEXT: v_bfi_b32 v0, s2, s4, v0 2161; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] 2162; GFX11-NEXT: s_endpgm 2163 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 2164 %tid.ext = sext i32 %tid to i64 2165 %in.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %in, i64 %tid.ext 2166 %out.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %out, i64 %tid.ext 2167 %vec = load <4 x half>, ptr addrspace(1) %in.gep 2168 %val.trunc = trunc i32 %val to i16 2169 %val.cvt = bitcast i16 %val.trunc to half 2170 %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 %idxval 2171 store <4 x half> %vecins, ptr addrspace(1) %out.gep 2172 ret void 2173} 2174 2175define amdgpu_kernel void @v_insertelement_v8f16_3(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) { 2176; GFX9-LABEL: v_insertelement_v8f16_3: 2177; GFX9: ; %bb.0: 2178; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2179; GFX9-NEXT: s_load_dword s4, s[8:9], 0x10 2180; GFX9-NEXT: v_lshlrev_b32_e32 v4, 4, v0 2181; GFX9-NEXT: v_mov_b32_e32 v5, 0x5040100 2182; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2183; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] 2184; GFX9-NEXT: s_waitcnt vmcnt(0) 2185; GFX9-NEXT: v_perm_b32 v1, s4, v1, v5 2186; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 2187; GFX9-NEXT: s_endpgm 2188; 2189; VI-LABEL: v_insertelement_v8f16_3: 2190; VI: ; %bb.0: 2191; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2192; VI-NEXT: s_load_dword s4, s[8:9], 0x10 2193; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 2194; VI-NEXT: s_waitcnt lgkmcnt(0) 2195; VI-NEXT: v_mov_b32_e32 v1, s3 2196; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 2197; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2198; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 2199; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4 2200; VI-NEXT: s_lshl_b32 s0, s4, 16 
2201; VI-NEXT: v_mov_b32_e32 v5, s1 2202; VI-NEXT: v_mov_b32_e32 v6, s0 2203; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc 2204; VI-NEXT: s_waitcnt vmcnt(0) 2205; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 2206; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 2207; VI-NEXT: s_endpgm 2208; 2209; CI-LABEL: v_insertelement_v8f16_3: 2210; CI: ; %bb.0: 2211; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2212; CI-NEXT: s_load_dword s4, s[8:9], 0x4 2213; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 2214; CI-NEXT: s_waitcnt lgkmcnt(0) 2215; CI-NEXT: v_mov_b32_e32 v1, s3 2216; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 2217; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2218; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 2219; CI-NEXT: v_mov_b32_e32 v5, s1 2220; CI-NEXT: v_add_i32_e32 v4, vcc, s0, v4 2221; CI-NEXT: s_lshl_b32 s0, s4, 16 2222; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc 2223; CI-NEXT: s_waitcnt vmcnt(0) 2224; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1 2225; CI-NEXT: v_or_b32_e32 v1, s0, v1 2226; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 2227; CI-NEXT: s_endpgm 2228; 2229; GFX11-LABEL: v_insertelement_v8f16_3: 2230; GFX11: ; %bb.0: 2231; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 2232; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2233; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x10 2234; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 2235; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0 2236; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2237; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] 2238; GFX11-NEXT: s_waitcnt vmcnt(0) 2239; GFX11-NEXT: v_perm_b32 v1, s4, v1, 0x5040100 2240; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] 2241; GFX11-NEXT: s_endpgm 2242 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 2243 %tid.ext = sext i32 %tid to i64 2244 %in.gep = getelementptr inbounds <8 x half>, ptr addrspace(1) %in, i64 %tid.ext 2245 %out.gep = getelementptr inbounds <8 x half>, ptr addrspace(1) %out, i64 %tid.ext 2246 %vec = load <8 x half>, ptr addrspace(1) 
%in.gep 2247 %val.trunc = trunc i32 %val to i16 2248 %val.cvt = bitcast i16 %val.trunc to half 2249 %vecins = insertelement <8 x half> %vec, half %val.cvt, i32 3 2250 store <8 x half> %vecins, ptr addrspace(1) %out.gep 2251 ret void 2252} 2253 2254define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) { 2255; GFX9-LABEL: v_insertelement_v8i16_6: 2256; GFX9: ; %bb.0: 2257; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2258; GFX9-NEXT: s_load_dword s4, s[8:9], 0x10 2259; GFX9-NEXT: v_lshlrev_b32_e32 v4, 4, v0 2260; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2261; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] 2262; GFX9-NEXT: s_mov_b32 s2, 0xffff 2263; GFX9-NEXT: v_mov_b32_e32 v5, s4 2264; GFX9-NEXT: s_waitcnt vmcnt(0) 2265; GFX9-NEXT: v_bfi_b32 v3, s2, v5, v3 2266; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 2267; GFX9-NEXT: s_endpgm 2268; 2269; VI-LABEL: v_insertelement_v8i16_6: 2270; VI: ; %bb.0: 2271; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2272; VI-NEXT: s_load_dword s4, s[8:9], 0x10 2273; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 2274; VI-NEXT: s_waitcnt lgkmcnt(0) 2275; VI-NEXT: v_mov_b32_e32 v1, s3 2276; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 2277; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2278; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 2279; VI-NEXT: v_mov_b32_e32 v5, s1 2280; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4 2281; VI-NEXT: s_mov_b32 s0, 0xffff 2282; VI-NEXT: v_mov_b32_e32 v6, s4 2283; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc 2284; VI-NEXT: s_waitcnt vmcnt(0) 2285; VI-NEXT: v_bfi_b32 v3, s0, v6, v3 2286; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 2287; VI-NEXT: s_endpgm 2288; 2289; CI-LABEL: v_insertelement_v8i16_6: 2290; CI: ; %bb.0: 2291; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2292; CI-NEXT: s_load_dword s4, s[8:9], 0x4 2293; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 2294; CI-NEXT: s_waitcnt lgkmcnt(0) 2295; CI-NEXT: v_mov_b32_e32 v1, s3 2296; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 2297; CI-NEXT: 
v_addc_u32_e32 v1, vcc, 0, v1, vcc 2298; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 2299; CI-NEXT: v_mov_b32_e32 v5, s1 2300; CI-NEXT: v_add_i32_e32 v4, vcc, s0, v4 2301; CI-NEXT: s_mov_b32 s0, 0xffff 2302; CI-NEXT: v_mov_b32_e32 v6, s4 2303; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc 2304; CI-NEXT: s_waitcnt vmcnt(0) 2305; CI-NEXT: v_bfi_b32 v3, s0, v6, v3 2306; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 2307; CI-NEXT: s_endpgm 2308; 2309; GFX11-LABEL: v_insertelement_v8i16_6: 2310; GFX11: ; %bb.0: 2311; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 2312; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2313; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x10 2314; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 2315; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0 2316; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2317; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] 2318; GFX11-NEXT: s_waitcnt vmcnt(0) 2319; GFX11-NEXT: v_bfi_b32 v3, 0xffff, s4, v3 2320; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] 2321; GFX11-NEXT: s_endpgm 2322 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 2323 %tid.ext = sext i32 %tid to i64 2324 %in.gep = getelementptr inbounds <8 x i16>, ptr addrspace(1) %in, i64 %tid.ext 2325 %out.gep = getelementptr inbounds <8 x i16>, ptr addrspace(1) %out, i64 %tid.ext 2326 %vec = load <8 x i16>, ptr addrspace(1) %in.gep 2327 %val.trunc = trunc i32 %val to i16 2328 %val.cvt = bitcast i16 %val.trunc to i16 2329 %vecins = insertelement <8 x i16> %vec, i16 %val.cvt, i32 6 2330 store <8 x i16> %vecins, ptr addrspace(1) %out.gep 2331 ret void 2332} 2333 2334define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %n) { 2335; GFX9-LABEL: v_insertelement_v8f16_dynamic: 2336; GFX9: ; %bb.0: 2337; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2338; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 2339; GFX9-NEXT: v_lshlrev_b32_e32 v4, 4, v0 2340; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2341; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] 2342; 
GFX9-NEXT: s_cmp_eq_u32 s5, 6 2343; GFX9-NEXT: v_mov_b32_e32 v5, s4 2344; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 2345; GFX9-NEXT: s_cmp_eq_u32 s5, 7 2346; GFX9-NEXT: s_mov_b32 s2, 0x5040100 2347; GFX9-NEXT: s_waitcnt vmcnt(0) 2348; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v5, vcc 2349; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 2350; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 2351; GFX9-NEXT: s_cmp_eq_u32 s5, 4 2352; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 2353; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 2354; GFX9-NEXT: s_cmp_eq_u32 s5, 5 2355; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v2 2356; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 2357; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 2358; GFX9-NEXT: s_cmp_eq_u32 s5, 2 2359; GFX9-NEXT: v_perm_b32 v3, v3, v6, s2 2360; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v5, vcc 2361; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 2362; GFX9-NEXT: s_cmp_eq_u32 s5, 3 2363; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v1 2364; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc 2365; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 2366; GFX9-NEXT: s_cmp_eq_u32 s5, 0 2367; GFX9-NEXT: v_perm_b32 v2, v6, v2, s2 2368; GFX9-NEXT: v_cndmask_b32_e32 v6, v8, v5, vcc 2369; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 2370; GFX9-NEXT: s_cmp_eq_u32 s5, 1 2371; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v0 2372; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc 2373; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 2374; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc 2375; GFX9-NEXT: v_perm_b32 v1, v6, v1, s2 2376; GFX9-NEXT: v_perm_b32 v0, v5, v0, s2 2377; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 2378; GFX9-NEXT: s_endpgm 2379; 2380; VI-LABEL: v_insertelement_v8f16_dynamic: 2381; VI: ; %bb.0: 2382; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2383; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 2384; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 2385; VI-NEXT: s_waitcnt lgkmcnt(0) 2386; VI-NEXT: v_mov_b32_e32 v1, s3 2387; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 2388; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2389; VI-NEXT: flat_load_dwordx4 
v[0:3], v[0:1] 2390; VI-NEXT: v_mov_b32_e32 v5, s1 2391; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4 2392; VI-NEXT: s_cmp_eq_u32 s5, 6 2393; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc 2394; VI-NEXT: v_mov_b32_e32 v6, s4 2395; VI-NEXT: s_cselect_b64 vcc, -1, 0 2396; VI-NEXT: s_cmp_eq_u32 s5, 7 2397; VI-NEXT: s_waitcnt vmcnt(0) 2398; VI-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc 2399; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 2400; VI-NEXT: s_cselect_b64 vcc, -1, 0 2401; VI-NEXT: s_cmp_eq_u32 s5, 4 2402; VI-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 2403; VI-NEXT: s_cselect_b64 vcc, -1, 0 2404; VI-NEXT: s_cmp_eq_u32 s5, 5 2405; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 2406; VI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc 2407; VI-NEXT: s_cselect_b64 vcc, -1, 0 2408; VI-NEXT: s_cmp_eq_u32 s5, 2 2409; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 2410; VI-NEXT: v_cndmask_b32_e32 v8, v8, v6, vcc 2411; VI-NEXT: s_cselect_b64 vcc, -1, 0 2412; VI-NEXT: s_cmp_eq_u32 s5, 3 2413; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 2414; VI-NEXT: v_or_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 2415; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 2416; VI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc 2417; VI-NEXT: s_cselect_b64 vcc, -1, 0 2418; VI-NEXT: s_cmp_eq_u32 s5, 0 2419; VI-NEXT: v_or_b32_sdwa v2, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 2420; VI-NEXT: v_cndmask_b32_e32 v7, v9, v6, vcc 2421; VI-NEXT: s_cselect_b64 vcc, -1, 0 2422; VI-NEXT: s_cmp_eq_u32 s5, 1 2423; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 2424; VI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc 2425; VI-NEXT: s_cselect_b64 vcc, -1, 0 2426; VI-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc 2427; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 2428; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 2429; VI-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 2430; VI-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 2431; VI-NEXT: 
flat_store_dwordx4 v[4:5], v[0:3] 2432; VI-NEXT: s_endpgm 2433; 2434; CI-LABEL: v_insertelement_v8f16_dynamic: 2435; CI: ; %bb.0: 2436; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2437; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 2438; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 2439; CI-NEXT: s_waitcnt lgkmcnt(0) 2440; CI-NEXT: v_mov_b32_e32 v1, s3 2441; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 2442; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2443; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 2444; CI-NEXT: v_mov_b32_e32 v5, s1 2445; CI-NEXT: v_add_i32_e32 v4, vcc, s0, v4 2446; CI-NEXT: v_cvt_f32_f16_e32 v6, s4 2447; CI-NEXT: s_cmp_eq_u32 s5, 7 2448; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc 2449; CI-NEXT: s_cselect_b64 vcc, -1, 0 2450; CI-NEXT: s_cmp_eq_u32 s5, 6 2451; CI-NEXT: s_cselect_b64 s[0:1], -1, 0 2452; CI-NEXT: s_cmp_eq_u32 s5, 5 2453; CI-NEXT: s_cselect_b64 s[2:3], -1, 0 2454; CI-NEXT: s_cmp_eq_u32 s5, 4 2455; CI-NEXT: s_waitcnt vmcnt(0) 2456; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 2457; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 2458; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 2459; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 2460; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 2461; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 2462; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 2463; CI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 2464; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 2465; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 2466; CI-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] 2467; CI-NEXT: s_cselect_b64 s[0:1], -1, 0 2468; CI-NEXT: s_cmp_eq_u32 s5, 3 2469; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 2470; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 2471; CI-NEXT: v_cndmask_b32_e32 v7, v7, v6, vcc 2472; CI-NEXT: s_cselect_b64 vcc, -1, 0 2473; CI-NEXT: s_cmp_eq_u32 s5, 2 2474; CI-NEXT: v_cndmask_b32_e32 v9, v9, v6, vcc 2475; CI-NEXT: s_cselect_b64 vcc, -1, 0 2476; CI-NEXT: s_cmp_eq_u32 s5, 1 2477; CI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc 2478; CI-NEXT: s_cselect_b64 vcc, -1, 0 2479; CI-NEXT: s_cmp_eq_u32 s5, 0 2480; CI-NEXT: v_cndmask_b32_e64 v8, v8, v6, 
s[2:3] 2481; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 2482; CI-NEXT: v_cndmask_b32_e32 v10, v10, v6, vcc 2483; CI-NEXT: s_cselect_b64 vcc, -1, 0 2484; CI-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] 2485; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 2486; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 2487; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 2488; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 2489; CI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc 2490; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 2491; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 2492; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 2493; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 2494; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 2495; CI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 2496; CI-NEXT: v_or_b32_e32 v3, v3, v6 2497; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v10 2498; CI-NEXT: v_or_b32_e32 v2, v2, v7 2499; CI-NEXT: v_or_b32_e32 v1, v1, v8 2500; CI-NEXT: v_or_b32_e32 v0, v0, v6 2501; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 2502; CI-NEXT: s_endpgm 2503; 2504; GFX11-LABEL: v_insertelement_v8f16_dynamic: 2505; GFX11: ; %bb.0: 2506; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 2507; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2508; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 2509; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 2510; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0 2511; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2512; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] 2513; GFX11-NEXT: s_cmp_eq_u32 s5, 6 2514; GFX11-NEXT: s_cselect_b32 s2, -1, 0 2515; GFX11-NEXT: s_cmp_eq_u32 s5, 7 2516; GFX11-NEXT: s_waitcnt vmcnt(0) 2517; GFX11-NEXT: v_cndmask_b32_e64 v5, v3, s4, s2 2518; GFX11-NEXT: s_cselect_b32 s2, -1, 0 2519; GFX11-NEXT: s_cmp_eq_u32 s5, 4 2520; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 2521; GFX11-NEXT: s_cselect_b32 s3, -1, 0 2522; GFX11-NEXT: s_cmp_eq_u32 s5, 5 2523; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2 2524; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s4, s3 2525; GFX11-NEXT: s_cselect_b32 s3, -1, 0 2526; GFX11-NEXT: s_cmp_eq_u32 s5, 2 2527; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v1 2528; GFX11-NEXT: 
v_cndmask_b32_e64 v3, v3, s4, s2
; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
; GFX11-NEXT:    s_cmp_eq_u32 s5, 3
; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s4, s2
; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
; GFX11-NEXT:    s_cmp_eq_u32 s5, 0
; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
; GFX11-NEXT:    v_cndmask_b32_e64 v7, v7, s4, s2
; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
; GFX11-NEXT:    s_cmp_eq_u32 s5, 1
; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, s4, s2
; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
; GFX11-NEXT:    v_cndmask_b32_e64 v6, v6, s4, s3
; GFX11-NEXT:    v_cndmask_b32_e64 v8, v8, s4, s2
; GFX11-NEXT:    v_perm_b32 v3, v3, v5, 0x5040100
; GFX11-NEXT:    v_perm_b32 v1, v7, v1, 0x5040100
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT:    v_perm_b32 v2, v6, v2, 0x5040100
; GFX11-NEXT:    v_perm_b32 v0, v8, v0, 0x5040100
; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <8 x half>, ptr addrspace(1) %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <8 x half>, ptr addrspace(1) %out, i64 %tid.ext
  %vec = load <8 x half>, ptr addrspace(1) %in.gep
  %val.trunc = trunc i32 %val to i16
  %val.cvt = bitcast i16 %val.trunc to half
  %vecins = insertelement <8 x half> %vec, half %val.cvt, i32 %n
  store <8 x half> %vecins, ptr addrspace(1) %out.gep
  ret void
}

; Constant-index insert into a <16 x half>: each workitem loads the vector at
; %in[workitem.id.x], overwrites element 3 with the low 16 bits of %val
; (reinterpreted as half), and stores the result to %out at the same index.
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_kernel void @v_insertelement_v16f16_3(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) {
; GFX9-LABEL: v_insertelement_v16f16_3:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT:    s_load_dword s4, s[8:9], 0x10
; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
; GFX9-NEXT:    v_mov_b32_e32 v9, 0x5040100
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx4 v[0:3], v8, s[2:3]
; GFX9-NEXT:    global_load_dwordx4 v[4:7], v8, s[2:3] offset:16
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_perm_b32 v1, s4, v1, v9
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
; GFX9-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_insertelement_v16f16_3:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT:    s_load_dword s4, s[8:9], 0x10
; VI-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v8
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_add_u32_e32 v4, vcc, 16, v0
; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
; VI-NEXT:    v_mov_b32_e32 v9, s1
; VI-NEXT:    v_add_u32_e32 v8, vcc, s0, v8
; VI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
; VI-NEXT:    s_lshl_b32 s1, s4, 16
; VI-NEXT:    v_add_u32_e32 v10, vcc, 16, v8
; VI-NEXT:    v_mov_b32_e32 v12, s1
; VI-NEXT:    v_addc_u32_e32 v11, vcc, 0, v9, vcc
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_or_b32_sdwa v1, v1, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
; VI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: v_insertelement_v16f16_3:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT:    s_load_dword s4, s[8:9], 0x4
; CI-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v0, s3
; CI-NEXT:    v_add_i32_e32 v4, vcc, s2, v8
; CI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v0, vcc
; CI-NEXT:    flat_load_dwordx4 v[0:3], v[4:5]
; CI-NEXT:    v_add_i32_e32 v4, vcc, 16, v4
; CI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
; CI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
; CI-NEXT:    v_mov_b32_e32 v9, s1
; CI-NEXT:    v_add_i32_e32 v8, vcc, s0, v8
; CI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
; CI-NEXT:    v_add_i32_e32 v10, vcc, 16, v8
; CI-NEXT:    s_lshl_b32 s1, s4, 16
; CI-NEXT:    v_addc_u32_e32 v11, vcc, 0, v9, vcc
; CI-NEXT:    s_waitcnt vmcnt(1)
; CI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; CI-NEXT:    v_or_b32_e32 v1, s1, v1
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
; CI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
; CI-NEXT:    s_endpgm
;
; GFX11-LABEL: v_insertelement_v16f16_3:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x10
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_load_b128 v[0:3], v8, s[2:3]
; GFX11-NEXT:    global_load_b128 v[4:7], v8, s[2:3] offset:16
; GFX11-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NEXT:    v_perm_b32 v1, s4, v1, 0x5040100
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_store_b128 v8, v[4:7], s[0:1] offset:16
; GFX11-NEXT:    global_store_b128 v8, v[0:3], s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %idx = sext i32 %tid to i64
  %src = getelementptr inbounds <16 x half>, ptr addrspace(1) %in, i64 %idx
  %dst = getelementptr inbounds <16 x half>, ptr addrspace(1) %out, i64 %idx
  %vec = load <16 x half>, ptr addrspace(1) %src
  ; Only the low 16 bits of %val are inserted, viewed as a half.
  %bits = trunc i32 %val to i16
  %elt = bitcast i16 %bits to half
  %result = insertelement <16 x half> %vec, half %elt, i32 3
  store <16 x half> %result, ptr addrspace(1) %dst
  ret void
}

define amdgpu_kernel void
@v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) {
; GFX9-LABEL: v_insertelement_v16i16_6:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT:    s_load_dword s4, s[8:9], 0x10
; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx4 v[0:3], v8, s[2:3]
; GFX9-NEXT:    global_load_dwordx4 v[4:7], v8, s[2:3] offset:16
; GFX9-NEXT:    s_mov_b32 s2, 0xffff
; GFX9-NEXT:    v_mov_b32_e32 v9, s4
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_bfi_b32 v3, s2, v9, v3
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
; GFX9-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_insertelement_v16i16_6:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT:    s_load_dword s4, s[8:9], 0x10
; VI-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
; VI-NEXT:    v_mov_b32_e32 v12, 0x3020504
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v8
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_add_u32_e32 v4, vcc, 16, v0
; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
; VI-NEXT:    v_mov_b32_e32 v9, s1
; VI-NEXT:    v_add_u32_e32 v8, vcc, s0, v8
; VI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
; VI-NEXT:    v_add_u32_e32 v10, vcc, 16, v8
; VI-NEXT:    v_addc_u32_e32 v11, vcc, 0, v9, vcc
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_perm_b32 v3, s4, v3, v12
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
; VI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: v_insertelement_v16i16_6:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT:    s_load_dword s4, s[8:9], 0x4
; CI-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v1, s3
; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v8
; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT:    v_add_i32_e32 v4, vcc, 16, v0
; CI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
; CI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
; CI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
; CI-NEXT:    v_mov_b32_e32 v9, s1
; CI-NEXT:    v_add_i32_e32 v8, vcc, s0, v8
; CI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
; CI-NEXT:    v_add_i32_e32 v10, vcc, 16, v8
; CI-NEXT:    s_mov_b32 s2, 0xffff
; CI-NEXT:    v_mov_b32_e32 v12, s4
; CI-NEXT:    v_addc_u32_e32 v11, vcc, 0, v9, vcc
; CI-NEXT:    s_waitcnt vmcnt(1)
; CI-NEXT:    v_bfi_b32 v3, s2, v12, v3
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
; CI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
; CI-NEXT:    s_endpgm
;
; GFX11-LABEL: v_insertelement_v16i16_6:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x10
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_load_b128 v[0:3], v8, s[2:3]
; GFX11-NEXT:    global_load_b128 v[4:7], v8, s[2:3] offset:16
; GFX11-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NEXT:    v_bfi_b32 v3, 0xffff, s4, v3
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_store_b128 v8, v[4:7], s[0:1] offset:16
; GFX11-NEXT:    global_store_b128 v8, v[0:3], s[0:1]
; GFX11-NEXT:    s_endpgm
  ; IR under test: each workitem loads the <16 x i16> at %in[workitem.id.x],
  ; overwrites element 6 with the low 16 bits of %val, and stores the result
  ; to %out at the same index.
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <16 x i16>, ptr addrspace(1) %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <16 x i16>, ptr addrspace(1) %out, i64 %tid.ext
  %vec = load <16 x i16>, ptr addrspace(1) %in.gep
  ; The trunc already yields an i16, so it is inserted directly; no cast is
  ; needed for the integer element type.
  %val.trunc = trunc i32 %val to i16
  %vecins = insertelement <16 x i16> %vec, i16 %val.trunc, i32 6
  store <16 x i16> %vecins, ptr addrspace(1) %out.gep
  ret void
}

define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %n) {
; GFX9-LABEL: v_insertelement_v16f16_dynamic:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx4 v[1:4], v0, s[2:3]
; GFX9-NEXT:    global_load_dwordx4 v[5:8], v0, s[2:3] offset:16
; GFX9-NEXT:    s_cmp_eq_u32 s5, 6
; GFX9-NEXT:    v_mov_b32_e32 v9, s4
; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
; GFX9-NEXT:    s_cmp_eq_u32 s5, 7
; GFX9-NEXT:    s_mov_b32 s2, 0x5040100
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_cndmask_b32_e32 v10, v4, v9, vcc
; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
; GFX9-NEXT:    s_cmp_eq_u32 s5, 4
; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v9, vcc
; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
; GFX9-NEXT:    s_cmp_eq_u32 s5, 5
; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v9, vcc
; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
; GFX9-NEXT:    s_cmp_eq_u32 s5, 2
; GFX9-NEXT:    v_perm_b32 v4, v4, v10, s2
; GFX9-NEXT:    v_cndmask_b32_e32 v10, v11, v9, vcc
; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
; GFX9-NEXT:    s_cmp_eq_u32 s5, 3
; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 16, v2
; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v9, vcc
; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
; GFX9-NEXT:    s_cmp_eq_u32 s5, 0
; GFX9-NEXT:    v_cndmask_b32_e32 v11, v12, v9, vcc
; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
;
GFX9-NEXT:    s_cmp_eq_u32 s5, 1
; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v1
; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
; GFX9-NEXT:    s_cmp_eq_u32 s5, 14
; GFX9-NEXT:    v_cndmask_b32_e32 v12, v13, v9, vcc
; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
; GFX9-NEXT:    s_cmp_eq_u32 s5, 15
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 16, v8
; GFX9-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
; GFX9-NEXT:    s_cmp_eq_u32 s5, 12
; GFX9-NEXT:    v_perm_b32 v1, v12, v1, s2
; GFX9-NEXT:    v_cndmask_b32_e32 v12, v14, v9, vcc
; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
; GFX9-NEXT:    s_cmp_eq_u32 s5, 13
; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v7
; GFX9-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
; GFX9-NEXT:    s_cmp_eq_u32 s5, 10
; GFX9-NEXT:    v_perm_b32 v8, v12, v8, s2
; GFX9-NEXT:    v_cndmask_b32_e32 v12, v15, v9, vcc
; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
; GFX9-NEXT:    s_cmp_eq_u32 s5, 11
; GFX9-NEXT:    v_perm_b32 v3, v10, v3, s2
; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v6
; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v9, vcc
; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
; GFX9-NEXT:    s_cmp_eq_u32 s5, 8
; GFX9-NEXT:    v_cndmask_b32_e32 v10, v10, v9, vcc
; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
; GFX9-NEXT:    s_cmp_eq_u32 s5, 9
; GFX9-NEXT:    v_perm_b32 v2, v11, v2, s2
; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v5
; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v9, vcc
; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
; GFX9-NEXT:    v_cndmask_b32_e32 v9, v11, v9, vcc
; GFX9-NEXT:    v_perm_b32 v7, v12, v7, s2
; GFX9-NEXT:    v_perm_b32 v6, v10, v6, s2
; GFX9-NEXT:    v_perm_b32 v5, v9, v5, s2
; GFX9-NEXT:    global_store_dwordx4 v0, v[5:8], s[0:1] offset:16
; GFX9-NEXT:    global_store_dwordx4 v0, v[1:4], s[0:1]
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_insertelement_v16f16_dynamic:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x10
; VI-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s3
; VI-NEXT:    v_add_u32_e32 v4, vcc, s2, v8
; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v0, vcc
; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v4
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v5, vcc
; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
; VI-NEXT:    v_mov_b32_e32 v9, s1
; VI-NEXT:    v_add_u32_e32 v8, vcc, s0, v8
; VI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
; VI-NEXT:    v_add_u32_e32 v10, vcc, 16, v8
; VI-NEXT:    s_cmp_eq_u32 s7, 14
; VI-NEXT:    v_addc_u32_e32 v11, vcc, 0, v9, vcc
; VI-NEXT:    v_mov_b32_e32 v12, s6
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    s_cmp_eq_u32 s7, 15
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_cndmask_b32_e32 v13, v3, v12, vcc
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    s_cmp_eq_u32 s7, 12
; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; VI-NEXT:    s_cmp_eq_u32 s7, 13
; VI-NEXT:    v_lshrrev_b32_e32 v14, 16, v2
; VI-NEXT:    v_cndmask_b32_e64 v2, v2, v12, s[0:1]
; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; VI-NEXT:    s_cmp_eq_u32 s7, 10
; VI-NEXT:    s_cselect_b64 s[2:3], -1, 0
; VI-NEXT:    s_cmp_eq_u32 s7, 11
; VI-NEXT:    v_lshrrev_b32_e32 v15, 16, v1
; VI-NEXT:    v_cndmask_b32_e64 v1, v1, v12, s[2:3]
; VI-NEXT:    s_cselect_b64 s[2:3], -1, 0
; VI-NEXT:    s_cmp_eq_u32 s7, 8
; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
; VI-NEXT:    s_cselect_b64 s[4:5], -1, 0
; VI-NEXT:    v_cndmask_b32_e64 v15, v15, v12, s[2:3]
; VI-NEXT:    s_cmp_eq_u32 s7, 9
; VI-NEXT:    v_lshrrev_b32_e32 v16, 16, v0
; VI-NEXT:    v_cndmask_b32_e32 v3, v3, v12, vcc
; VI-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    s_cmp_eq_u32 s7, 6
; VI-NEXT:    v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    v_cndmask_b32_e32 v15, v16, v12, vcc
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    s_cmp_eq_u32 s7, 7
; VI-NEXT:    v_cndmask_b32_e64 v0, v0, v12, s[4:5]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v17, 16, v7
; VI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; VI-NEXT:    v_cndmask_b32_e64 v14, v14, v12, s[0:1]
; VI-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
; VI-NEXT:    v_cndmask_b32_e32 v7, v7, v12, vcc
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    s_cmp_eq_u32 s7, 4
; VI-NEXT:    v_or_b32_sdwa v3, v13, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
; VI-NEXT:    v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    v_cndmask_b32_e32 v15, v17, v12, vcc
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    s_cmp_eq_u32 s7, 5
; VI-NEXT:    v_or_b32_sdwa v2, v2, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    v_lshrrev_b32_e32 v13, 16, v6
; VI-NEXT:    v_cndmask_b32_e32 v6, v6, v12, vcc
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    s_cmp_eq_u32 s7, 2
; VI-NEXT:    v_cndmask_b32_e32 v13, v13, v12, vcc
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    s_cmp_eq_u32 s7, 3
; VI-NEXT:    v_lshrrev_b32_e32 v14, 16, v5
; VI-NEXT:    v_cndmask_b32_e32 v5, v5, v12, vcc
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    s_cmp_eq_u32 s7, 0
; VI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
; VI-NEXT:    v_cndmask_b32_e32 v14, v14, v12, vcc
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    s_cmp_eq_u32 s7, 1
; VI-NEXT:    v_or_b32_sdwa v6, v6, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    v_lshrrev_b32_e32 v13, 16, v4
; VI-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    v_cndmask_b32_e32 v12, v13, v12, vcc
; VI-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
; VI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
; VI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
; VI-NEXT:    v_or_b32_sdwa v7, v7, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    v_or_b32_sdwa v5, v5, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    v_or_b32_sdwa v4, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
; VI-NEXT:    flat_store_dwordx4 v[10:11], v[0:3]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: v_insertelement_v16f16_dynamic:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT:    v_lshlrev_b32_e32 v4, 5, v0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v1, s3
; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v4
; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT:    v_add_i32_e32 v2, vcc, 16, v0
; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
; CI-NEXT:    flat_load_dwordx4 v[7:10], v[2:3]
; CI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
; CI-NEXT:    v_mov_b32_e32 v5, s1
; CI-NEXT:    v_add_i32_e32 v4, vcc, s0, v4
; CI-NEXT:    v_cvt_f32_f16_e32 v6, s4
; CI-NEXT:    s_cmp_eq_u32 s5, 15
; CI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
; CI-NEXT:    s_cselect_b64 vcc, -1, 0
; CI-NEXT:    s_cmp_eq_u32 s5, 14
; CI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; CI-NEXT:    s_cmp_eq_u32 s5, 13
; CI-NEXT:    s_cselect_b64 s[2:3], -1, 0
; CI-NEXT:    s_cmp_eq_u32 s5, 12
; CI-NEXT:    s_waitcnt vmcnt(1)
; CI-NEXT:    v_lshrrev_b32_e32 v11, 16, v10
; CI-NEXT:    v_cvt_f32_f16_e32 v10, v10
; CI-NEXT:    v_lshrrev_b32_e32 v12, 16, v9
; CI-NEXT:    v_lshrrev_b32_e32 v13, 16, v8
; CI-NEXT:    v_cvt_f32_f16_e32 v11, v11
; CI-NEXT:    v_cvt_f32_f16_e32 v12, v12
; CI-NEXT:    v_cvt_f32_f16_e32 v9, v9
; CI-NEXT:    v_cvt_f32_f16_e32 v13, v13
; CI-NEXT:    v_cvt_f32_f16_e32 v8, v8
; CI-NEXT:    v_cndmask_b32_e64 v10, v10, v6, s[0:1]
; CI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; CI-NEXT:    s_cmp_eq_u32 s5, 11
; CI-NEXT:    v_cndmask_b32_e32 v11, v11, v6, vcc
; CI-NEXT:    v_cndmask_b32_e64 v12, v12, v6, s[2:3]
; CI-NEXT:    s_cselect_b64 vcc, -1, 0
; CI-NEXT:    s_cmp_eq_u32 s5, 10
; CI-NEXT:    v_cndmask_b32_e64 v9, v9, v6, s[0:1]
; CI-NEXT:    v_cndmask_b32_e32 v13, v13, v6, vcc
; CI-NEXT:    v_cvt_f16_f32_e32 v12, v12
; CI-NEXT:    s_cselect_b64 vcc, -1, 0
; CI-NEXT:    v_cvt_f16_f32_e32 v9, v9
; CI-NEXT:    v_cvt_f16_f32_e32 v13, v13
; CI-NEXT:    v_cndmask_b32_e32 v8, v8, v6, vcc
; CI-NEXT:    v_cvt_f16_f32_e32 v8, v8
; CI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
; CI-NEXT:    v_lshrrev_b32_e32 v14, 16, v7
; CI-NEXT:    v_or_b32_e32 v9, v9, v12
; CI-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
; CI-NEXT:    v_or_b32_e32 v8, v8, v12
; CI-NEXT:    v_cvt_f32_f16_e32 v12, v14
; CI-NEXT:    v_cvt_f32_f16_e32 v7, v7
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_lshrrev_b32_e32 v15, 16, v3
; CI-NEXT:    v_cvt_f32_f16_e32 v13, v15
; CI-NEXT:    s_cmp_eq_u32 s5, 9
; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
; CI-NEXT:    v_lshrrev_b32_e32 v16, 16, v2
; CI-NEXT:    s_cselect_b64 vcc, -1, 0
; CI-NEXT:    s_cmp_eq_u32 s5, 8
; CI-NEXT:    v_cvt_f32_f16_e32 v14, v16
; CI-NEXT:    v_cndmask_b32_e32 v12, v12, v6, vcc
; CI-NEXT:    s_cselect_b64 vcc, -1, 0
; CI-NEXT:    s_cmp_eq_u32 s5, 7
; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; CI-NEXT:    v_cndmask_b32_e32 v7, v7, v6, vcc
; CI-NEXT:    s_cselect_b64 vcc, -1, 0
; CI-NEXT:    s_cmp_eq_u32 s5, 6
; CI-NEXT:    v_cndmask_b32_e32 v13, v13, v6, vcc
; CI-NEXT:    s_cselect_b64 vcc, -1, 0
; CI-NEXT:    s_cmp_eq_u32 s5, 5
; CI-NEXT:    v_cvt_f16_f32_e32 v11, v11
; CI-NEXT:    v_cvt_f16_f32_e32 v12, v12
; CI-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
; CI-NEXT:    s_cselect_b64 vcc, -1, 0
; CI-NEXT:    s_cmp_eq_u32 s5, 4
; CI-NEXT:    v_cvt_f16_f32_e32 v10, v10
; CI-NEXT:    v_cvt_f16_f32_e32 v7, v7
; CI-NEXT:    v_cvt_f16_f32_e32 v13, v13
; CI-NEXT:    v_cndmask_b32_e32 v14, v14, v6, vcc
; CI-NEXT:    s_cselect_b64 vcc, -1, 0
; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
; CI-NEXT:    v_cvt_f16_f32_e32 v14, v14
; CI-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; CI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
; CI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
; CI-NEXT:    v_or_b32_e32 v10, v10, v11
; CI-NEXT:    v_lshrrev_b32_e32 v11, 16, v1
; CI-NEXT:    v_or_b32_e32 v7, v7, v12
; CI-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
; CI-NEXT:    v_cvt_f32_f16_e32 v11, v11
; CI-NEXT:    v_or_b32_e32 v3, v3, v12
; CI-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; CI-NEXT:    v_or_b32_e32 v2, v2, v12
; CI-NEXT:    v_lshrrev_b32_e32 v12, 16, v0
; CI-NEXT:    s_cmp_eq_u32 s5, 3
; CI-NEXT:    v_cvt_f32_f16_e32 v12, v12
; CI-NEXT:    s_cselect_b64 vcc, -1, 0
; CI-NEXT:    s_cmp_eq_u32 s5, 2
; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; CI-NEXT:    v_cndmask_b32_e32 v11, v11, v6, vcc
; CI-NEXT:    s_cselect_b64 vcc, -1, 0
; CI-NEXT:    s_cmp_eq_u32 s5, 1
; CI-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
; CI-NEXT:    s_cselect_b64 vcc, -1, 0
; CI-NEXT:    s_cmp_eq_u32 s5, 0
; CI-NEXT:    v_cvt_f16_f32_e32 v11, v11
; CI-NEXT:    v_cndmask_b32_e32 v12, v12, v6, vcc
; CI-NEXT:    s_cselect_b64 vcc, -1, 0
; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; CI-NEXT:    v_cvt_f16_f32_e32 v12, v12
; CI-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; CI-NEXT:    v_lshlrev_b32_e32 v6, 16, v11
; CI-NEXT:    v_or_b32_e32 v1, v1, v6
; CI-NEXT:    v_lshlrev_b32_e32 v6, 16, v12
; CI-NEXT:    v_or_b32_e32 v0, v0, v6
; CI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; CI-NEXT:    s_nop 0
; CI-NEXT:    v_add_i32_e32 v0, vcc, 16, v4
; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v5, vcc
; CI-NEXT:    flat_store_dwordx4 v[0:1], v[7:10]
; CI-NEXT:    s_endpgm
;
; GFX11-LABEL: v_insertelement_v16f16_dynamic:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x10
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_load_b128 v[0:3], v8, s[2:3]
; GFX11-NEXT:    global_load_b128 v[4:7], v8, s[2:3] offset:16
; GFX11-NEXT:    s_cmp_eq_u32 s5, 6
; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
; GFX11-NEXT:    s_cmp_eq_u32 s5, 7
; GFX11-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NEXT:    v_cndmask_b32_e64 v9, v3, s4, s2
; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
; GFX11-NEXT:    s_cmp_eq_u32 s5, 4
; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
; GFX11-NEXT:    s_cselect_b32 s3, -1, 0
; GFX11-NEXT:    s_cmp_eq_u32 s5, 5
; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, s4, s3
; GFX11-NEXT:    s_cselect_b32 s3, -1, 0
; GFX11-NEXT:    s_cmp_eq_u32 s5, 2
; GFX11-NEXT:    v_lshrrev_b32_e32 v11, 16, v1
; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, s4, s2
; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
; GFX11-NEXT:    s_cmp_eq_u32 s5, 3
; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s4, s2
; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
; GFX11-NEXT:    s_cmp_eq_u32 s5, 0
; GFX11-NEXT:    v_lshrrev_b32_e32 v12, 16, v0
; GFX11-NEXT:    v_cndmask_b32_e64 v11, v11, s4, s2
; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
; GFX11-NEXT:    s_cmp_eq_u32 s5, 1
; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, s4, s2
; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
; GFX11-NEXT:    s_cmp_eq_u32 s5, 14
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 16, v7
; GFX11-NEXT:    v_cndmask_b32_e64 v10, v10, s4, s3
; GFX11-NEXT:    v_perm_b32 v3, v3, v9, 0x5040100
; GFX11-NEXT:    v_cndmask_b32_e64 v9, v12, s4, s2
; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
; GFX11-NEXT:    s_cmp_eq_u32 s5, 15
; GFX11-NEXT:    v_cndmask_b32_e64 v7, v7, s4, s2
; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
; GFX11-NEXT:    s_cmp_eq_u32 s5, 12
; GFX11-NEXT:    v_lshrrev_b32_e32 v14, 16, v6
; GFX11-NEXT:    v_perm_b32 v2, v10, v2, 0x5040100
; GFX11-NEXT:    v_cndmask_b32_e64 v10, v13, s4, s2
; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
; GFX11-NEXT:    s_cmp_eq_u32 s5, 13
; GFX11-NEXT:    v_cndmask_b32_e64 v6, v6, s4, s2
; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
; GFX11-NEXT:    s_cmp_eq_u32 s5, 10
; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 16, v5
; GFX11-NEXT:    v_cndmask_b32_e64 v12, v14, s4, s2
; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
; GFX11-NEXT:    s_cmp_eq_u32 s5, 11
; GFX11-NEXT:    v_cndmask_b32_e64 v5, v5, s4, s2
; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
; GFX11-NEXT:    s_cmp_eq_u32 s5, 8
; GFX11-NEXT:    v_lshrrev_b32_e32 v16, 16, v4
; GFX11-NEXT:    v_cndmask_b32_e64 v13, v15, s4, s2
; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
; GFX11-NEXT:    s_cmp_eq_u32 s5, 9
; GFX11-NEXT:    v_cndmask_b32_e64 v4, v4, s4, s2
; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
; GFX11-NEXT:    v_perm_b32 v7, v10, v7, 0x5040100
; GFX11-NEXT:    v_cndmask_b32_e64 v14, v16, s4, s2
; GFX11-NEXT:    v_perm_b32 v6, v12, v6, 0x5040100
; GFX11-NEXT:    v_perm_b32 v5, v13, v5, 0x5040100
; GFX11-NEXT:    v_perm_b32 v1, v11, v1, 0x5040100
; GFX11-NEXT:    v_perm_b32 v0, v9, v0, 0x5040100
; GFX11-NEXT:    v_perm_b32 v4, v14, v4, 0x5040100
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_store_b128 v8, v[4:7], s[0:1] offset:16
; GFX11-NEXT:    global_store_b128 v8, v[0:3], s[0:1]
; GFX11-NEXT:    s_endpgm
  ; IR under test: dynamic-index insert into a <16 x half>. Each workitem loads
  ; the vector at %in[workitem.id.x], overwrites the element selected by the
  ; runtime index %n with the low 16 bits of %val (reinterpreted as half), and
  ; stores the result to %out at the same index. The expected code above
  ; compares the index against each of the 16 element numbers and selects per
  ; element with v_cndmask.
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %idx = sext i32 %tid to i64
  %src = getelementptr inbounds <16 x half>, ptr addrspace(1) %in, i64 %idx
  %dst = getelementptr inbounds <16 x half>, ptr addrspace(1) %out, i64 %idx
  %vec = load <16 x half>, ptr addrspace(1) %src
  %bits = trunc i32 %val to i16
  %elt = bitcast i16 %bits to half
  %result = insertelement <16 x half> %vec, half %elt, i32 %n
  store <16 x half> %result, ptr addrspace(1) %dst
  ret void
}


declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }