; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s
; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=tonga < %s | FileCheck -check-prefix=VI %s
; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX900 %s
; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefix=GFX940 %s

; Code generation checks for insertelement on bfloat vectors, run once for each
; of the four -mcpu values listed in the run lines above. The assertion lines
; are machine generated: regenerate them with the script named in the first
; line instead of editing them by hand.

; Inserts the constant bfloat 5.0 at index 0 of a <2 x bfloat> that is loaded
; through the addrspace(4) pointer %vec.ptr, then stores the vector to %out.
define amdgpu_kernel void @s_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 {
; SI-LABEL: s_insertelement_v2bf16_0:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dword s4, s[2:3], 0x0
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_and_b32 s4, s4, 0xffff0000
; SI-NEXT:    s_or_b32 s4, s4, 0x40a0
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_insertelement_v2bf16_0:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_and_b32 s0, s2, 0xffff0000
; VI-NEXT:    s_or_b32 s0, s0, 0x40a0
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX900-LABEL: s_insertelement_v2bf16_0:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX900-NEXT:    v_mov_b32_e32 v0, 0
; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
; GFX900-NEXT:    s_load_dword s2, s[2:3], 0x0
; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
; GFX900-NEXT:    s_lshr_b32 s2, s2, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s2, 0x40a0, s2
; GFX900-NEXT:    v_mov_b32_e32 v1, s2
; GFX900-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX900-NEXT:    s_endpgm
;
; GFX940-LABEL: s_insertelement_v2bf16_0:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX940-NEXT:    v_mov_b32_e32 v0, 0
; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NEXT:    s_load_dword s2, s[2:3], 0x0
; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NEXT:    s_lshr_b32 s2, s2, 16
; GFX940-NEXT:    s_pack_ll_b32_b16 s2, 0x40a0, s2
; GFX940-NEXT:    v_mov_b32_e32 v1, s2
; GFX940-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
; GFX940-NEXT:    s_endpgm
  %vec = load <2 x bfloat>, ptr addrspace(4) %vec.ptr
  %vecins = insertelement <2 x bfloat> %vec, bfloat 5.000000e+00, i32 0
  store <2 x bfloat> %vecins, ptr addrspace(1) %out
  ret void
}

; Same as the kernel above, but the constant bfloat 5.0 is inserted at index 1.
define amdgpu_kernel void @s_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 {
; SI-LABEL: s_insertelement_v2bf16_1:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dword s4, s[2:3], 0x0
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_and_b32 s4, s4, 0xffff
; SI-NEXT:    s_or_b32 s4, s4, 0x40a00000
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_insertelement_v2bf16_1:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_and_b32 s0, s2, 0xffff
; VI-NEXT:    s_or_b32 s0, s0, 0x40a00000
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX900-LABEL: s_insertelement_v2bf16_1:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX900-NEXT:    v_mov_b32_e32 v0, 0
; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
; GFX900-NEXT:    s_load_dword s2, s[2:3], 0x0
; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
; GFX900-NEXT:    s_pack_ll_b32_b16 s2, s2, 0x40a0
; GFX900-NEXT:    v_mov_b32_e32 v1, s2
; GFX900-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX900-NEXT:    s_endpgm
;
; GFX940-LABEL: s_insertelement_v2bf16_1:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX940-NEXT:    v_mov_b32_e32 v0, 0
; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NEXT:    s_load_dword s2, s[2:3], 0x0
; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NEXT:    s_pack_ll_b32_b16 s2, s2, 0x40a0
; GFX940-NEXT:    v_mov_b32_e32 v1, s2
; GFX940-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
; GFX940-NEXT:    s_endpgm
  %vec = load <2 x bfloat>, ptr addrspace(4) %vec.ptr
  %vecins = insertelement <2 x bfloat> %vec, bfloat 5.000000e+00, i32 1
  store <2 x bfloat> %vecins, ptr addrspace(1) %out
  ret void
}

; Indexes %in and %out by the sign-extended workitem id x, loads a
; <2 x bfloat> from %in, inserts the constant bfloat 5.0 at index 0 and stores
; the result to the matching slot of %out.
define amdgpu_kernel void @v_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: v_insertelement_v2bf16_0:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; SI-NEXT:    s_mov_b32 s7, 0x100f000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
; SI-NEXT:    v_or_b32_e32 v2, 0x40a0, v2
; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_insertelement_v2bf16_0:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v3, v[0:1]
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
; VI-NEXT:    v_or_b32_e32 v2, 0x40a0, v2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX900-LABEL: v_insertelement_v2bf16_0:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX900-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX900-NEXT:    v_mov_b32_e32 v2, 0x40a0
; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
; GFX900-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX900-NEXT:    s_mov_b32 s2, 0xffff
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    v_bfi_b32 v1, s2, v2, v1
; GFX900-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX900-NEXT:    s_endpgm
;
; GFX940-LABEL: v_insertelement_v2bf16_0:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX940-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX940-NEXT:    v_mov_b32_e32 v2, 0x40a0
; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX940-NEXT:    s_mov_b32 s2, 0xffff
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_bfi_b32 v1, s2, v2, v1
; GFX940-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
; GFX940-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x bfloat>, ptr addrspace(1) %out, i64 %tid.ext
  %vec = load <2 x bfloat>, ptr addrspace(1) %in.gep
  %vecins = insertelement <2 x bfloat> %vec, bfloat 5.000000e+00, i32 0
  store <2 x bfloat> %vecins, ptr addrspace(1) %out.gep
  ret void
}

; Per-workitem insert at index 0 of the bfloat whose bit pattern is 0xR0035;
; the expected output below uses that value as the operand 53.
define amdgpu_kernel void @v_insertelement_v2bf16_0_inlineimm(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: v_insertelement_v2bf16_0_inlineimm:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; SI-NEXT:    s_mov_b32 s7, 0x100f000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
; SI-NEXT:    v_or_b32_e32 v2, 53, v2
; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_insertelement_v2bf16_0_inlineimm:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v3, v[0:1]
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
; VI-NEXT:    v_or_b32_e32 v2, 53, v2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX900-LABEL: v_insertelement_v2bf16_0_inlineimm:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX900-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
; GFX900-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX900-NEXT:    s_mov_b32 s2, 0xffff
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    v_bfi_b32 v1, s2, 53, v1
; GFX900-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX900-NEXT:    s_endpgm
;
; GFX940-LABEL: v_insertelement_v2bf16_0_inlineimm:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX940-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX940-NEXT:    s_mov_b32 s2, 0xffff
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_bfi_b32 v1, s2, 53, v1
; GFX940-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
; GFX940-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x bfloat>, ptr addrspace(1) %out, i64 %tid.ext
  %vec = load <2 x bfloat>, ptr addrspace(1) %in.gep
  %vecins = insertelement <2 x bfloat> %vec, bfloat 0xR0035, i32 0
  store <2 x bfloat> %vecins, ptr addrspace(1) %out.gep
  ret void
}

; Per-workitem insert of the constant bfloat 5.0 at index 1 of a <2 x bfloat>
; loaded from %in; the result is stored to the matching slot of %out.
define amdgpu_kernel void @v_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: v_insertelement_v2bf16_1:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; SI-NEXT:    s_mov_b32 s7, 0x100f000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT:    v_or_b32_e32 v2, 0x40a00000, v2
; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_insertelement_v2bf16_1:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v3, v[0:1]
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT:    v_mov_b32_e32 v2, 0x40a00000
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX900-LABEL: v_insertelement_v2bf16_1:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX900-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX900-NEXT:    v_mov_b32_e32 v2, 0x5040100
; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
; GFX900-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX900-NEXT:    s_movk_i32 s2, 0x40a0
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    v_perm_b32 v1, s2, v1, v2
; GFX900-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX900-NEXT:    s_endpgm
;
; GFX940-LABEL: v_insertelement_v2bf16_1:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX940-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX940-NEXT:    v_mov_b32_e32 v2, 0x5040100
; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX940-NEXT:    s_movk_i32 s2, 0x40a0
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_perm_b32 v1, s2, v1, v2
; GFX940-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
; GFX940-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x bfloat>, ptr addrspace(1) %out, i64 %tid.ext
  %vec = load <2 x bfloat>, ptr addrspace(1) %in.gep
  %vecins = insertelement <2 x bfloat> %vec, bfloat 5.000000e+00, i32 1
  store <2 x bfloat> %vecins, ptr addrspace(1) %out.gep
  ret void
}

; Per-workitem insert at index 1 of the bfloat whose bit pattern is 0xR0023;
; the expected output below uses that value as 35 or as 0x230000.
define amdgpu_kernel void @v_insertelement_v2bf16_1_inlineimm(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: v_insertelement_v2bf16_1_inlineimm:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; SI-NEXT:    s_mov_b32 s7, 0x100f000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT:    v_or_b32_e32 v2, 0x230000, v2
; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_insertelement_v2bf16_1_inlineimm:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v3, v[0:1]
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT:    v_mov_b32_e32 v2, 0x230000
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX900-LABEL: v_insertelement_v2bf16_1_inlineimm:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX900-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX900-NEXT:    v_mov_b32_e32 v2, 0x5040100
; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
; GFX900-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    v_perm_b32 v1, 35, v1, v2
; GFX900-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX900-NEXT:    s_endpgm
;
; GFX940-LABEL: v_insertelement_v2bf16_1_inlineimm:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX940-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX940-NEXT:    v_mov_b32_e32 v2, 0x5040100
; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_perm_b32 v1, 35, v1, v2
; GFX940-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
; GFX940-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x bfloat>, ptr addrspace(1) %out, i64 %tid.ext
  %vec = load <2 x bfloat>, ptr addrspace(1) %in.gep
  %vecins = insertelement <2 x bfloat> %vec, bfloat 0xR0023, i32 1
  store <2 x bfloat> %vecins, ptr addrspace(1) %out.gep
  ret void
}

; Per-workitem insert of the bfloat with bit pattern 0xR1234 at a dynamic
; index: the i32 index is loaded from %idx.ptr at the same workitem offset that
; is used for %in and %out.
define amdgpu_kernel void @v_insertelement_v2bf16_dynamic_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(1) %idx.ptr) #0 {
; SI-LABEL: v_insertelement_v2bf16_dynamic_vgpr:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; SI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x4
; SI-NEXT:    s_mov_b32 s11, 0x100f000
; SI-NEXT:    s_mov_b32 s10, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b64 s[6:7], s[10:11]
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
; SI-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
; SI-NEXT:    s_mov_b32 s4, 0x12341234
; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
; SI-NEXT:    v_lshl_b32_e32 v2, 0xffff, v2
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_bfi_b32 v2, v2, s4, v3
; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_insertelement_v2bf16_dynamic_vgpr:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v3, s3
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v4, v[0:1]
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_dword v3, v[0:1]
; VI-NEXT:    s_mov_b32 s2, 0xffff
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    s_mov_b32 s0, 0x12341234
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_lshlrev_b32_e32 v2, 4, v4
; VI-NEXT:    v_lshlrev_b32_e64 v2, v2, s2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_bfi_b32 v2, v2, s0, v3
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX900-LABEL: v_insertelement_v2bf16_dynamic_vgpr:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
; GFX900-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX900-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
; GFX900-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX900-NEXT:    global_load_dword v2, v0, s[2:3]
; GFX900-NEXT:    s_mov_b32 s2, 0xffff
; GFX900-NEXT:    s_waitcnt vmcnt(1)
; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
; GFX900-NEXT:    v_lshlrev_b32_e64 v1, v1, s2
; GFX900-NEXT:    s_mov_b32 s2, 0x12341234
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    v_bfi_b32 v1, v1, s2, v2
; GFX900-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX900-NEXT:    s_endpgm
;
; GFX940-LABEL: v_insertelement_v2bf16_dynamic_vgpr:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
; GFX940-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX940-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NEXT:    global_load_dword v1, v0, s[6:7]
; GFX940-NEXT:    global_load_dword v2, v0, s[2:3]
; GFX940-NEXT:    s_mov_b32 s2, 0xffff
; GFX940-NEXT:    s_waitcnt vmcnt(1)
; GFX940-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
; GFX940-NEXT:    v_lshlrev_b32_e64 v1, v1, s2
; GFX940-NEXT:    s_mov_b32 s2, 0x12341234
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_bfi_b32 v1, v1, s2, v2
; GFX940-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
; GFX940-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext
  %idx.gep = getelementptr inbounds i32, ptr addrspace(1) %idx.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x bfloat>, ptr addrspace(1) %out, i64 %tid.ext
  %idx = load i32, ptr addrspace(1) %idx.gep
  %vec = load <2 x bfloat>, ptr addrspace(1) %in.gep
  %vecins = insertelement <2 x bfloat> %vec, bfloat 0xR1234, i32 %idx
  store <2 x bfloat> %vecins, ptr addrspace(1) %out.gep
  ret void
}

; Per-workitem insert at index 0 of a <4 x bfloat>. The inserted value is the
; i32 kernel argument %val truncated to i16 and bitcast to bfloat. An unnamed
; [8 x i32] argument precedes %val in the signature.
define amdgpu_kernel void @v_insertelement_v4bf16_0(ptr addrspace(1) %out, ptr addrspace(1) %in, [8 x i32], i32 %val) #0 {
; SI-LABEL: v_insertelement_v4bf16_0:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; SI-NEXT:    s_mov_b32 s7, 0x100f000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dword s8, s[8:9], 0xc
; SI-NEXT:    s_mov_b32 s4, 0xffff
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v4, s8
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_bfi_b32 v2, s4, v4, v2
; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_insertelement_v4bf16_0:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT:    s_load_dword s4, s[8:9], 0x30
; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT:    v_mov_b32_e32 v4, 0x3020504
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_perm_b32 v0, s4, v0, v4
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
;
; GFX900-LABEL: v_insertelement_v4bf16_0:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX900-NEXT:    s_load_dword s4, s[8:9], 0x30
; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
; GFX900-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX900-NEXT:    s_mov_b32 s2, 0xffff
; GFX900-NEXT:    v_mov_b32_e32 v3, s4
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    v_bfi_b32 v0, s2, v3, v0
; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX900-NEXT:    s_endpgm
;
; GFX940-LABEL: v_insertelement_v4bf16_0:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX940-NEXT:    s_load_dword s6, s[4:5], 0x30
; GFX940-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX940-NEXT:    s_mov_b32 s2, 0xffff
; GFX940-NEXT:    v_mov_b32_e32 v3, s6
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_bfi_b32 v0, s2, v3, v0
; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %out, i64 %tid.ext
  %vec = load <4 x bfloat>, ptr addrspace(1) %in.gep
  %val.trunc = trunc i32 %val to i16
  %val.cvt = bitcast i16 %val.trunc to bfloat
  %vecins = insertelement <4 x bfloat> %vec, bfloat %val.cvt, i32 0
  store <4 x bfloat> %vecins, ptr addrspace(1) %out.gep
  ret void
}

; Per-workitem insert at index 1 of a <4 x bfloat>. The inserted value is the
; i32 kernel argument %val truncated to i16 and bitcast to bfloat; here %val
; directly follows %in in the signature.
define amdgpu_kernel void @v_insertelement_v4bf16_1(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) #0 {
; SI-LABEL: v_insertelement_v4bf16_1:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; SI-NEXT:    s_mov_b32 s7, 0x100f000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dword s8, s[8:9], 0x4
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshl_b32 s4, s8, 16
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT:    v_or_b32_e32 v2, s4, v2
; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_insertelement_v4bf16_1:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT:    s_load_dword s4, s[8:9], 0x10
; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT:    v_mov_b32_e32 v4, 0x1000504
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_perm_b32 v0, v0, s4, v4
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
;
; GFX900-LABEL: v_insertelement_v4bf16_1:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX900-NEXT:    s_load_dword s4, s[8:9], 0x10
; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX900-NEXT:    v_mov_b32_e32 v3, 0x5040100
; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
; GFX900-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    v_perm_b32 v0, s4, v0, v3
; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX900-NEXT:    s_endpgm
;
; GFX940-LABEL: v_insertelement_v4bf16_1:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX940-NEXT:    s_load_dword s6, s[4:5], 0x10
; GFX940-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX940-NEXT:    v_mov_b32_e32 v3, 0x5040100
; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_perm_b32 v0, s6, v0, v3
; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %out, i64 %tid.ext
  %vec = load <4 x bfloat>, ptr addrspace(1) %in.gep
  %val.trunc = trunc i32 %val to i16
  %val.cvt = bitcast i16 %val.trunc to bfloat
  %vecins = insertelement <4 x bfloat> %vec, bfloat %val.cvt, i32 1
  store <4 x bfloat> %vecins, ptr addrspace(1) %out.gep
  ret void
}

; Per-workitem insert at index 2 of a <4 x bfloat>. The inserted value is the
; i32 kernel argument %val truncated to i16 and bitcast to bfloat. An unnamed
; [8 x i32] argument precedes %val in the signature.
define amdgpu_kernel void @v_insertelement_v4bf16_2(ptr addrspace(1) %out, ptr addrspace(1) %in, [8 x i32], i32 %val) #0 {
; SI-LABEL: v_insertelement_v4bf16_2:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; SI-NEXT:    s_mov_b32 s7, 0x100f000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dword s8, s[8:9], 0xc
; SI-NEXT:    s_mov_b32 s4, 0xffff
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v4, s8
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_bfi_b32 v3, s4, v4, v3
; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_insertelement_v4bf16_2:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT:    s_load_dword s4, s[8:9], 0x30
; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT:    v_mov_b32_e32 v4, 0x3020504
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_perm_b32 v1, s4, v1, v4
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
;
; GFX900-LABEL: v_insertelement_v4bf16_2:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX900-NEXT:    s_load_dword s4, s[8:9], 0x30
; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
; GFX900-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX900-NEXT:    s_mov_b32 s2, 0xffff
; GFX900-NEXT:    v_mov_b32_e32 v3, s4
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    v_bfi_b32 v1, s2, v3, v1
; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX900-NEXT:    s_endpgm
;
; GFX940-LABEL: v_insertelement_v4bf16_2:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX940-NEXT:    s_load_dword s6, s[4:5], 0x30
; GFX940-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX940-NEXT:    s_mov_b32 s2, 0xffff
; GFX940-NEXT:    v_mov_b32_e32 v3, s6
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_bfi_b32 v1, s2, v3, v1
; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %out, i64 %tid.ext
  %vec = load <4 x bfloat>, ptr addrspace(1) %in.gep
  %val.trunc = trunc i32 %val to i16
  %val.cvt = bitcast i16 %val.trunc to bfloat
  %vecins = insertelement <4 x bfloat> %vec, bfloat %val.cvt, i32 2
  store <4 x bfloat> %vecins, ptr addrspace(1) %out.gep
  ret void
}

; Per-workitem insert at index 3 of a <4 x bfloat>. The inserted value is the
; i32 kernel argument %val truncated to i16 and bitcast to bfloat; here %val
; directly follows %in in the signature.
define amdgpu_kernel void @v_insertelement_v4bf16_3(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) #0 {
; SI-LABEL: v_insertelement_v4bf16_3:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; SI-NEXT:    s_mov_b32 s7, 0x100f000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dword s8, s[8:9], 0x4
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshl_b32 s4, s8, 16
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
; SI-NEXT:    v_or_b32_e32 v3, s4, v3
; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_insertelement_v4bf16_3:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT:    s_load_dword s4, s[8:9], 0x10
; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT:    v_mov_b32_e32 v4, 0x1000504
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_perm_b32 v1, v1, s4, v4
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
;
; GFX900-LABEL: v_insertelement_v4bf16_3:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX900-NEXT:    s_load_dword s4, s[8:9], 0x10
; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX900-NEXT:    v_mov_b32_e32 v3, 0x5040100
; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
; GFX900-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    v_perm_b32 v1, s4, v1, v3
; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX900-NEXT:    s_endpgm
;
; GFX940-LABEL: v_insertelement_v4bf16_3:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX940-NEXT:    s_load_dword s6, s[4:5], 0x10
; GFX940-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX940-NEXT:    v_mov_b32_e32 v3, 0x5040100
; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_perm_b32 v1, s6, v1, v3
; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %out, i64 %tid.ext
  %vec = load <4 x bfloat>, ptr addrspace(1) %in.gep
  %val.trunc = trunc i32 %val to i16
  %val.cvt = bitcast i16 %val.trunc to bfloat
  %vecins = insertelement <4 x bfloat> %vec, bfloat %val.cvt, i32 3
  store <4 x bfloat> %vecins, ptr addrspace(1) %out.gep
  ret void
}

define amdgpu_kernel void @v_insertelement_v4bf16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %idxval) #0 {
; SI-LABEL: v_insertelement_v4bf16_dynamic_sgpr:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; SI-NEXT:    s_mov_b32 s7, 0x100f000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx2 s[8:9], s[8:9], 0x4
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshl_b32 s4, s8, 16
; SI-NEXT:    s_and_b32 s5, s8, 0xffff
; SI-NEXT:    s_lshl_b32 s6, s9, 4
; SI-NEXT:    s_or_b32 s7, s5, s4
; SI-NEXT:    s_lshl_b64 s[4:5], 0xffff, s6
; SI-NEXT:    v_mov_b32_e32 v4, s7
; SI-NEXT:    v_mov_b32_e32 v5, s7
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_bfi_b32 v3, s5, v4, v3
; SI-NEXT:    v_bfi_b32 v2, s4, v5, v2
; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_insertelement_v4bf16_dynamic_sgpr:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    s_lshl_b32 s1, s4, 16
; VI-NEXT:    s_and_b32 s2, s4, 0xffff
; VI-NEXT:    s_lshl_b32 s3, s5, 4
; VI-NEXT:    s_or_b32 s2, s2, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT:    s_lshl_b64 s[0:1], 0xffff,
s3 868; VI-NEXT: v_mov_b32_e32 v4, s2 869; VI-NEXT: v_mov_b32_e32 v5, s2 870; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 871; VI-NEXT: s_waitcnt vmcnt(0) 872; VI-NEXT: v_bfi_b32 v1, s1, v4, v1 873; VI-NEXT: v_bfi_b32 v0, s0, v5, v0 874; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 875; VI-NEXT: s_endpgm 876; 877; GFX900-LABEL: v_insertelement_v4bf16_dynamic_sgpr: 878; GFX900: ; %bb.0: 879; GFX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 880; GFX900-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 881; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 882; GFX900-NEXT: s_waitcnt lgkmcnt(0) 883; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] 884; GFX900-NEXT: s_lshl_b32 s2, s5, 4 885; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s4 886; GFX900-NEXT: s_lshl_b64 s[2:3], 0xffff, s2 887; GFX900-NEXT: v_mov_b32_e32 v3, s4 888; GFX900-NEXT: v_mov_b32_e32 v4, s4 889; GFX900-NEXT: s_waitcnt vmcnt(0) 890; GFX900-NEXT: v_bfi_b32 v1, s3, v3, v1 891; GFX900-NEXT: v_bfi_b32 v0, s2, v4, v0 892; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 893; GFX900-NEXT: s_endpgm 894; 895; GFX940-LABEL: v_insertelement_v4bf16_dynamic_sgpr: 896; GFX940: ; %bb.0: 897; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 898; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 899; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 900; GFX940-NEXT: v_lshlrev_b32_e32 v2, 3, v0 901; GFX940-NEXT: s_waitcnt lgkmcnt(0) 902; GFX940-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] 903; GFX940-NEXT: s_lshl_b32 s2, s7, 4 904; GFX940-NEXT: s_pack_ll_b32_b16 s4, s6, s6 905; GFX940-NEXT: s_lshl_b64 s[2:3], 0xffff, s2 906; GFX940-NEXT: v_mov_b32_e32 v3, s4 907; GFX940-NEXT: v_mov_b32_e32 v4, s4 908; GFX940-NEXT: s_waitcnt vmcnt(0) 909; GFX940-NEXT: v_bfi_b32 v1, s3, v3, v1 910; GFX940-NEXT: v_bfi_b32 v0, s2, v4, v0 911; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 912; GFX940-NEXT: s_endpgm 913 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 914 %tid.ext = sext i32 %tid to i64 915 %in.gep = getelementptr inbounds <4 x 
bfloat>, ptr addrspace(1) %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %out, i64 %tid.ext
  %vec = load <4 x bfloat>, ptr addrspace(1) %in.gep
  ; trunc + bitcast: keep the low 16 bits of %val and reinterpret them as a
  ; bfloat bit pattern (no numeric conversion is performed).
  %val.trunc = trunc i32 %val to i16
  %val.cvt = bitcast i16 %val.trunc to bfloat
  ; The lane index %idxval is a kernel argument, not a constant.
  %vecins = insertelement <4 x bfloat> %vec, bfloat %val.cvt, i32 %idxval
  store <4 x bfloat> %vecins, ptr addrspace(1) %out.gep
  ret void
}

; Constant-index insert into a <8 x bfloat>.
; The value returned by llvm.amdgcn.workitem.id.x (sign-extended to i64)
; indexes both %in and %out. One <8 x bfloat> is loaded, element 3 is
; overwritten with the low 16 bits of %val reinterpreted as bfloat, and the
; vector is stored back out.
; NOTE(review): unlike the <4 x bfloat> kernels earlier in this file, this
; definition does not reference attribute group #0 - presumably intentional,
; confirm before changing (the assertions would need regenerating).
; The per-target assertion lines inside the function are autogenerated (see
; the first line of this file); regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
define amdgpu_kernel void @v_insertelement_v8bf16_3(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) {
; SI-LABEL: v_insertelement_v8bf16_3:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; SI-NEXT: s_mov_b32 s7, 0x100f000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; SI-NEXT: v_mov_b32_e32 v5, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64
; SI-NEXT: s_load_dword s8, s[8:9], 0x4
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshl_b32 s4, s8, 16
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: v_or_b32_e32 v1, s4, v1
; SI-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_insertelement_v8bf16_3:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
; VI-NEXT: s_lshl_b32 s0, s4, 16
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v6, s0
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX900-LABEL: v_insertelement_v8bf16_3:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX900-NEXT: s_load_dword s4, s[8:9], 0x10
; GFX900-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; GFX900-NEXT: v_mov_b32_e32 v5, 0x5040100
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_perm_b32 v1, s4, v1, v5
; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX900-NEXT: s_endpgm
;
; GFX940-LABEL: v_insertelement_v8bf16_3:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX940-NEXT: s_load_dword s6, s[4:5], 0x10
; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX940-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; GFX940-NEXT: v_mov_b32_e32 v5, 0x5040100
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_perm_b32 v1, s6, v1, v5
; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_endpgm
  ; Address element %tid of %in and of %out (each element is a <8 x bfloat>).
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <8 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <8 x bfloat>, ptr addrspace(1) %out, i64 %tid.ext
  %vec = load <8 x bfloat>, ptr addrspace(1) %in.gep
  ; trunc + bitcast: keep the low 16 bits of %val and reinterpret them as a
  ; bfloat bit pattern (no numeric conversion is performed).
  %val.trunc = trunc i32 %val to i16
  %val.cvt = bitcast i16 %val.trunc to bfloat
  ; The lane index is the constant 3.
  %vecins = insertelement <8 x bfloat> %vec, bfloat %val.cvt, i32 3
  store <8 x bfloat> %vecins, ptr addrspace(1) %out.gep
  ret void
}

; Variable-index insert into a <8 x bfloat>: the lane number is the kernel
; argument %n instead of a constant. The vector is selected by the value of
; llvm.amdgcn.workitem.id.x, and the inserted value is the low 16 bits of
; %val reinterpreted as bfloat.
define amdgpu_kernel void @v_insertelement_v8bf16_dynamic(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %n) {
; SI-LABEL: v_insertelement_v8bf16_dynamic:
; SI: ; %bb.0:
; SI-NEXT:
s_load_dwordx4 s[0:3], s[8:9], 0x0 1008; SI-NEXT: s_mov_b32 s7, 0x100f000 1009; SI-NEXT: s_mov_b32 s6, 0 1010; SI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 1011; SI-NEXT: v_mov_b32_e32 v5, 0 1012; SI-NEXT: s_waitcnt lgkmcnt(0) 1013; SI-NEXT: s_mov_b64 s[4:5], s[2:3] 1014; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 1015; SI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x4 1016; SI-NEXT: s_mov_b64 s[2:3], s[6:7] 1017; SI-NEXT: s_waitcnt lgkmcnt(0) 1018; SI-NEXT: s_cmp_eq_u32 s9, 6 1019; SI-NEXT: v_mov_b32_e32 v6, s8 1020; SI-NEXT: s_cselect_b64 vcc, -1, 0 1021; SI-NEXT: s_cmp_eq_u32 s9, 7 1022; SI-NEXT: s_waitcnt vmcnt(0) 1023; SI-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc 1024; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 1025; SI-NEXT: s_cselect_b64 vcc, -1, 0 1026; SI-NEXT: s_cmp_eq_u32 s9, 4 1027; SI-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 1028; SI-NEXT: s_cselect_b64 vcc, -1, 0 1029; SI-NEXT: s_cmp_eq_u32 s9, 5 1030; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 1031; SI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc 1032; SI-NEXT: s_cselect_b64 vcc, -1, 0 1033; SI-NEXT: s_cmp_eq_u32 s9, 2 1034; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 1035; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 1036; SI-NEXT: v_cndmask_b32_e32 v8, v8, v6, vcc 1037; SI-NEXT: s_cselect_b64 vcc, -1, 0 1038; SI-NEXT: s_cmp_eq_u32 s9, 3 1039; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 1040; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 1041; SI-NEXT: v_or_b32_e32 v3, v7, v3 1042; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 1043; SI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc 1044; SI-NEXT: s_cselect_b64 vcc, -1, 0 1045; SI-NEXT: s_cmp_eq_u32 s9, 0 1046; SI-NEXT: v_or_b32_e32 v2, v2, v7 1047; SI-NEXT: v_cndmask_b32_e32 v7, v9, v6, vcc 1048; SI-NEXT: s_cselect_b64 vcc, -1, 0 1049; SI-NEXT: s_cmp_eq_u32 s9, 1 1050; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 1051; SI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc 1052; SI-NEXT: s_cselect_b64 vcc, -1, 0 1053; SI-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc 1054; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 1055; SI-NEXT: 
v_lshlrev_b32_e32 v7, 16, v7 1056; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 1057; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 1058; SI-NEXT: v_or_b32_e32 v1, v1, v7 1059; SI-NEXT: v_or_b32_e32 v0, v0, v6 1060; SI-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64 1061; SI-NEXT: s_endpgm 1062; 1063; VI-LABEL: v_insertelement_v8bf16_dynamic: 1064; VI: ; %bb.0: 1065; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1066; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 1067; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 1068; VI-NEXT: s_waitcnt lgkmcnt(0) 1069; VI-NEXT: v_mov_b32_e32 v1, s3 1070; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 1071; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1072; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 1073; VI-NEXT: v_mov_b32_e32 v5, s1 1074; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4 1075; VI-NEXT: s_cmp_eq_u32 s5, 6 1076; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc 1077; VI-NEXT: v_mov_b32_e32 v6, s4 1078; VI-NEXT: s_cselect_b64 vcc, -1, 0 1079; VI-NEXT: s_cmp_eq_u32 s5, 7 1080; VI-NEXT: s_waitcnt vmcnt(0) 1081; VI-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc 1082; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 1083; VI-NEXT: s_cselect_b64 vcc, -1, 0 1084; VI-NEXT: s_cmp_eq_u32 s5, 4 1085; VI-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 1086; VI-NEXT: s_cselect_b64 vcc, -1, 0 1087; VI-NEXT: s_cmp_eq_u32 s5, 5 1088; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 1089; VI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc 1090; VI-NEXT: s_cselect_b64 vcc, -1, 0 1091; VI-NEXT: s_cmp_eq_u32 s5, 2 1092; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 1093; VI-NEXT: v_cndmask_b32_e32 v8, v8, v6, vcc 1094; VI-NEXT: s_cselect_b64 vcc, -1, 0 1095; VI-NEXT: s_cmp_eq_u32 s5, 3 1096; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 1097; VI-NEXT: v_or_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1098; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 1099; VI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc 1100; VI-NEXT: s_cselect_b64 vcc, -1, 0 1101; VI-NEXT: s_cmp_eq_u32 s5, 0 1102; VI-NEXT: v_or_b32_sdwa 
v2, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1103; VI-NEXT: v_cndmask_b32_e32 v7, v9, v6, vcc 1104; VI-NEXT: s_cselect_b64 vcc, -1, 0 1105; VI-NEXT: s_cmp_eq_u32 s5, 1 1106; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 1107; VI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc 1108; VI-NEXT: s_cselect_b64 vcc, -1, 0 1109; VI-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc 1110; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 1111; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 1112; VI-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1113; VI-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1114; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 1115; VI-NEXT: s_endpgm 1116; 1117; GFX900-LABEL: v_insertelement_v8bf16_dynamic: 1118; GFX900: ; %bb.0: 1119; GFX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1120; GFX900-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 1121; GFX900-NEXT: v_lshlrev_b32_e32 v4, 4, v0 1122; GFX900-NEXT: s_waitcnt lgkmcnt(0) 1123; GFX900-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] 1124; GFX900-NEXT: s_cmp_eq_u32 s5, 6 1125; GFX900-NEXT: v_mov_b32_e32 v5, s4 1126; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 1127; GFX900-NEXT: s_cmp_eq_u32 s5, 7 1128; GFX900-NEXT: s_mov_b32 s2, 0x5040100 1129; GFX900-NEXT: s_waitcnt vmcnt(0) 1130; GFX900-NEXT: v_cndmask_b32_e32 v6, v3, v5, vcc 1131; GFX900-NEXT: v_lshrrev_b32_e32 v3, 16, v3 1132; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 1133; GFX900-NEXT: s_cmp_eq_u32 s5, 4 1134; GFX900-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 1135; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 1136; GFX900-NEXT: s_cmp_eq_u32 s5, 5 1137; GFX900-NEXT: v_lshrrev_b32_e32 v7, 16, v2 1138; GFX900-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 1139; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 1140; GFX900-NEXT: s_cmp_eq_u32 s5, 2 1141; GFX900-NEXT: v_perm_b32 v3, v3, v6, s2 1142; GFX900-NEXT: v_cndmask_b32_e32 v6, v7, v5, vcc 1143; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 1144; GFX900-NEXT: 
s_cmp_eq_u32 s5, 3 1145; GFX900-NEXT: v_lshrrev_b32_e32 v8, 16, v1 1146; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc 1147; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 1148; GFX900-NEXT: s_cmp_eq_u32 s5, 0 1149; GFX900-NEXT: v_perm_b32 v2, v6, v2, s2 1150; GFX900-NEXT: v_cndmask_b32_e32 v6, v8, v5, vcc 1151; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 1152; GFX900-NEXT: s_cmp_eq_u32 s5, 1 1153; GFX900-NEXT: v_lshrrev_b32_e32 v9, 16, v0 1154; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc 1155; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 1156; GFX900-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc 1157; GFX900-NEXT: v_perm_b32 v1, v6, v1, s2 1158; GFX900-NEXT: v_perm_b32 v0, v5, v0, s2 1159; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 1160; GFX900-NEXT: s_endpgm 1161; 1162; GFX940-LABEL: v_insertelement_v8bf16_dynamic: 1163; GFX940: ; %bb.0: 1164; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1165; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 1166; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1167; GFX940-NEXT: v_lshlrev_b32_e32 v4, 4, v0 1168; GFX940-NEXT: s_waitcnt lgkmcnt(0) 1169; GFX940-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] 1170; GFX940-NEXT: s_cmp_eq_u32 s7, 6 1171; GFX940-NEXT: v_mov_b32_e32 v5, s6 1172; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 1173; GFX940-NEXT: s_cmp_eq_u32 s7, 7 1174; GFX940-NEXT: s_mov_b32 s2, 0x5040100 1175; GFX940-NEXT: s_waitcnt vmcnt(0) 1176; GFX940-NEXT: v_cndmask_b32_e32 v6, v3, v5, vcc 1177; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3 1178; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 1179; GFX940-NEXT: s_cmp_eq_u32 s7, 4 1180; GFX940-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 1181; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 1182; GFX940-NEXT: s_cmp_eq_u32 s7, 5 1183; GFX940-NEXT: v_lshrrev_b32_e32 v7, 16, v2 1184; GFX940-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 1185; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 1186; GFX940-NEXT: s_cmp_eq_u32 s7, 2 1187; GFX940-NEXT: v_perm_b32 v3, v3, v6, s2 1188; GFX940-NEXT: v_cndmask_b32_e32 v6, v7, v5, vcc 1189; GFX940-NEXT: 
s_cselect_b64 vcc, -1, 0
; GFX940-NEXT: s_cmp_eq_u32 s7, 3
; GFX940-NEXT: v_lshrrev_b32_e32 v8, 16, v1
; GFX940-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
; GFX940-NEXT: s_cselect_b64 vcc, -1, 0
; GFX940-NEXT: s_cmp_eq_u32 s7, 0
; GFX940-NEXT: v_perm_b32 v2, v6, v2, s2
; GFX940-NEXT: v_cndmask_b32_e32 v6, v8, v5, vcc
; GFX940-NEXT: s_cselect_b64 vcc, -1, 0
; GFX940-NEXT: s_cmp_eq_u32 s7, 1
; GFX940-NEXT: v_lshrrev_b32_e32 v9, 16, v0
; GFX940-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
; GFX940-NEXT: s_cselect_b64 vcc, -1, 0
; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc
; GFX940-NEXT: v_perm_b32 v1, v6, v1, s2
; GFX940-NEXT: v_perm_b32 v0, v5, v0, s2
; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_endpgm
  ; Address element %tid of %in and of %out (each element is a <8 x bfloat>).
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <8 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <8 x bfloat>, ptr addrspace(1) %out, i64 %tid.ext
  %vec = load <8 x bfloat>, ptr addrspace(1) %in.gep
  ; trunc + bitcast: keep the low 16 bits of %val and reinterpret them as a
  ; bfloat bit pattern (no numeric conversion is performed).
  %val.trunc = trunc i32 %val to i16
  %val.cvt = bitcast i16 %val.trunc to bfloat
  ; The lane index %n is a kernel argument, not a constant.
  %vecins = insertelement <8 x bfloat> %vec, bfloat %val.cvt, i32 %n
  store <8 x bfloat> %vecins, ptr addrspace(1) %out.gep
  ret void
}

; Constant-index insert into a <16 x bfloat>.
; The value returned by llvm.amdgcn.workitem.id.x (sign-extended to i64)
; indexes both %in and %out. One <16 x bfloat> is loaded, element 3 is
; overwritten with the low 16 bits of %val reinterpreted as bfloat, and the
; vector is stored back out.
; NOTE(review): unlike the <4 x bfloat> kernels earlier in this file, this
; definition does not reference attribute group #0 - presumably intentional,
; confirm before changing (the assertions would need regenerating).
; The per-target assertion lines inside the function are autogenerated (see
; the first line of this file); regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
define amdgpu_kernel void @v_insertelement_v16bf16_3(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) {
; SI-LABEL: v_insertelement_v16bf16_3:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; SI-NEXT: s_mov_b32 s7, 0x100f000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v8, 5, v0
; SI-NEXT: v_mov_b32_e32 v9, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64
; SI-NEXT: buffer_load_dwordx4 v[4:7], v[8:9], s[4:7], 0 addr64 offset:16
; SI-NEXT: s_load_dword s8, s[8:9], 0x4
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshl_b32 s4, s8, 16
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: v_or_b32_e32 v1, s4, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dwordx4 v[4:7], v[8:9], s[0:3], 0 addr64 offset:16
; SI-NEXT: buffer_store_dwordx4 v[0:3], v[8:9], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_insertelement_v16bf16_3:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v4, vcc, 16, v0
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; VI-NEXT: v_mov_b32_e32 v9, s1
; VI-NEXT: v_add_u32_e32 v8, vcc, s0, v8
; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
; VI-NEXT: s_lshl_b32 s1, s4, 16
; VI-NEXT: v_add_u32_e32 v10, vcc, 16, v8
; VI-NEXT: v_mov_b32_e32 v12, s1
; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v9, vcc
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v1, v1, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX900-LABEL: v_insertelement_v16bf16_3:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX900-NEXT: s_load_dword s4, s[8:9], 0x10
; GFX900-NEXT: v_lshlrev_b32_e32 v8, 5, v0
; GFX900-NEXT: v_mov_b32_e32 v9, 0x5040100
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3]
; GFX900-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:16
; GFX900-NEXT: s_waitcnt vmcnt(1)
; GFX900-NEXT: v_perm_b32 v1, s4, v1, v9
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
; GFX900-NEXT: s_endpgm
;
; GFX940-LABEL: v_insertelement_v16bf16_3:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX940-NEXT: s_load_dword s6, s[4:5], 0x10
; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX940-NEXT: v_lshlrev_b32_e32 v8, 5, v0
; GFX940-NEXT: v_mov_b32_e32 v9, 0x5040100
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3]
; GFX940-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:16
; GFX940-NEXT: s_waitcnt vmcnt(1)
; GFX940-NEXT: v_perm_b32 v1, s6, v1, v9
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1
; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_endpgm
  ; Address element %tid of %in and of %out (each element is a <16 x bfloat>).
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <16 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <16 x bfloat>, ptr addrspace(1) %out, i64 %tid.ext
  %vec = load <16 x bfloat>, ptr addrspace(1) %in.gep
  ; trunc + bitcast: keep the low 16 bits of %val and reinterpret them as a
  ; bfloat bit pattern (no numeric conversion is performed).
  %val.trunc = trunc i32 %val to i16
  %val.cvt = bitcast i16 %val.trunc to bfloat
  ; The lane index is the constant 3.
  %vecins = insertelement <16 x bfloat> %vec, bfloat %val.cvt, i32 3
  store <16 x bfloat> %vecins, ptr addrspace(1) %out.gep
  ret void
}

define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %n) {
; SI-LABEL: v_insertelement_v16bf16_dynamic:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[12:15], s[8:9], 0x0
; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x4
;
SI-NEXT: s_mov_b32 s3, 0x100f000 1320; SI-NEXT: s_mov_b32 s2, 0 1321; SI-NEXT: v_lshlrev_b32_e32 v4, 5, v0 1322; SI-NEXT: s_waitcnt lgkmcnt(0) 1323; SI-NEXT: s_mov_b64 s[0:1], s[14:15] 1324; SI-NEXT: v_mov_b32_e32 v5, 0 1325; SI-NEXT: buffer_load_dwordx4 v[7:10], v[4:5], s[0:3], 0 addr64 1326; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64 offset:16 1327; SI-NEXT: s_cmp_eq_u32 s7, 6 1328; SI-NEXT: v_mov_b32_e32 v6, s6 1329; SI-NEXT: s_cselect_b64 vcc, -1, 0 1330; SI-NEXT: s_cmp_eq_u32 s7, 7 1331; SI-NEXT: s_mov_b64 s[14:15], s[2:3] 1332; SI-NEXT: s_waitcnt vmcnt(1) 1333; SI-NEXT: v_cndmask_b32_e32 v11, v10, v6, vcc 1334; SI-NEXT: s_cselect_b64 vcc, -1, 0 1335; SI-NEXT: s_cmp_eq_u32 s7, 4 1336; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 1337; SI-NEXT: s_cmp_eq_u32 s7, 5 1338; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 1339; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v9 1340; SI-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[0:1] 1341; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 1342; SI-NEXT: s_cmp_eq_u32 s7, 2 1343; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 1344; SI-NEXT: s_cmp_eq_u32 s7, 3 1345; SI-NEXT: v_cndmask_b32_e32 v10, v10, v6, vcc 1346; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v8 1347; SI-NEXT: v_cndmask_b32_e64 v8, v8, v6, s[2:3] 1348; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 1349; SI-NEXT: s_cmp_eq_u32 s7, 0 1350; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 1351; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 1352; SI-NEXT: v_cndmask_b32_e64 v12, v12, v6, s[0:1] 1353; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 1354; SI-NEXT: v_or_b32_e32 v10, v11, v10 1355; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 1356; SI-NEXT: v_cndmask_b32_e64 v12, v13, v6, s[2:3] 1357; SI-NEXT: s_cmp_eq_u32 s7, 1 1358; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 1359; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 1360; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 1361; SI-NEXT: s_cselect_b64 vcc, -1, 0 1362; SI-NEXT: s_cmp_eq_u32 s7, 14 1363; SI-NEXT: v_cndmask_b32_e64 v7, v7, v6, s[4:5] 1364; SI-NEXT: v_or_b32_e32 v8, v8, v12 1365; SI-NEXT: 
v_cndmask_b32_e32 v12, v14, v6, vcc 1366; SI-NEXT: s_cselect_b64 vcc, -1, 0 1367; SI-NEXT: s_cmp_eq_u32 s7, 15 1368; SI-NEXT: s_waitcnt vmcnt(0) 1369; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 1370; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 1371; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 1372; SI-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 1373; SI-NEXT: s_cselect_b64 vcc, -1, 0 1374; SI-NEXT: s_cmp_eq_u32 s7, 12 1375; SI-NEXT: v_or_b32_e32 v7, v7, v12 1376; SI-NEXT: v_cndmask_b32_e32 v12, v15, v6, vcc 1377; SI-NEXT: s_cselect_b64 vcc, -1, 0 1378; SI-NEXT: s_cmp_eq_u32 s7, 13 1379; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 1380; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 1381; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 1382; SI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc 1383; SI-NEXT: s_cselect_b64 vcc, -1, 0 1384; SI-NEXT: s_cmp_eq_u32 s7, 10 1385; SI-NEXT: v_or_b32_e32 v3, v3, v12 1386; SI-NEXT: v_cndmask_b32_e32 v12, v16, v6, vcc 1387; SI-NEXT: s_cselect_b64 vcc, -1, 0 1388; SI-NEXT: s_cmp_eq_u32 s7, 11 1389; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 1390; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 1391; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 1392; SI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc 1393; SI-NEXT: s_cselect_b64 vcc, -1, 0 1394; SI-NEXT: s_cmp_eq_u32 s7, 8 1395; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 1396; SI-NEXT: v_or_b32_e32 v2, v2, v12 1397; SI-NEXT: v_cndmask_b32_e32 v12, v17, v6, vcc 1398; SI-NEXT: s_cselect_b64 vcc, -1, 0 1399; SI-NEXT: s_cmp_eq_u32 s7, 9 1400; SI-NEXT: v_or_b32_e32 v9, v9, v11 1401; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 1402; SI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc 1403; SI-NEXT: s_cselect_b64 vcc, -1, 0 1404; SI-NEXT: v_cndmask_b32_e32 v6, v11, v6, vcc 1405; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 1406; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 1407; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 1408; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 1409; SI-NEXT: v_or_b32_e32 v1, v1, v12 1410; SI-NEXT: v_or_b32_e32 v0, v0, v6 1411; SI-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], 
s[12:15], 0 addr64 offset:16 1412; SI-NEXT: buffer_store_dwordx4 v[7:10], v[4:5], s[12:15], 0 addr64 1413; SI-NEXT: s_endpgm 1414; 1415; VI-LABEL: v_insertelement_v16bf16_dynamic: 1416; VI: ; %bb.0: 1417; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1418; VI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x10 1419; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 1420; VI-NEXT: s_waitcnt lgkmcnt(0) 1421; VI-NEXT: v_mov_b32_e32 v0, s3 1422; VI-NEXT: v_add_u32_e32 v4, vcc, s2, v8 1423; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v0, vcc 1424; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v4 1425; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc 1426; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 1427; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] 1428; VI-NEXT: v_mov_b32_e32 v9, s1 1429; VI-NEXT: v_add_u32_e32 v8, vcc, s0, v8 1430; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc 1431; VI-NEXT: v_add_u32_e32 v10, vcc, 16, v8 1432; VI-NEXT: s_cmp_eq_u32 s7, 14 1433; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v9, vcc 1434; VI-NEXT: v_mov_b32_e32 v12, s6 1435; VI-NEXT: s_cselect_b64 vcc, -1, 0 1436; VI-NEXT: s_cmp_eq_u32 s7, 15 1437; VI-NEXT: s_waitcnt vmcnt(1) 1438; VI-NEXT: v_cndmask_b32_e32 v13, v3, v12, vcc 1439; VI-NEXT: s_cselect_b64 vcc, -1, 0 1440; VI-NEXT: s_cmp_eq_u32 s7, 12 1441; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 1442; VI-NEXT: s_cmp_eq_u32 s7, 13 1443; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 1444; VI-NEXT: v_cndmask_b32_e64 v2, v2, v12, s[0:1] 1445; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 1446; VI-NEXT: s_cmp_eq_u32 s7, 10 1447; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 1448; VI-NEXT: s_cmp_eq_u32 s7, 11 1449; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 1450; VI-NEXT: v_cndmask_b32_e64 v1, v1, v12, s[2:3] 1451; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 1452; VI-NEXT: s_cmp_eq_u32 s7, 8 1453; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 1454; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 1455; VI-NEXT: v_cndmask_b32_e64 v15, v15, v12, s[2:3] 1456; VI-NEXT: s_cmp_eq_u32 s7, 9 1457; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 1458; VI-NEXT: v_cndmask_b32_e32 
v3, v3, v12, vcc 1459; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 1460; VI-NEXT: s_cselect_b64 vcc, -1, 0 1461; VI-NEXT: s_cmp_eq_u32 s7, 6 1462; VI-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1463; VI-NEXT: v_cndmask_b32_e32 v15, v16, v12, vcc 1464; VI-NEXT: s_cselect_b64 vcc, -1, 0 1465; VI-NEXT: s_cmp_eq_u32 s7, 7 1466; VI-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[4:5] 1467; VI-NEXT: s_waitcnt vmcnt(0) 1468; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 1469; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 1470; VI-NEXT: v_cndmask_b32_e64 v14, v14, v12, s[0:1] 1471; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 1472; VI-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc 1473; VI-NEXT: s_cselect_b64 vcc, -1, 0 1474; VI-NEXT: s_cmp_eq_u32 s7, 4 1475; VI-NEXT: v_or_b32_sdwa v3, v13, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1476; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 1477; VI-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1478; VI-NEXT: v_cndmask_b32_e32 v15, v17, v12, vcc 1479; VI-NEXT: s_cselect_b64 vcc, -1, 0 1480; VI-NEXT: s_cmp_eq_u32 s7, 5 1481; VI-NEXT: v_or_b32_sdwa v2, v2, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1482; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 1483; VI-NEXT: v_cndmask_b32_e32 v6, v6, v12, vcc 1484; VI-NEXT: s_cselect_b64 vcc, -1, 0 1485; VI-NEXT: s_cmp_eq_u32 s7, 2 1486; VI-NEXT: v_cndmask_b32_e32 v13, v13, v12, vcc 1487; VI-NEXT: s_cselect_b64 vcc, -1, 0 1488; VI-NEXT: s_cmp_eq_u32 s7, 3 1489; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v5 1490; VI-NEXT: v_cndmask_b32_e32 v5, v5, v12, vcc 1491; VI-NEXT: s_cselect_b64 vcc, -1, 0 1492; VI-NEXT: s_cmp_eq_u32 s7, 0 1493; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 1494; VI-NEXT: v_cndmask_b32_e32 v14, v14, v12, vcc 1495; VI-NEXT: s_cselect_b64 vcc, -1, 0 1496; VI-NEXT: s_cmp_eq_u32 s7, 1 1497; VI-NEXT: v_or_b32_sdwa v6, v6, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD 1498; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v4 1499; VI-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc 1500; VI-NEXT: s_cselect_b64 vcc, -1, 0 1501; VI-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc 1502; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 1503; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 1504; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 1505; VI-NEXT: v_or_b32_sdwa v7, v7, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1506; VI-NEXT: v_or_b32_sdwa v5, v5, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1507; VI-NEXT: v_or_b32_sdwa v4, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1508; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] 1509; VI-NEXT: flat_store_dwordx4 v[10:11], v[0:3] 1510; VI-NEXT: s_endpgm 1511; 1512; GFX900-LABEL: v_insertelement_v16bf16_dynamic: 1513; GFX900: ; %bb.0: 1514; GFX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1515; GFX900-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 1516; GFX900-NEXT: v_lshlrev_b32_e32 v0, 5, v0 1517; GFX900-NEXT: s_waitcnt lgkmcnt(0) 1518; GFX900-NEXT: global_load_dwordx4 v[1:4], v0, s[2:3] 1519; GFX900-NEXT: global_load_dwordx4 v[5:8], v0, s[2:3] offset:16 1520; GFX900-NEXT: s_cmp_eq_u32 s5, 6 1521; GFX900-NEXT: v_mov_b32_e32 v9, s4 1522; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 1523; GFX900-NEXT: s_cmp_eq_u32 s5, 7 1524; GFX900-NEXT: s_mov_b32 s2, 0x5040100 1525; GFX900-NEXT: s_waitcnt vmcnt(1) 1526; GFX900-NEXT: v_cndmask_b32_e32 v10, v4, v9, vcc 1527; GFX900-NEXT: v_lshrrev_b32_e32 v4, 16, v4 1528; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 1529; GFX900-NEXT: s_cmp_eq_u32 s5, 4 1530; GFX900-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc 1531; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 1532; GFX900-NEXT: s_cmp_eq_u32 s5, 5 1533; GFX900-NEXT: v_lshrrev_b32_e32 v11, 16, v3 1534; GFX900-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc 1535; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 1536; GFX900-NEXT: s_cmp_eq_u32 s5, 2 1537; GFX900-NEXT: v_perm_b32 v4, v4, v10, s2 1538; 
GFX900-NEXT: v_cndmask_b32_e32 v10, v11, v9, vcc 1539; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 1540; GFX900-NEXT: s_cmp_eq_u32 s5, 3 1541; GFX900-NEXT: v_lshrrev_b32_e32 v12, 16, v2 1542; GFX900-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc 1543; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 1544; GFX900-NEXT: s_cmp_eq_u32 s5, 0 1545; GFX900-NEXT: v_cndmask_b32_e32 v11, v12, v9, vcc 1546; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 1547; GFX900-NEXT: s_cmp_eq_u32 s5, 1 1548; GFX900-NEXT: v_lshrrev_b32_e32 v13, 16, v1 1549; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc 1550; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 1551; GFX900-NEXT: s_cmp_eq_u32 s5, 14 1552; GFX900-NEXT: v_cndmask_b32_e32 v12, v13, v9, vcc 1553; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 1554; GFX900-NEXT: s_cmp_eq_u32 s5, 15 1555; GFX900-NEXT: s_waitcnt vmcnt(0) 1556; GFX900-NEXT: v_lshrrev_b32_e32 v14, 16, v8 1557; GFX900-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc 1558; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 1559; GFX900-NEXT: s_cmp_eq_u32 s5, 12 1560; GFX900-NEXT: v_perm_b32 v1, v12, v1, s2 1561; GFX900-NEXT: v_cndmask_b32_e32 v12, v14, v9, vcc 1562; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 1563; GFX900-NEXT: s_cmp_eq_u32 s5, 13 1564; GFX900-NEXT: v_lshrrev_b32_e32 v15, 16, v7 1565; GFX900-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc 1566; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 1567; GFX900-NEXT: s_cmp_eq_u32 s5, 10 1568; GFX900-NEXT: v_perm_b32 v8, v12, v8, s2 1569; GFX900-NEXT: v_cndmask_b32_e32 v12, v15, v9, vcc 1570; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 1571; GFX900-NEXT: s_cmp_eq_u32 s5, 11 1572; GFX900-NEXT: v_perm_b32 v3, v10, v3, s2 1573; GFX900-NEXT: v_lshrrev_b32_e32 v10, 16, v6 1574; GFX900-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc 1575; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 1576; GFX900-NEXT: s_cmp_eq_u32 s5, 8 1577; GFX900-NEXT: v_cndmask_b32_e32 v10, v10, v9, vcc 1578; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 1579; GFX900-NEXT: s_cmp_eq_u32 s5, 9 1580; GFX900-NEXT: v_perm_b32 v2, v11, v2, s2 1581; GFX900-NEXT: 
v_lshrrev_b32_e32 v11, 16, v5 1582; GFX900-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc 1583; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 1584; GFX900-NEXT: v_cndmask_b32_e32 v9, v11, v9, vcc 1585; GFX900-NEXT: v_perm_b32 v7, v12, v7, s2 1586; GFX900-NEXT: v_perm_b32 v6, v10, v6, s2 1587; GFX900-NEXT: v_perm_b32 v5, v9, v5, s2 1588; GFX900-NEXT: global_store_dwordx4 v0, v[5:8], s[0:1] offset:16 1589; GFX900-NEXT: global_store_dwordx4 v0, v[1:4], s[0:1] 1590; GFX900-NEXT: s_endpgm 1591; 1592; GFX940-LABEL: v_insertelement_v16bf16_dynamic: 1593; GFX940: ; %bb.0: 1594; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1595; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 1596; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1597; GFX940-NEXT: v_lshlrev_b32_e32 v8, 5, v0 1598; GFX940-NEXT: s_waitcnt lgkmcnt(0) 1599; GFX940-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] 1600; GFX940-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:16 1601; GFX940-NEXT: s_cmp_eq_u32 s7, 6 1602; GFX940-NEXT: v_mov_b32_e32 v9, s6 1603; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 1604; GFX940-NEXT: s_cmp_eq_u32 s7, 7 1605; GFX940-NEXT: s_mov_b32 s2, 0x5040100 1606; GFX940-NEXT: s_waitcnt vmcnt(1) 1607; GFX940-NEXT: v_cndmask_b32_e32 v10, v3, v9, vcc 1608; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3 1609; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 1610; GFX940-NEXT: s_cmp_eq_u32 s7, 4 1611; GFX940-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc 1612; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 1613; GFX940-NEXT: s_cmp_eq_u32 s7, 5 1614; GFX940-NEXT: v_lshrrev_b32_e32 v11, 16, v2 1615; GFX940-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc 1616; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 1617; GFX940-NEXT: s_cmp_eq_u32 s7, 2 1618; GFX940-NEXT: v_perm_b32 v3, v3, v10, s2 1619; GFX940-NEXT: v_cndmask_b32_e32 v10, v11, v9, vcc 1620; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 1621; GFX940-NEXT: s_cmp_eq_u32 s7, 3 1622; GFX940-NEXT: v_lshrrev_b32_e32 v12, 16, v1 1623; GFX940-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc 1624; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 
1625; GFX940-NEXT: s_cmp_eq_u32 s7, 0 1626; GFX940-NEXT: v_perm_b32 v2, v10, v2, s2 1627; GFX940-NEXT: v_cndmask_b32_e32 v10, v12, v9, vcc 1628; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 1629; GFX940-NEXT: s_cmp_eq_u32 s7, 1 1630; GFX940-NEXT: v_lshrrev_b32_e32 v13, 16, v0 1631; GFX940-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc 1632; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 1633; GFX940-NEXT: s_cmp_eq_u32 s7, 14 1634; GFX940-NEXT: v_perm_b32 v1, v10, v1, s2 1635; GFX940-NEXT: v_cndmask_b32_e32 v10, v13, v9, vcc 1636; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 1637; GFX940-NEXT: s_cmp_eq_u32 s7, 15 1638; GFX940-NEXT: s_waitcnt vmcnt(0) 1639; GFX940-NEXT: v_lshrrev_b32_e32 v14, 16, v7 1640; GFX940-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc 1641; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 1642; GFX940-NEXT: s_cmp_eq_u32 s7, 12 1643; GFX940-NEXT: v_perm_b32 v0, v10, v0, s2 1644; GFX940-NEXT: v_cndmask_b32_e32 v10, v14, v9, vcc 1645; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 1646; GFX940-NEXT: s_cmp_eq_u32 s7, 13 1647; GFX940-NEXT: v_lshrrev_b32_e32 v15, 16, v6 1648; GFX940-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc 1649; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 1650; GFX940-NEXT: s_cmp_eq_u32 s7, 10 1651; GFX940-NEXT: v_perm_b32 v7, v10, v7, s2 1652; GFX940-NEXT: v_cndmask_b32_e32 v10, v15, v9, vcc 1653; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 1654; GFX940-NEXT: s_cmp_eq_u32 s7, 11 1655; GFX940-NEXT: v_lshrrev_b32_e32 v16, 16, v5 1656; GFX940-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc 1657; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 1658; GFX940-NEXT: s_cmp_eq_u32 s7, 8 1659; GFX940-NEXT: v_perm_b32 v6, v10, v6, s2 1660; GFX940-NEXT: v_cndmask_b32_e32 v10, v16, v9, vcc 1661; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 1662; GFX940-NEXT: s_cmp_eq_u32 s7, 9 1663; GFX940-NEXT: v_lshrrev_b32_e32 v17, 16, v4 1664; GFX940-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc 1665; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 1666; GFX940-NEXT: v_cndmask_b32_e32 v9, v17, v9, vcc 1667; GFX940-NEXT: v_perm_b32 v5, v10, v5, s2 1668; GFX940-NEXT: 
v_perm_b32 v4, v9, v4, s2 1669; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 1670; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 1671; GFX940-NEXT: s_endpgm 1672 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 1673 %tid.ext = sext i32 %tid to i64 1674 %in.gep = getelementptr inbounds <16 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext 1675 %out.gep = getelementptr inbounds <16 x bfloat>, ptr addrspace(1) %out, i64 %tid.ext 1676 %vec = load <16 x bfloat>, ptr addrspace(1) %in.gep 1677 %val.trunc = trunc i32 %val to i16 1678 %val.cvt = bitcast i16 %val.trunc to bfloat 1679 %vecins = insertelement <16 x bfloat> %vec, bfloat %val.cvt, i32 %n 1680 store <16 x bfloat> %vecins, ptr addrspace(1) %out.gep 1681 ret void 1682} 1683 1684declare i32 @llvm.amdgcn.workitem.id.x() #1 1685 1686attributes #0 = { nounwind } 1687attributes #1 = { nounwind readnone } 1688