; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-flat-for-global,+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-flat-for-global -mattr=+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s

; FIXME: Broken on evergreen
; FIXME: For some reason the 8 and 16 vectors are being stored as
; individual elements instead of 128-bit stores.

define amdgpu_kernel void @insertelement_v2f32_0(ptr addrspace(1) %out, <2 x float> %a) nounwind {
; SI-LABEL: insertelement_v2f32_0:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; SI-NEXT: s_mov_b32 s7, 0x100f000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: insertelement_v2f32_0:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_mov_b32 s7, 0x1100f000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
  %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 0
  store <2 x float> %vecins, ptr addrspace(1) %out, align 16
  ret void
}

define amdgpu_kernel void @insertelement_v2f32_1(ptr addrspace(1) %out, <2 x float> %a) nounwind {
; SI-LABEL: insertelement_v2f32_1:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; SI-NEXT: s_mov_b32 s7, 0x100f000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_mov_b32_e32 v1, 0x40a00000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: insertelement_v2f32_1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_mov_b32 s7, 0x1100f000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v1, 0x40a00000
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
  %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 1
  store <2 x float> %vecins, ptr addrspace(1) %out, align 16
  ret void
}

define amdgpu_kernel void @insertelement_v2i32_0(ptr addrspace(1) %out, <2 x i32> %a) nounwind {
; SI-LABEL: insertelement_v2i32_0:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; SI-NEXT: s_mov_b32 s7, 0x100f000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_mov_b32_e32 v0, 0x3e7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: insertelement_v2i32_0:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_mov_b32 s7, 0x1100f000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0x3e7
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
  %vecins = insertelement <2 x i32> %a, i32 999, i32 0
  store <2 x i32> %vecins, ptr addrspace(1) %out, align 16
  ret void
}

define amdgpu_kernel void @insertelement_v2i32_1(ptr addrspace(1) %out, <2 x i32> %a) nounwind {
; SI-LABEL: insertelement_v2i32_1:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; SI-NEXT: s_mov_b32 s7, 0x100f000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_mov_b32_e32 v1, 0x3e7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: insertelement_v2i32_1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_mov_b32 s7, 0x1100f000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v1, 0x3e7
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
  %vecins = insertelement <2 x i32> %a, i32 999, i32 1
  store <2 x i32> %vecins, ptr addrspace(1) %out, align 16
  ret void
}

; FIXME: Why is the constant moved into the intermediate register and
; not just directly into the vector component?
define amdgpu_kernel void @insertelement_v4f32_0(ptr addrspace(1) %out, <4 x float> %a) nounwind {
; SI-LABEL: insertelement_v4f32_0:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4
; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s0, 0x40a00000
; SI-NEXT: s_mov_b32 s7, 0x100f000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_mov_b32_e32 v2, s2
; SI-NEXT: v_mov_b32_e32 v3, s3
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: insertelement_v4f32_0:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s0, 0x40a00000
; VI-NEXT: s_mov_b32 s7, 0x1100f000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: s_endpgm
  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 0
  store <4 x float> %vecins, ptr addrspace(1) %out, align 16
  ret void
}

define amdgpu_kernel void @insertelement_v4f32_1(ptr addrspace(1) %out, <4 x float> %a) nounwind {
; SI-LABEL: insertelement_v4f32_1:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4
; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s1, 0x40a00000
; SI-NEXT: s_mov_b32 s7, 0x100f000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_mov_b32_e32 v2, s2
; SI-NEXT: v_mov_b32_e32 v3, s3
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: insertelement_v4f32_1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s1, 0x40a00000
; VI-NEXT: s_mov_b32 s7, 0x1100f000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: s_endpgm
  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 1
  store <4 x float> %vecins, ptr addrspace(1) %out, align 16
  ret void
}

define amdgpu_kernel void @insertelement_v4f32_2(ptr addrspace(1) %out, <4 x float> %a) nounwind {
; SI-LABEL: insertelement_v4f32_2:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4
; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s2, 0x40a00000
; SI-NEXT: s_mov_b32 s7, 0x100f000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_mov_b32_e32 v2, s2
; SI-NEXT: v_mov_b32_e32 v3, s3
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: insertelement_v4f32_2:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s2, 0x40a00000
; VI-NEXT: s_mov_b32 s7, 0x1100f000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: s_endpgm
  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 2
  store <4 x float> %vecins, ptr addrspace(1) %out, align 16
  ret void
}

define amdgpu_kernel void @insertelement_v4f32_3(ptr addrspace(1) %out, <4 x float> %a) nounwind {
; SI-LABEL: insertelement_v4f32_3:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4
; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s3, 0x40a00000
; SI-NEXT: s_mov_b32 s7, 0x100f000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_mov_b32_e32 v2, s2
; SI-NEXT: v_mov_b32_e32 v3, s3
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: insertelement_v4f32_3:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s3, 0x40a00000
; VI-NEXT: s_mov_b32 s7, 0x1100f000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: s_endpgm
  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 3
  store <4 x float> %vecins, ptr addrspace(1) %out, align 16
  ret void
}

define amdgpu_kernel void @insertelement_v4i32_0(ptr addrspace(1) %out, <4 x i32> %a) nounwind {
; SI-LABEL: insertelement_v4i32_0:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4
; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_movk_i32 s0, 0x3e7
; SI-NEXT: s_mov_b32 s7, 0x100f000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_mov_b32_e32 v2, s2
; SI-NEXT: v_mov_b32_e32 v3, s3
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: insertelement_v4i32_0:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_movk_i32 s0, 0x3e7
; VI-NEXT: s_mov_b32 s7, 0x1100f000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: s_endpgm
  %vecins = insertelement <4 x i32> %a, i32 999, i32 0
  store <4 x i32> %vecins, ptr addrspace(1) %out, align 16
  ret void
}

define amdgpu_kernel void @insertelement_v3f32_1(ptr addrspace(1) %out, <3 x float> %a) nounwind {
; SI-LABEL: insertelement_v3f32_1:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4
; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; SI-NEXT: s_mov_b32 s7, 0x100f000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_mov_b32_e32 v1, 0x40a00000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v2, s2
; SI-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: insertelement_v3f32_1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; VI-NEXT: s_mov_b32 s7, 0x1100f000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v1, 0x40a00000
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0
; VI-NEXT: s_endpgm
  %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 1
  store <3 x float> %vecins, ptr addrspace(1) %out, align 16
  ret void
}

define amdgpu_kernel void @insertelement_v3f32_2(ptr addrspace(1) %out, <3 x float> %a) nounwind {
; SI-LABEL: insertelement_v3f32_2:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4
; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; SI-NEXT: s_mov_b32 s7, 0x100f000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_mov_b32_e32 v2, 0x40a00000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: insertelement_v3f32_2:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; VI-NEXT: s_mov_b32 s7, 0x1100f000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v2, 0x40a00000
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0
; VI-NEXT: s_endpgm
  %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 2
  store <3 x float> %vecins, ptr addrspace(1) %out, align 16
  ret void
}

define amdgpu_kernel void @insertelement_v3f32_3(ptr addrspace(1) %out, <3 x float> %a) nounwind {
; GCN-LABEL: insertelement_v3f32_3:
; GCN: ; %bb.0:
; GCN-NEXT: s_endpgm
  %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 3
  store <3 x float> %vecins, ptr addrspace(1) %out, align 16
  ret void
}

define <4 x float> @insertelement_to_sgpr() nounwind {
; GCN-LABEL: insertelement_to_sgpr:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s4, 0
; GCN-NEXT: image_gather4_lz v[0:3], v[0:1], s[4:11], s[4:7] dmask:0x1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
  %tmp = load <4 x i32>, ptr addrspace(4) undef
  %tmp1 = insertelement <4 x i32> %tmp, i32 0, i32 0
  %tmp2 = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32 1, float undef, float undef, <8 x i32> undef, <4 x i32> %tmp1, i1 0, i32 0, i32 0)
  ret <4 x float> %tmp2
}

define <9 x float> @insertelement_to_v9f32_undef() nounwind {
; GCN-LABEL: insertelement_to_v9f32_undef:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0
; GCN-NEXT: v_mov_b32_e32 v0, 0x40a00000
; GCN-NEXT: v_mov_b32_e32 v2, 0xc0a00000
; GCN-NEXT: v_mov_b32_e32 v7, 0x41880000
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dword s4, s[4:5], 0x0
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: v_mov_b32_e32 v3, s7
; GCN-NEXT: v_mov_b32_e32 v4, s8
; GCN-NEXT: v_mov_b32_e32 v5, s9
; GCN-NEXT: v_mov_b32_e32 v6, s10
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v8, s4
; GCN-NEXT: s_setpc_b64 s[30:31]
  %tmp = load <9 x float>, ptr addrspace(4) undef
  %tmp1 = insertelement <9 x float> %tmp, float 5.000, i32 0
  %tmp2 = insertelement <9 x float> %tmp1, float -5.000, i32 2
  %tmp3 = insertelement <9 x float> %tmp2, float 17.000, i32 7
  ret <9 x float> %tmp3
}

define <10 x float> @insertelement_to_v10f32_undef() nounwind {
; GCN-LABEL: insertelement_to_v10f32_undef:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0
; GCN-NEXT: v_mov_b32_e32 v0, 2.0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x0
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: v_mov_b32_e32 v2, s6
; GCN-NEXT: v_mov_b32_e32 v3, s7
; GCN-NEXT: v_mov_b32_e32 v4, s8
; GCN-NEXT: v_mov_b32_e32 v5, s9
; GCN-NEXT: v_mov_b32_e32 v6, s10
; GCN-NEXT: v_mov_b32_e32 v7, s11
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v8, s12
; GCN-NEXT: v_mov_b32_e32 v9, s13
; GCN-NEXT: s_setpc_b64 s[30:31]
  %tmp = load <10 x float>, ptr addrspace(4) undef
  %tmp1 = insertelement <10 x float> %tmp, float 2.0, i32 0
  ret <10 x float> %tmp1
}

define <11 x float> @insertelement_to_v11f32_undef() nounwind {
; GCN-LABEL: insertelement_to_v11f32_undef:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0
; GCN-NEXT: v_mov_b32_e32 v0, 1.0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x0
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: v_mov_b32_e32 v2, s6
; GCN-NEXT: v_mov_b32_e32 v3, s7
; GCN-NEXT: v_mov_b32_e32 v4, s8
; GCN-NEXT: v_mov_b32_e32 v5, s9
; GCN-NEXT: v_mov_b32_e32 v6, s10
; GCN-NEXT: v_mov_b32_e32 v7, s11
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v8, s12
; GCN-NEXT: v_mov_b32_e32 v9, s13
; GCN-NEXT: v_mov_b32_e32 v10, s14
; GCN-NEXT: s_setpc_b64 s[30:31]
  %tmp = load <11 x float>, ptr addrspace(4) undef
  %tmp1 = insertelement <11 x float> %tmp, float 1.000, i32 0
  ret <11 x float> %tmp1
}

define <12 x float> @insertelement_to_v12f32_undef() nounwind {
; GCN-LABEL: insertelement_to_v12f32_undef:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0
; GCN-NEXT: v_mov_b32_e32 v0, 4.0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x0
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: v_mov_b32_e32 v2, s6
; GCN-NEXT: v_mov_b32_e32 v3, s7
; GCN-NEXT: v_mov_b32_e32 v4, s8
; GCN-NEXT: v_mov_b32_e32 v5, s9
; GCN-NEXT: v_mov_b32_e32 v6, s10
; GCN-NEXT: v_mov_b32_e32 v7, s11
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v8, s12
; GCN-NEXT: v_mov_b32_e32 v9, s13
; GCN-NEXT: v_mov_b32_e32 v10, s14
; GCN-NEXT: v_mov_b32_e32 v11, s15
; GCN-NEXT: s_setpc_b64 s[30:31]
  %tmp = load <12 x float>, ptr addrspace(4) undef
  %tmp1 = insertelement <12 x float> %tmp, float 4.0, i32 0
  ret <12 x float> %tmp1
}

define amdgpu_kernel void @dynamic_insertelement_v2f32(ptr addrspace(1) %out, <2 x float> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v2f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2
; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000
; SI-NEXT: s_mov_b32 s7, 0x100f000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_cmp_lg_u32 s2, 1
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_cmp_lg_u32 s2, 0
; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v2, s0
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: dynamic_insertelement_v2f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000
; VI-NEXT: s_mov_b32 s7, 0x1100f000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s2, 1
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: s_cmp_lg_u32 s2, 0
; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
  %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 %b
  store <2 x float> %vecins, ptr addrspace(1) %out, align 8
  ret void
}

define amdgpu_kernel void @dynamic_insertelement_v3f32(ptr addrspace(1) %out, <3 x float> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v3f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s10, s[8:9], 0x8
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x4
; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000
; SI-NEXT: s_mov_b32 s3, 0x100f000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_cmp_lg_u32 s10, 2
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_mov_b32_e32 v1, s6
; SI-NEXT: s_cmp_lg_u32 s10, 1
; SI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_cmp_lg_u32 s10, 0
; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s4
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; SI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: dynamic_insertelement_v3f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s10, s[8:9], 0x20
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x10
; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s10, 2
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: v_mov_b32_e32 v1, s6
; VI-NEXT: s_cmp_lg_u32 s10, 1
; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: s_cmp_lg_u32 s10, 0
; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s4
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
; VI-NEXT: s_endpgm
  %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 %b
  store <3 x float> %vecins, ptr addrspace(1) %out, align 16
  ret void
}

define amdgpu_kernel void @dynamic_insertelement_v4f32(ptr addrspace(1) %out, <4 x float> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v4f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s10, s[8:9], 0x8
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x4
; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000
; SI-NEXT: s_mov_b32 s3, 0x100f000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_cmp_lg_u32 s10, 3
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_mov_b32_e32 v1, s7
; SI-NEXT: s_cmp_lg_u32 s10, 2
; SI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v1, s6
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_cmp_lg_u32 s10, 1
; SI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_cmp_lg_u32 s10, 0
; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v4, s4
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: dynamic_insertelement_v4f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s10, s[8:9], 0x20
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x10
; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s10, 3
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: s_cmp_lg_u32 s10, 2
; VI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v1, s6
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: s_cmp_lg_u32 s10, 1
; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: s_cmp_lg_u32 s10, 0
; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %b
  store <4 x float> %vecins, ptr addrspace(1) %out, align 16
  ret void
}

define amdgpu_kernel void @dynamic_insertelement_v8f32(ptr addrspace(1) %out, <8 x float> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v8f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x8
; SI-NEXT: s_load_dwordx2 s[12:13], s[8:9], 0x0
; SI-NEXT: s_load_dword s8, s[8:9], 0x10
; SI-NEXT: v_mov_b32_e32 v8, 0x40a00000
; SI-NEXT: s_mov_b32 s15, 0x100f000
; SI-NEXT: s_mov_b32 s14, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_mov_b32_e32 v2, s2
; SI-NEXT: v_mov_b32_e32 v3, s3
; SI-NEXT: v_mov_b32_e32 v4, s4
; SI-NEXT: v_mov_b32_e32 v5, s5
; SI-NEXT: v_mov_b32_e32 v6, s6
; SI-NEXT: v_mov_b32_e32 v7, s7
; SI-NEXT: s_mov_b32 m0, s8
; SI-NEXT: v_movreld_b32_e32 v0, v8
; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: dynamic_insertelement_v8f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x20
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x40
; VI-NEXT: v_mov_b32_e32 v8, 0x40a00000
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s12
; VI-NEXT: v_mov_b32_e32 v1, s13
; VI-NEXT: v_mov_b32_e32 v2, s14
; VI-NEXT: v_mov_b32_e32 v3, s15
; VI-NEXT: v_mov_b32_e32 v4, s16
; VI-NEXT: v_mov_b32_e32 v5, s17
; VI-NEXT: v_mov_b32_e32 v6, s18
; VI-NEXT: v_mov_b32_e32 v7, s19
; VI-NEXT: s_mov_b32 m0, s4
; VI-NEXT: v_movreld_b32_e32 v0, v8
; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
  %vecins = insertelement <8 x float> %a, float 5.000000e+00, i32 %b
  store <8 x float> %vecins, ptr addrspace(1) %out, align 32
  ret void
}

define amdgpu_kernel void @dynamic_insertelement_v9f32(ptr addrspace(1) %out, <9 x float> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v9f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x10
; SI-NEXT: s_load_dword s4, s[8:9], 0x18
; SI-NEXT: s_load_dword s5, s[8:9], 0x20
; SI-NEXT: v_mov_b32_e32 v9, 0x40a00000
; SI-NEXT: s_mov_b32 s3, 0x100f000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s12
; SI-NEXT: v_mov_b32_e32 v1, s13
; SI-NEXT: v_mov_b32_e32 v2, s14
; SI-NEXT: v_mov_b32_e32 v3, s15
; SI-NEXT: v_mov_b32_e32 v4, s16
; SI-NEXT: v_mov_b32_e32 v5, s17
; SI-NEXT: v_mov_b32_e32 v6, s18
; SI-NEXT: v_mov_b32_e32 v7, s19
; SI-NEXT: v_mov_b32_e32 v8, s4
; SI-NEXT: s_mov_b32 m0, s5
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_movreld_b32_e32 v0, v9
; SI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:32
; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: dynamic_insertelement_v9f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x40
; VI-NEXT: s_load_dword s4, s[8:9], 0x60
; VI-NEXT: s_load_dword s5, s[8:9], 0x80
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_mov_b32_e32 v9, 0x40a00000
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s12
; VI-NEXT: v_mov_b32_e32 v1, s13
; VI-NEXT: v_mov_b32_e32 v2, s14
; VI-NEXT: v_mov_b32_e32 v3, s15
; VI-NEXT: v_mov_b32_e32 v4, s16
; VI-NEXT: v_mov_b32_e32 v5, s17
; VI-NEXT: v_mov_b32_e32 v6, s18
; VI-NEXT: v_mov_b32_e32 v7, s19
; VI-NEXT: v_mov_b32_e32 v8, s4
; VI-NEXT: s_mov_b32 m0, s5
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: v_movreld_b32_e32 v0, v9
; VI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:32
; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
  %vecins = insertelement <9 x float> %a, float 5.000000e+00, i32 %b
  store <9 x float> %vecins, ptr addrspace(1) %out, align 32
  ret void
}

define amdgpu_kernel void @dynamic_insertelement_v10f32(ptr addrspace(1) %out, <10 x float> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v10f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x10
; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x18
; SI-NEXT: s_load_dword s6, s[8:9], 0x20
; SI-NEXT: v_mov_b32_e32 v10, 0x40a00000
; SI-NEXT: s_mov_b32 s3, 0x100f000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s12
; SI-NEXT: v_mov_b32_e32 v1, s13
; SI-NEXT: v_mov_b32_e32 v2, s14
; SI-NEXT: v_mov_b32_e32 v3, s15
; SI-NEXT: v_mov_b32_e32 v4, s16
; SI-NEXT: v_mov_b32_e32 v5, s17
; SI-NEXT: v_mov_b32_e32 v6, s18
; SI-NEXT: v_mov_b32_e32 v7, s19
; SI-NEXT: v_mov_b32_e32 v8, s4
; SI-NEXT: v_mov_b32_e32 v9, s5
; SI-NEXT: s_mov_b32 m0, s6
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_movreld_b32_e32 v0, v10
; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: buffer_store_dwordx2 v[8:9], off, s[0:3], 0 offset:32
; SI-NEXT: s_endpgm
;
; VI-LABEL: dynamic_insertelement_v10f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x40
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x60
; VI-NEXT: s_load_dword s6, s[8:9], 0x80
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_mov_b32_e32 v10, 0x40a00000
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s12
; VI-NEXT: v_mov_b32_e32 v1, s13
; VI-NEXT: v_mov_b32_e32 v2, s14
; VI-NEXT: v_mov_b32_e32 v3, s15
; VI-NEXT: v_mov_b32_e32 v4, s16
; VI-NEXT: v_mov_b32_e32 v5, s17
; VI-NEXT: v_mov_b32_e32 v6, s18
; VI-NEXT: v_mov_b32_e32 v7, s19
; VI-NEXT: v_mov_b32_e32 v8, s4
; VI-NEXT: v_mov_b32_e32 v9, s5
; VI-NEXT: s_mov_b32 m0, s6
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: v_movreld_b32_e32 v0, v10
; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: buffer_store_dwordx2 v[8:9], off, s[0:3], 0 offset:32
; VI-NEXT: s_endpgm
  %vecins = insertelement <10 x float> %a, float 5.000000e+00, i32 %b
  store <10 x float> %vecins, ptr addrspace(1) %out, align 32
  ret void
}

define amdgpu_kernel void @dynamic_insertelement_v11f32(ptr addrspace(1) %out, <11 x float> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v11f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x10
; SI-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x18
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dword s7, s[8:9], 0x20
; SI-NEXT: v_mov_b32_e32 v11, 0x40a00000
; SI-NEXT: s_mov_b32 s3, 0x100f000
; SI-NEXT: v_mov_b32_e32 v0, s12
; SI-NEXT: v_mov_b32_e32 v1, s13
; SI-NEXT: v_mov_b32_e32 v2, s14
; SI-NEXT: v_mov_b32_e32 v3, s15
; SI-NEXT: v_mov_b32_e32 v4, s16
; SI-NEXT: v_mov_b32_e32 v5, s17
; SI-NEXT: v_mov_b32_e32 v6, s18
; SI-NEXT: v_mov_b32_e32 v7, s19
; SI-NEXT: v_mov_b32_e32 v8, s4
; SI-NEXT: v_mov_b32_e32 v9, s5
; SI-NEXT: v_mov_b32_e32 v10, s6
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 m0, s7
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_movreld_b32_e32 v0, v11
; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: buffer_store_dwordx3 v[8:10], off, s[0:3], 0 offset:32
; SI-NEXT: s_endpgm
;
; VI-LABEL: dynamic_insertelement_v11f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x60
; VI-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x40
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s7, s[8:9], 0x80
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_mov_b32_e32 v11, 0x40a00000
; VI-NEXT: v_mov_b32_e32 v8, s4
; VI-NEXT: v_mov_b32_e32 v0, s12
; VI-NEXT: v_mov_b32_e32 v1, s13
; VI-NEXT: v_mov_b32_e32 v2, s14
; VI-NEXT: v_mov_b32_e32 v3, s15
; VI-NEXT: v_mov_b32_e32 v4, s16
; VI-NEXT: v_mov_b32_e32 v5, s17
; VI-NEXT: v_mov_b32_e32 v6, s18
; VI-NEXT: v_mov_b32_e32 v7, s19
; VI-NEXT: v_mov_b32_e32 v9, s5
; VI-NEXT: v_mov_b32_e32 v10, s6
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 m0, s7
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: v_movreld_b32_e32 v0, v11
; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: buffer_store_dwordx3 v[8:10], off, s[0:3], 0 offset:32
; VI-NEXT: s_endpgm
  %vecins = insertelement <11 x float> %a, float 5.000000e+00, i32 %b
  store <11 x float> %vecins, ptr addrspace(1) %out, align 32
  ret void
}

define amdgpu_kernel void @dynamic_insertelement_v12f32(ptr addrspace(1) %out, <12 x float> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v12f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x10
; SI-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x18
; SI-NEXT: s_load_dword s8, s[8:9], 0x20
; SI-NEXT: v_mov_b32_e32 v12, 0x40a00000
; SI-NEXT: s_mov_b32 s3, 0x100f000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s12
; SI-NEXT: v_mov_b32_e32 v1, s13
; SI-NEXT: v_mov_b32_e32 v2, s14
; SI-NEXT: v_mov_b32_e32 v3, s15
; SI-NEXT: v_mov_b32_e32 v4, s16
; SI-NEXT: v_mov_b32_e32 v5, s17
; SI-NEXT: v_mov_b32_e32 v6, s18
; SI-NEXT: v_mov_b32_e32 v7, s19
; SI-NEXT: v_mov_b32_e32 v8, s4
; SI-NEXT: v_mov_b32_e32 v9, s5
; SI-NEXT: v_mov_b32_e32 v10, s6
; SI-NEXT: v_mov_b32_e32 v11, s7
; SI-NEXT: s_mov_b32 m0, s8
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_movreld_b32_e32 v0, v12
; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; SI-NEXT: s_endpgm
;
; VI-LABEL: dynamic_insertelement_v12f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x40
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x60
; VI-NEXT: s_load_dword s8, s[8:9], 0x80
; VI-NEXT: v_mov_b32_e32 v12, 0x40a00000
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s12
; VI-NEXT: v_mov_b32_e32 v1, s13
; VI-NEXT: v_mov_b32_e32 v2, s14
; VI-NEXT: v_mov_b32_e32 v3, s15
; VI-NEXT: v_mov_b32_e32 v4, s16
; VI-NEXT: v_mov_b32_e32 v5, s17
; VI-NEXT: v_mov_b32_e32 v6, s18
; VI-NEXT: v_mov_b32_e32 v7, s19
; VI-NEXT: v_mov_b32_e32 v8, s4
; VI-NEXT: v_mov_b32_e32 v9, s5
; VI-NEXT: v_mov_b32_e32 v10, s6
; VI-NEXT: v_mov_b32_e32 v11, s7
; VI-NEXT: s_mov_b32 m0, s8
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: v_movreld_b32_e32 v0, v12
; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; VI-NEXT: s_endpgm
  %vecins = insertelement <12 x float> %a, float 5.000000e+00, i32 %b
  store <12 x float> %vecins, ptr addrspace(1) %out, align 32
  ret void
}

define amdgpu_kernel void @dynamic_insertelement_v16f32(ptr addrspace(1) %out, <16 x float> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v16f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x10
; SI-NEXT: s_load_dword s4, s[8:9], 0x20
; SI-NEXT: v_mov_b32_e32 v16, 0x40a00000
; SI-NEXT: s_mov_b32 s3, 0x100f000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s12
; SI-NEXT: v_mov_b32_e32 v1, s13
; SI-NEXT: v_mov_b32_e32 v2, s14
; SI-NEXT: v_mov_b32_e32 v3, s15
; SI-NEXT: v_mov_b32_e32 v4, s16
; SI-NEXT: v_mov_b32_e32 v5, s17
; SI-NEXT: v_mov_b32_e32 v6, s18
; SI-NEXT: v_mov_b32_e32 v7, s19
; SI-NEXT: v_mov_b32_e32 v8, s20
; SI-NEXT: v_mov_b32_e32 v9, s21
; SI-NEXT: v_mov_b32_e32 v10, s22
; SI-NEXT: v_mov_b32_e32 v11, s23
; SI-NEXT: v_mov_b32_e32 v12, s24
; SI-NEXT: v_mov_b32_e32 v13, s25
; SI-NEXT: v_mov_b32_e32 v14, s26
; SI-NEXT: v_mov_b32_e32 v15, s27
; SI-NEXT: s_mov_b32 m0, s4
; SI-NEXT: v_movreld_b32_e32 v0, v16
; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: dynamic_insertelement_v16f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x40
; VI-NEXT: s_load_dword s4, s[8:9], 0x80
; VI-NEXT: v_mov_b32_e32 v16, 0x40a00000
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s12
; VI-NEXT: v_mov_b32_e32 v1, s13
; VI-NEXT: v_mov_b32_e32 v2, s14
; VI-NEXT: v_mov_b32_e32 v3, s15
; VI-NEXT: v_mov_b32_e32 v4, s16
; VI-NEXT: v_mov_b32_e32 v5, s17
; VI-NEXT: v_mov_b32_e32 v6, s18
; VI-NEXT: v_mov_b32_e32 v7, s19
; VI-NEXT: v_mov_b32_e32 v8, s20
; VI-NEXT: v_mov_b32_e32 v9, s21
; VI-NEXT: v_mov_b32_e32 v10, s22
; VI-NEXT: v_mov_b32_e32 v11, s23
; VI-NEXT: v_mov_b32_e32 v12, s24
; VI-NEXT: v_mov_b32_e32 v13, s25
; VI-NEXT: v_mov_b32_e32 v14, s26
; VI-NEXT: v_mov_b32_e32 v15, s27
; VI-NEXT: s_mov_b32 m0, s4
; VI-NEXT: v_movreld_b32_e32 v0, v16
; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
  %vecins = insertelement <16 x float> %a, float 5.000000e+00, i32 %b
  store <16 x float> %vecins, ptr addrspace(1) %out, align 64
  ret void
}

define amdgpu_kernel void @dynamic_insertelement_v2i32(ptr addrspace(1) %out, <2 x i32> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v2i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2
; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; SI-NEXT: s_mov_b32 s7, 0x100f000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_cmp_lg_u32 s2, 1
; SI-NEXT: s_cselect_b32 s1, s1, 5
; SI-NEXT: s_cmp_lg_u32 s2, 0
; SI-NEXT: s_cselect_b32 s0, s0, 5
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: dynamic_insertelement_v2i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; VI-NEXT: s_mov_b32 s7, 0x1100f000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s2, 1
; VI-NEXT: s_cselect_b32 s1, s1, 5
; VI-NEXT: s_cmp_lg_u32 s2, 0
; VI-NEXT: s_cselect_b32 s0, s0, 5
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
  %vecins = insertelement <2 x i32> %a, i32 5, i32 %b
  store <2 x i32> %vecins, ptr addrspace(1) %out, align 8
  ret void
}

define amdgpu_kernel void @dynamic_insertelement_v3i32(ptr addrspace(1) %out, <3 x i32> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v3i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s10, s[8:9], 0x8
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4
; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; SI-NEXT: s_mov_b32 s7, 0x100f000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_cmp_lg_u32 s10, 2
; SI-NEXT: s_cselect_b32 s2, s2, 5
; SI-NEXT: s_cmp_lg_u32 s10, 1
; SI-NEXT: s_cselect_b32 s1, s1, 5
; SI-NEXT: s_cmp_lg_u32 s10, 0
; SI-NEXT: s_cselect_b32 s0, s0, 5
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_mov_b32_e32 v2, s2
; SI-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: dynamic_insertelement_v3i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s10, s[8:9], 0x20
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; VI-NEXT: s_mov_b32 s7, 0x1100f000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s10, 2
; VI-NEXT: s_cselect_b32 s2, s2, 5
; VI-NEXT: s_cmp_lg_u32 s10, 1
; VI-NEXT: s_cselect_b32 s1, s1, 5
; VI-NEXT: s_cmp_lg_u32 s10, 0
; VI-NEXT: s_cselect_b32 s0, s0, 5
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0
; VI-NEXT: s_endpgm
  %vecins = insertelement <3 x i32> %a, i32 5, i32 %b
  store <3 x i32> %vecins, ptr addrspace(1) %out, align 16
  ret void
}

define amdgpu_kernel void @dynamic_insertelement_v4i32(ptr addrspace(1) %out, <4 x i32> %a, i32 %b, [8 x i32], i32 %val) nounwind {
; SI-LABEL: dynamic_insertelement_v4i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4
; SI-NEXT: s_load_dword s10, s[8:9], 0x8
; SI-NEXT: s_load_dword s11, s[8:9], 0x11
; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; SI-NEXT: s_mov_b32 s7, 0x100f000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_cmp_eq_u32 s10, 3
; SI-NEXT: s_cselect_b32 s3, s11, s3
; SI-NEXT: s_cmp_eq_u32 s10, 2
; SI-NEXT: s_cselect_b32 s2, s11, s2
; SI-NEXT: s_cmp_eq_u32 s10, 1
; SI-NEXT: s_cselect_b32 s1, s11, s1
; SI-NEXT: s_cmp_eq_u32 s10, 0
; SI-NEXT: s_cselect_b32 s0, s11, s0
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_mov_b32_e32 v2, s2
; SI-NEXT: v_mov_b32_e32 v3, s3
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: dynamic_insertelement_v4i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10
; VI-NEXT: s_load_dword s10, s[8:9], 0x20
; VI-NEXT: s_load_dword s11, s[8:9], 0x44
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; VI-NEXT: s_mov_b32 s7, 0x1100f000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_eq_u32 s10, 3
; VI-NEXT: s_cselect_b32 s3, s11, s3
; VI-NEXT: s_cmp_eq_u32 s10, 2
; VI-NEXT: s_cselect_b32 s2, s11, s2
; VI-NEXT: s_cmp_eq_u32 s10, 1
; VI-NEXT: s_cselect_b32 s1, s11, s1
; VI-NEXT: s_cmp_eq_u32 s10, 0
; VI-NEXT: s_cselect_b32 s0, s11, s0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: s_endpgm
  %vecins = insertelement <4 x i32> %a, i32 %val, i32 %b
  store <4 x i32> %vecins, ptr addrspace(1) %out, align 16
  ret void
}

define amdgpu_kernel void @dynamic_insertelement_v8i32(ptr addrspace(1) %out, <8 x i32> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v8i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x8
; SI-NEXT: s_load_dwordx2 s[12:13], s[8:9], 0x0
; SI-NEXT: s_load_dword s8, s[8:9], 0x10
; SI-NEXT: s_mov_b32 s15, 0x100f000
; SI-NEXT: s_mov_b32 s14, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_mov_b32_e32 v2, s2
; SI-NEXT: v_mov_b32_e32 v3, s3
; SI-NEXT: v_mov_b32_e32 v4, s4
; SI-NEXT: v_mov_b32_e32 v5, s5
; SI-NEXT: v_mov_b32_e32 v6, s6
; SI-NEXT: v_mov_b32_e32 v7, s7
; SI-NEXT: s_mov_b32 m0, s8
; SI-NEXT: v_movreld_b32_e32 v0, 5
; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: dynamic_insertelement_v8i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x20
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x40
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s12
; VI-NEXT: v_mov_b32_e32 v1, s13
; VI-NEXT: v_mov_b32_e32 v2, s14
; VI-NEXT: v_mov_b32_e32 v3, s15
; VI-NEXT: v_mov_b32_e32 v4, s16
; VI-NEXT: v_mov_b32_e32 v5, s17
; VI-NEXT: v_mov_b32_e32 v6, s18
; VI-NEXT: v_mov_b32_e32 v7, s19
; VI-NEXT: s_mov_b32 m0, s4
; VI-NEXT: v_movreld_b32_e32 v0, 5
; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
  %vecins = insertelement <8 x i32> %a, i32 5, i32 %b
  store <8 x i32> %vecins, ptr addrspace(1) %out, align 32
  ret void
}

define amdgpu_kernel void @dynamic_insertelement_v9i32(ptr addrspace(1) %out, <9 x i32> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v9i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x10
; SI-NEXT: s_load_dword s4, s[8:9], 0x18
; SI-NEXT: s_load_dword s5, s[8:9], 0x20
; SI-NEXT: s_mov_b32 s3, 0x100f000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s12
; SI-NEXT: v_mov_b32_e32 v1, s13
; SI-NEXT: v_mov_b32_e32 v2, s14
; SI-NEXT: v_mov_b32_e32 v3, s15
; SI-NEXT: v_mov_b32_e32 v4, s16
; SI-NEXT: v_mov_b32_e32 v5, s17
; SI-NEXT: v_mov_b32_e32 v6, s18
; SI-NEXT: v_mov_b32_e32 v7, s19
; SI-NEXT: v_mov_b32_e32 v8, s4
; SI-NEXT: s_mov_b32 m0, s5
; SI-NEXT: v_movreld_b32_e32 v0, 5
; SI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:32
; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: dynamic_insertelement_v9i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x40
; VI-NEXT: s_load_dword s4, s[8:9], 0x60
; VI-NEXT: s_load_dword s5, s[8:9], 0x80
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s12
; VI-NEXT: v_mov_b32_e32 v1, s13
; VI-NEXT: v_mov_b32_e32 v2, s14
; VI-NEXT: v_mov_b32_e32 v3, s15
; VI-NEXT: v_mov_b32_e32 v4, s16
; VI-NEXT: v_mov_b32_e32 v5, s17
; VI-NEXT: v_mov_b32_e32 v6, s18
; VI-NEXT: v_mov_b32_e32 v7, s19
; VI-NEXT: v_mov_b32_e32 v8, s4
; VI-NEXT: s_mov_b32 m0, s5
; VI-NEXT: v_movreld_b32_e32 v0, 5
; VI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:32
; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
  %vecins = insertelement <9 x i32> %a, i32 5, i32 %b
  store <9 x i32> %vecins, ptr addrspace(1) %out, align 32
  ret void
}

define amdgpu_kernel void @dynamic_insertelement_v10i32(ptr addrspace(1) %out, <10 x i32> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v10i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x10
; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x18
; SI-NEXT: s_load_dword s6, s[8:9], 0x20
; SI-NEXT: s_mov_b32 s3, 0x100f000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s12
; SI-NEXT: v_mov_b32_e32 v1, s13
; SI-NEXT: v_mov_b32_e32 v2, s14
; SI-NEXT: v_mov_b32_e32 v3, s15
; SI-NEXT: v_mov_b32_e32 v4, s16
; SI-NEXT: v_mov_b32_e32 v5, s17
; SI-NEXT: v_mov_b32_e32 v6, s18
; SI-NEXT: v_mov_b32_e32 v7, s19
; SI-NEXT: v_mov_b32_e32 v8, s4
; SI-NEXT: v_mov_b32_e32 v9, s5
; SI-NEXT: s_mov_b32 m0, s6
; SI-NEXT: v_movreld_b32_e32 v0, 5
; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: buffer_store_dwordx2 v[8:9], off, s[0:3], 0 offset:32
; SI-NEXT: s_endpgm
;
; VI-LABEL: dynamic_insertelement_v10i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x40
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x60
; VI-NEXT: s_load_dword s6, s[8:9], 0x80
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s12
; VI-NEXT: v_mov_b32_e32 v1, s13
; VI-NEXT: v_mov_b32_e32 v2, s14
; VI-NEXT: v_mov_b32_e32 v3, s15
; VI-NEXT: v_mov_b32_e32 v4, s16
; VI-NEXT: v_mov_b32_e32 v5, s17
; VI-NEXT: v_mov_b32_e32 v6, s18
; VI-NEXT: v_mov_b32_e32 v7, s19
; VI-NEXT: v_mov_b32_e32 v8, s4
; VI-NEXT: v_mov_b32_e32 v9, s5
; VI-NEXT: s_mov_b32 m0, s6
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: v_movreld_b32_e32 v0, 5
; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: buffer_store_dwordx2 v[8:9], off, s[0:3], 0 offset:32
; VI-NEXT: s_endpgm
  %vecins = insertelement <10 x i32> %a, i32 5, i32 %b
  store <10 x i32> %vecins, ptr addrspace(1) %out, align 32
  ret void
}

define amdgpu_kernel void @dynamic_insertelement_v11i32(ptr addrspace(1) %out, <11 x i32> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v11i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x10
; SI-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x18
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dword s7, s[8:9], 0x20
; SI-NEXT: s_mov_b32 s3, 0x100f000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s12
; SI-NEXT: v_mov_b32_e32 v1, s13
; SI-NEXT: v_mov_b32_e32 v2, s14
; SI-NEXT: v_mov_b32_e32 v3, s15
; SI-NEXT: v_mov_b32_e32 v4, s16
; SI-NEXT: v_mov_b32_e32 v5, s17
; SI-NEXT: v_mov_b32_e32 v6, s18
; SI-NEXT: v_mov_b32_e32 v7, s19
; SI-NEXT: v_mov_b32_e32 v8, s4
; SI-NEXT: v_mov_b32_e32 v9, s5
; SI-NEXT: v_mov_b32_e32 v10, s6
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 m0, s7
; SI-NEXT: v_movreld_b32_e32 v0, 5
; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: buffer_store_dwordx3 v[8:10], off, s[0:3], 0 offset:32
; SI-NEXT: s_endpgm
;
; VI-LABEL: dynamic_insertelement_v11i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x60
; VI-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x40
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s7, s[8:9], 0x80
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: v_mov_b32_e32 v8, s4
; VI-NEXT: v_mov_b32_e32 v0, s12
; VI-NEXT: v_mov_b32_e32 v1, s13
; VI-NEXT: v_mov_b32_e32 v2, s14
; VI-NEXT: v_mov_b32_e32 v3, s15
; VI-NEXT: v_mov_b32_e32 v4, s16
; VI-NEXT: v_mov_b32_e32 v5, s17
; VI-NEXT: v_mov_b32_e32 v6, s18
; VI-NEXT: v_mov_b32_e32 v7, s19
; VI-NEXT: v_mov_b32_e32 v9, s5
; VI-NEXT: v_mov_b32_e32 v10, s6
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 m0, s7
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: v_movreld_b32_e32 v0, 5
; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: buffer_store_dwordx3 v[8:10], off, s[0:3], 0 offset:32
; VI-NEXT: s_endpgm
  %vecins = insertelement <11 x i32> %a, i32 5, i32 %b
  store <11 x i32> %vecins, ptr addrspace(1) %out, align 32
  ret void
}

define amdgpu_kernel void @dynamic_insertelement_v12i32(ptr addrspace(1) %out, <12 x i32> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v12i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x10
; SI-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x18
; SI-NEXT: s_load_dword s8, s[8:9], 0x20
; SI-NEXT: s_mov_b32 s3, 0x100f000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s12
; SI-NEXT: v_mov_b32_e32 v1, s13
; SI-NEXT: v_mov_b32_e32 v2, s14
; SI-NEXT: v_mov_b32_e32 v3, s15
; SI-NEXT: v_mov_b32_e32 v4, s16
; SI-NEXT: v_mov_b32_e32 v5, s17
; SI-NEXT: v_mov_b32_e32 v6, s18
; SI-NEXT: v_mov_b32_e32 v7, s19
; SI-NEXT: v_mov_b32_e32 v8, s4
; SI-NEXT: v_mov_b32_e32 v9, s5
; SI-NEXT: v_mov_b32_e32 v10, s6
; SI-NEXT: v_mov_b32_e32 v11, s7
; SI-NEXT: s_mov_b32 m0, s8
; SI-NEXT: v_movreld_b32_e32 v0, 5
; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; SI-NEXT: s_endpgm
;
; VI-LABEL: dynamic_insertelement_v12i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x40
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x60
; VI-NEXT: s_load_dword s8, s[8:9], 0x80
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s12
; VI-NEXT: v_mov_b32_e32 v1, s13
; VI-NEXT: v_mov_b32_e32 v2, s14
; VI-NEXT: v_mov_b32_e32 v3, s15
; VI-NEXT: v_mov_b32_e32 v4, s16
; VI-NEXT: v_mov_b32_e32 v5, s17
; VI-NEXT: v_mov_b32_e32 v6, s18
; VI-NEXT: v_mov_b32_e32 v7, s19
; VI-NEXT: v_mov_b32_e32 v8, s4
; VI-NEXT: v_mov_b32_e32 v9, s5
; VI-NEXT: v_mov_b32_e32 v10, s6
; VI-NEXT: v_mov_b32_e32 v11, s7
; VI-NEXT: s_mov_b32 m0, s8
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: v_movreld_b32_e32 v0, 5
; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; VI-NEXT: s_endpgm
  %vecins = insertelement <12 x i32> %a, i32 5, i32 %b
  store <12 x i32> %vecins, ptr addrspace(1) %out, align 32
  ret void
}

define amdgpu_kernel void @dynamic_insertelement_v16i32(ptr addrspace(1) %out, <16 x i32> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v16i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x10
; SI-NEXT: s_load_dword s4, s[8:9], 0x20
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT: s_mov_b32 s3, 0x100f000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s12
; SI-NEXT: v_mov_b32_e32 v1, s13
; SI-NEXT: v_mov_b32_e32 v2, s14
; SI-NEXT: v_mov_b32_e32 v3, s15
; SI-NEXT: v_mov_b32_e32 v4, s16
; SI-NEXT: v_mov_b32_e32 v5, s17
; SI-NEXT: v_mov_b32_e32 v6, s18
; SI-NEXT: v_mov_b32_e32 v7, s19
; SI-NEXT: v_mov_b32_e32 v8, s20
; SI-NEXT: v_mov_b32_e32 v9, s21
; SI-NEXT: v_mov_b32_e32 v10, s22
; SI-NEXT: v_mov_b32_e32 v11, s23
; SI-NEXT: v_mov_b32_e32 v12, s24
; SI-NEXT: v_mov_b32_e32 v13, s25
; SI-NEXT: v_mov_b32_e32 v14, s26
; SI-NEXT: v_mov_b32_e32 v15, s27
; SI-NEXT: s_mov_b32 m0, s4
; SI-NEXT: v_movreld_b32_e32 v0, 5
; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: dynamic_insertelement_v16i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x40
; VI-NEXT: s_load_dword s4, s[8:9], 0x80
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s12
; VI-NEXT: v_mov_b32_e32 v1, s13
; VI-NEXT: v_mov_b32_e32 v2, s14
; VI-NEXT: v_mov_b32_e32 v3, s15
; VI-NEXT: v_mov_b32_e32 v4, s16
; VI-NEXT: v_mov_b32_e32 v5, s17
; VI-NEXT: v_mov_b32_e32 v6, s18
; VI-NEXT: v_mov_b32_e32 v7, s19
; VI-NEXT: v_mov_b32_e32 v8, s20
; VI-NEXT: v_mov_b32_e32 v9, s21
; VI-NEXT: v_mov_b32_e32 v10, s22
; VI-NEXT: v_mov_b32_e32 v11, s23
; VI-NEXT: v_mov_b32_e32 v12, s24
; VI-NEXT: v_mov_b32_e32 v13, s25
; VI-NEXT: v_mov_b32_e32 v14, s26
; VI-NEXT: v_mov_b32_e32 v15, s27
; VI-NEXT: s_mov_b32 m0, s4
; VI-NEXT: v_movreld_b32_e32 v0, 5
; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
  %vecins = insertelement <16 x i32> %a, i32 5, i32 %b
  store <16 x i32> %vecins, ptr addrspace(1) %out, align 64
  ret void
}

define amdgpu_kernel void @dynamic_insertelement_v2i16(ptr addrspace(1) %out, <2 x i16> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v2i16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; SI-NEXT: s_mov_b32 s7, 0x100f000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_lshl_b32 s0, s3, 4
; SI-NEXT: s_lshl_b32 s0, 0xffff, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_andn2_b32 s1, s2, s0
; SI-NEXT: s_and_b32 s0, s0, 0x50005
; SI-NEXT: s_or_b32 s0, s0, s1
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: dynamic_insertelement_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_mov_b32 s7, 0x1100f000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_lshl_b32 s0, s3, 4
; VI-NEXT: s_lshl_b32 s0, 0xffff, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_andn2_b32 s1, s2, s0
; VI-NEXT: s_and_b32 s0, s0, 0x50005
; VI-NEXT: s_or_b32 s0, s0, s1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
  %vecins = insertelement <2 x i16> %a, i16 5, i32 %b
%vecins = insertelement <2 x i16> %a, i16 5, i32 %b 1540 store <2 x i16> %vecins, ptr addrspace(1) %out, align 8 1541 ret void 1542} 1543 1544define amdgpu_kernel void @dynamic_insertelement_v3i16(ptr addrspace(1) %out, <3 x i16> %a, i32 %b) nounwind { 1545; SI-LABEL: dynamic_insertelement_v3i16: 1546; SI: ; %bb.0: 1547; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1548; SI-NEXT: s_load_dword s8, s[8:9], 0x4 1549; SI-NEXT: s_mov_b32 s7, 0x100f000 1550; SI-NEXT: s_mov_b32 s6, -1 1551; SI-NEXT: s_waitcnt lgkmcnt(0) 1552; SI-NEXT: s_mov_b32 s4, s0 1553; SI-NEXT: s_lshl_b32 s0, s8, 4 1554; SI-NEXT: s_mov_b32 s5, s1 1555; SI-NEXT: s_lshl_b64 s[0:1], 0xffff, s0 1556; SI-NEXT: s_and_b32 s9, s1, 0x50005 1557; SI-NEXT: s_and_b32 s8, s0, 0x50005 1558; SI-NEXT: s_andn2_b64 s[0:1], s[2:3], s[0:1] 1559; SI-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1] 1560; SI-NEXT: v_mov_b32_e32 v0, s1 1561; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 1562; SI-NEXT: v_mov_b32_e32 v0, s0 1563; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 1564; SI-NEXT: s_endpgm 1565; 1566; VI-LABEL: dynamic_insertelement_v3i16: 1567; VI: ; %bb.0: 1568; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1569; VI-NEXT: s_load_dword s8, s[8:9], 0x10 1570; VI-NEXT: s_mov_b32 s7, 0x1100f000 1571; VI-NEXT: s_mov_b32 s6, -1 1572; VI-NEXT: s_waitcnt lgkmcnt(0) 1573; VI-NEXT: s_mov_b32 s4, s0 1574; VI-NEXT: s_lshl_b32 s0, s8, 4 1575; VI-NEXT: s_mov_b32 s8, 0x50005 1576; VI-NEXT: s_mov_b32 s5, s1 1577; VI-NEXT: s_lshl_b64 s[0:1], 0xffff, s0 1578; VI-NEXT: s_mov_b32 s9, s8 1579; VI-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1] 1580; VI-NEXT: s_and_b64 s[0:1], s[0:1], s[8:9] 1581; VI-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] 1582; VI-NEXT: v_mov_b32_e32 v0, s1 1583; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 1584; VI-NEXT: v_mov_b32_e32 v0, s0 1585; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 1586; VI-NEXT: s_endpgm 1587 %vecins = insertelement <3 x i16> %a, i16 5, i32 %b 1588 store <3 x i16> %vecins, ptr addrspace(1) %out, align 8 1589 ret void 1590} 1591 1592define amdgpu_kernel void @dynamic_insertelement_v2i8(ptr addrspace(1) %out, [8 x i32], <2 x i8> %a, [8 x i32], i32 %b) nounwind { 1593; SI-LABEL: dynamic_insertelement_v2i8: 1594; SI: ; %bb.0: 1595; SI-NEXT: s_load_dword s4, s[8:9], 0x13 1596; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 1597; SI-NEXT: s_load_dword s5, s[8:9], 0xa 1598; SI-NEXT: s_mov_b32 s3, 0x100f000 1599; SI-NEXT: s_mov_b32 s2, -1 1600; SI-NEXT: s_waitcnt lgkmcnt(0) 1601; SI-NEXT: s_lshl_b32 s4, s4, 3 1602; SI-NEXT: s_lshl_b32 s4, 0xff, s4 1603; SI-NEXT: s_andn2_b32 s5, s5, s4 1604; SI-NEXT: s_and_b32 s4, s4, 0x505 1605; SI-NEXT: s_or_b32 s4, s4, s5 1606; SI-NEXT: v_mov_b32_e32 v0, s4 1607; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 1608; SI-NEXT: s_endpgm 1609; 1610; VI-LABEL: dynamic_insertelement_v2i8: 1611; VI: ; %bb.0: 1612; VI-NEXT: s_load_dword s4, s[8:9], 0x4c 1613; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 1614; VI-NEXT: s_load_dword s5, s[8:9], 0x28 1615; VI-NEXT: s_mov_b32 s3, 0x1100f000 1616; VI-NEXT: s_mov_b32 s2, -1 1617; VI-NEXT: s_waitcnt lgkmcnt(0) 1618; VI-NEXT: s_lshl_b32 s4, s4, 3 1619; VI-NEXT: s_lshl_b32 s4, 0xff, s4 1620; VI-NEXT: s_and_b32 s6, s4, 0x505 1621; VI-NEXT: s_xor_b32 s4, s4, 0xffff 1622; VI-NEXT: s_and_b32 s4, s4, s5 1623; VI-NEXT: s_or_b32 s4, s6, s4 1624; VI-NEXT: v_mov_b32_e32 v0, s4 1625; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 1626; VI-NEXT: s_endpgm 1627 %vecins = insertelement <2 x i8> %a, i8 5, i32 %b 1628 store <2 x i8> %vecins, ptr addrspace(1) %out, align 8 
1629 ret void 1630} 1631 1632; FIXME: post legalize i16 and i32 shifts aren't merged because of 1633; isTypeDesirableForOp in SimplifyDemandedBits 1634define amdgpu_kernel void @dynamic_insertelement_v3i8(ptr addrspace(1) %out, [8 x i32], <3 x i8> %a, [8 x i32], i32 %b) nounwind { 1635; SI-LABEL: dynamic_insertelement_v3i8: 1636; SI: ; %bb.0: 1637; SI-NEXT: s_load_dword s4, s[8:9], 0x13 1638; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 1639; SI-NEXT: s_load_dword s5, s[8:9], 0xa 1640; SI-NEXT: s_mov_b32 s3, 0x100f000 1641; SI-NEXT: s_mov_b32 s2, -1 1642; SI-NEXT: s_waitcnt lgkmcnt(0) 1643; SI-NEXT: s_lshl_b32 s4, s4, 3 1644; SI-NEXT: s_lshl_b32 s4, 0xff, s4 1645; SI-NEXT: s_andn2_b32 s5, s5, s4 1646; SI-NEXT: s_and_b32 s4, s4, 0x5050505 1647; SI-NEXT: s_or_b32 s4, s4, s5 1648; SI-NEXT: s_lshr_b32 s5, s4, 16 1649; SI-NEXT: v_mov_b32_e32 v0, s4 1650; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 1651; SI-NEXT: v_mov_b32_e32 v0, s5 1652; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:2 1653; SI-NEXT: s_endpgm 1654; 1655; VI-LABEL: dynamic_insertelement_v3i8: 1656; VI: ; %bb.0: 1657; VI-NEXT: s_load_dword s4, s[8:9], 0x4c 1658; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 1659; VI-NEXT: s_load_dword s5, s[8:9], 0x28 1660; VI-NEXT: s_mov_b32 s3, 0x1100f000 1661; VI-NEXT: s_mov_b32 s2, -1 1662; VI-NEXT: s_waitcnt lgkmcnt(0) 1663; VI-NEXT: s_lshl_b32 s4, s4, 3 1664; VI-NEXT: s_lshl_b32 s4, 0xff, s4 1665; VI-NEXT: s_andn2_b32 s5, s5, s4 1666; VI-NEXT: s_and_b32 s4, s4, 0x5050505 1667; VI-NEXT: s_or_b32 s4, s4, s5 1668; VI-NEXT: s_lshr_b32 s5, s4, 16 1669; VI-NEXT: v_mov_b32_e32 v0, s4 1670; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 1671; VI-NEXT: v_mov_b32_e32 v0, s5 1672; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:2 1673; VI-NEXT: s_endpgm 1674 %vecins = insertelement <3 x i8> %a, i8 5, i32 %b 1675 store <3 x i8> %vecins, ptr addrspace(1) %out, align 4 1676 ret void 1677} 1678 1679define amdgpu_kernel void @dynamic_insertelement_v4i8(ptr addrspace(1) %out, [8 x i32], <4 x i8> %a, [8 x i32], i32 %b) nounwind { 1680; SI-LABEL: dynamic_insertelement_v4i8: 1681; SI: ; %bb.0: 1682; SI-NEXT: s_load_dword s4, s[8:9], 0x13 1683; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 1684; SI-NEXT: s_load_dword s5, s[8:9], 0xa 1685; SI-NEXT: s_mov_b32 s3, 0x100f000 1686; SI-NEXT: s_mov_b32 s2, -1 1687; SI-NEXT: s_waitcnt lgkmcnt(0) 1688; SI-NEXT: s_lshl_b32 s4, s4, 3 1689; SI-NEXT: s_lshl_b32 s4, 0xff, s4 1690; SI-NEXT: s_andn2_b32 s5, s5, s4 1691; SI-NEXT: s_and_b32 s4, s4, 0x5050505 1692; SI-NEXT: s_or_b32 s4, s4, s5 1693; SI-NEXT: v_mov_b32_e32 v0, s4 1694; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1695; SI-NEXT: s_endpgm 1696; 1697; VI-LABEL: dynamic_insertelement_v4i8: 1698; VI: ; %bb.0: 1699; VI-NEXT: s_load_dword s4, s[8:9], 0x4c 1700; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 1701; VI-NEXT: s_load_dword s5, s[8:9], 0x28 1702; VI-NEXT: s_mov_b32 s3, 0x1100f000 1703; VI-NEXT: s_mov_b32 s2, -1 1704; VI-NEXT: s_waitcnt lgkmcnt(0) 1705; VI-NEXT: s_lshl_b32 s4, s4, 3 1706; VI-NEXT: s_lshl_b32 s4, 0xff, s4 1707; VI-NEXT: s_andn2_b32 s5, s5, s4 1708; VI-NEXT: s_and_b32 s4, s4, 0x5050505 1709; VI-NEXT: s_or_b32 s4, s4, s5 1710; VI-NEXT: v_mov_b32_e32 v0, s4 1711; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1712; VI-NEXT: s_endpgm 1713 %vecins = insertelement <4 x i8> %a, i8 5, i32 %b 1714 store <4 x i8> %vecins, ptr addrspace(1) %out, align 4 1715 ret void 1716} 1717 1718define amdgpu_kernel void @s_dynamic_insertelement_v8i8(ptr addrspace(1) %out, ptr addrspace(4) %a.ptr, i32 %b) 
nounwind { 1719; SI-LABEL: s_dynamic_insertelement_v8i8: 1720; SI: ; %bb.0: 1721; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1722; SI-NEXT: s_load_dword s8, s[8:9], 0x4 1723; SI-NEXT: s_mov_b32 s7, 0x100f000 1724; SI-NEXT: s_mov_b32 s6, -1 1725; SI-NEXT: s_waitcnt lgkmcnt(0) 1726; SI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1727; SI-NEXT: s_mov_b32 s4, s0 1728; SI-NEXT: s_lshl_b32 s0, s8, 3 1729; SI-NEXT: s_mov_b32 s5, s1 1730; SI-NEXT: s_lshl_b64 s[0:1], 0xff, s0 1731; SI-NEXT: s_and_b32 s9, s1, 0x5050505 1732; SI-NEXT: s_waitcnt lgkmcnt(0) 1733; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1] 1734; SI-NEXT: s_and_b32 s8, s0, 0x5050505 1735; SI-NEXT: s_or_b64 s[0:1], s[8:9], s[2:3] 1736; SI-NEXT: v_mov_b32_e32 v0, s0 1737; SI-NEXT: v_mov_b32_e32 v1, s1 1738; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1739; SI-NEXT: s_endpgm 1740; 1741; VI-LABEL: s_dynamic_insertelement_v8i8: 1742; VI: ; %bb.0: 1743; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1744; VI-NEXT: s_load_dword s8, s[8:9], 0x10 1745; VI-NEXT: s_mov_b32 s7, 0x1100f000 1746; VI-NEXT: s_mov_b32 s6, -1 1747; VI-NEXT: s_waitcnt lgkmcnt(0) 1748; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1749; VI-NEXT: s_mov_b32 s4, s0 1750; VI-NEXT: s_lshl_b32 s0, s8, 3 1751; VI-NEXT: s_mov_b32 s5, s1 1752; VI-NEXT: s_lshl_b64 s[0:1], 0xff, s0 1753; VI-NEXT: s_and_b32 s9, s1, 0x5050505 1754; VI-NEXT: s_waitcnt lgkmcnt(0) 1755; VI-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1] 1756; VI-NEXT: s_and_b32 s8, s0, 0x5050505 1757; VI-NEXT: s_or_b64 s[0:1], s[8:9], s[2:3] 1758; VI-NEXT: v_mov_b32_e32 v0, s0 1759; VI-NEXT: v_mov_b32_e32 v1, s1 1760; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1761; VI-NEXT: s_endpgm 1762 %a = load <8 x i8>, ptr addrspace(4) %a.ptr, align 4 1763 %vecins = insertelement <8 x i8> %a, i8 5, i32 %b 1764 store <8 x i8> %vecins, ptr addrspace(1) %out, align 8 1765 ret void 1766} 1767 1768define amdgpu_kernel void @dynamic_insertelement_v16i8(ptr addrspace(1) %out, <16 x i8> %a, i32 %b) nounwind { 1769; SI-LABEL: dynamic_insertelement_v16i8: 1770; SI: ; %bb.0: 1771; SI-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x4 1772; SI-NEXT: s_load_dword s10, s[8:9], 0x8 1773; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 1774; SI-NEXT: s_mov_b32 s3, 0x100f000 1775; SI-NEXT: s_mov_b32 s2, -1 1776; SI-NEXT: s_waitcnt lgkmcnt(0) 1777; SI-NEXT: s_lshr_b32 s8, s7, 24 1778; SI-NEXT: s_cmp_lg_u32 s10, 15 1779; SI-NEXT: s_cselect_b32 s8, s8, 5 1780; SI-NEXT: s_lshl_b32 s8, s8, 24 1781; SI-NEXT: s_lshr_b32 s9, s7, 16 1782; SI-NEXT: s_cmp_lg_u32 s10, 14 1783; SI-NEXT: s_cselect_b32 s9, s9, 5 1784; SI-NEXT: s_and_b32 s9, s9, 0xff 1785; SI-NEXT: s_lshl_b32 s9, s9, 16 1786; SI-NEXT: s_or_b32 s8, s8, s9 1787; SI-NEXT: s_lshr_b32 s9, s7, 8 1788; SI-NEXT: s_cmp_lg_u32 s10, 13 1789; SI-NEXT: s_cselect_b32 s9, s9, 5 1790; SI-NEXT: s_lshl_b32 s9, s9, 8 1791; SI-NEXT: s_cmp_lg_u32 s10, 12 1792; SI-NEXT: s_cselect_b32 s7, s7, 5 1793; SI-NEXT: s_and_b32 s7, s7, 0xff 1794; SI-NEXT: s_or_b32 s7, s7, s9 1795; SI-NEXT: s_and_b32 s7, s7, 0xffff 1796; SI-NEXT: s_or_b32 s7, s7, s8 1797; SI-NEXT: s_lshr_b32 s8, s6, 24 1798; SI-NEXT: s_cmp_lg_u32 s10, 11 1799; SI-NEXT: s_cselect_b32 s8, s8, 5 1800; SI-NEXT: s_lshl_b32 s8, s8, 24 1801; SI-NEXT: s_lshr_b32 s9, s6, 16 1802; SI-NEXT: s_cmp_lg_u32 s10, 10 1803; SI-NEXT: s_cselect_b32 s9, s9, 5 1804; SI-NEXT: s_and_b32 s9, s9, 0xff 1805; SI-NEXT: s_lshl_b32 s9, s9, 16 1806; SI-NEXT: s_or_b32 s8, s8, s9 1807; SI-NEXT: s_lshr_b32 s9, s6, 8 1808; SI-NEXT: s_cmp_lg_u32 s10, 9 1809; SI-NEXT: s_cselect_b32 s9, s9, 5 1810; SI-NEXT: 
s_lshl_b32 s9, s9, 8 1811; SI-NEXT: s_cmp_lg_u32 s10, 8 1812; SI-NEXT: s_cselect_b32 s6, s6, 5 1813; SI-NEXT: s_and_b32 s6, s6, 0xff 1814; SI-NEXT: s_or_b32 s6, s6, s9 1815; SI-NEXT: s_and_b32 s6, s6, 0xffff 1816; SI-NEXT: s_or_b32 s6, s6, s8 1817; SI-NEXT: s_lshr_b32 s8, s5, 24 1818; SI-NEXT: s_cmp_lg_u32 s10, 7 1819; SI-NEXT: s_cselect_b32 s8, s8, 5 1820; SI-NEXT: s_lshl_b32 s8, s8, 24 1821; SI-NEXT: s_lshr_b32 s9, s5, 16 1822; SI-NEXT: s_cmp_lg_u32 s10, 6 1823; SI-NEXT: s_cselect_b32 s9, s9, 5 1824; SI-NEXT: s_and_b32 s9, s9, 0xff 1825; SI-NEXT: s_lshl_b32 s9, s9, 16 1826; SI-NEXT: s_or_b32 s8, s8, s9 1827; SI-NEXT: s_lshr_b32 s9, s5, 8 1828; SI-NEXT: s_cmp_lg_u32 s10, 5 1829; SI-NEXT: s_cselect_b32 s9, s9, 5 1830; SI-NEXT: s_lshl_b32 s9, s9, 8 1831; SI-NEXT: s_cmp_lg_u32 s10, 4 1832; SI-NEXT: s_cselect_b32 s5, s5, 5 1833; SI-NEXT: s_and_b32 s5, s5, 0xff 1834; SI-NEXT: s_or_b32 s5, s5, s9 1835; SI-NEXT: s_and_b32 s5, s5, 0xffff 1836; SI-NEXT: s_or_b32 s5, s5, s8 1837; SI-NEXT: s_lshr_b32 s8, s4, 24 1838; SI-NEXT: s_cmp_lg_u32 s10, 3 1839; SI-NEXT: s_cselect_b32 s8, s8, 5 1840; SI-NEXT: s_lshl_b32 s8, s8, 24 1841; SI-NEXT: s_lshr_b32 s9, s4, 16 1842; SI-NEXT: s_cmp_lg_u32 s10, 2 1843; SI-NEXT: s_cselect_b32 s9, s9, 5 1844; SI-NEXT: s_and_b32 s9, s9, 0xff 1845; SI-NEXT: s_lshl_b32 s9, s9, 16 1846; SI-NEXT: s_or_b32 s8, s8, s9 1847; SI-NEXT: s_lshr_b32 s9, s4, 8 1848; SI-NEXT: s_cmp_lg_u32 s10, 1 1849; SI-NEXT: s_cselect_b32 s9, s9, 5 1850; SI-NEXT: s_lshl_b32 s9, s9, 8 1851; SI-NEXT: s_cmp_lg_u32 s10, 0 1852; SI-NEXT: s_cselect_b32 s4, s4, 5 1853; SI-NEXT: s_and_b32 s4, s4, 0xff 1854; SI-NEXT: s_or_b32 s4, s4, s9 1855; SI-NEXT: s_and_b32 s4, s4, 0xffff 1856; SI-NEXT: s_or_b32 s4, s4, s8 1857; SI-NEXT: v_mov_b32_e32 v0, s4 1858; SI-NEXT: v_mov_b32_e32 v1, s5 1859; SI-NEXT: v_mov_b32_e32 v2, s6 1860; SI-NEXT: v_mov_b32_e32 v3, s7 1861; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1862; SI-NEXT: s_endpgm 1863; 1864; VI-LABEL: dynamic_insertelement_v16i8: 1865; VI: ; %bb.0: 1866; VI-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x10 1867; VI-NEXT: s_load_dword s10, s[8:9], 0x20 1868; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 1869; VI-NEXT: s_mov_b32 s3, 0x1100f000 1870; VI-NEXT: s_mov_b32 s2, -1 1871; VI-NEXT: s_waitcnt lgkmcnt(0) 1872; VI-NEXT: s_lshr_b32 s8, s7, 24 1873; VI-NEXT: s_cmp_lg_u32 s10, 15 1874; VI-NEXT: s_cselect_b32 s8, s8, 5 1875; VI-NEXT: s_lshl_b32 s8, s8, 8 1876; VI-NEXT: s_lshr_b32 s9, s7, 16 1877; VI-NEXT: s_cmp_lg_u32 s10, 14 1878; VI-NEXT: s_cselect_b32 s9, s9, 5 1879; VI-NEXT: s_and_b32 s9, s9, 0xff 1880; VI-NEXT: s_or_b32 s8, s9, s8 1881; VI-NEXT: s_lshl_b32 s8, s8, 16 1882; VI-NEXT: s_lshr_b32 s9, s7, 8 1883; VI-NEXT: s_cmp_lg_u32 s10, 13 1884; VI-NEXT: s_cselect_b32 s9, s9, 5 1885; VI-NEXT: s_lshl_b32 s9, s9, 8 1886; VI-NEXT: s_cmp_lg_u32 s10, 12 1887; VI-NEXT: s_cselect_b32 s7, s7, 5 1888; VI-NEXT: s_and_b32 s7, s7, 0xff 1889; VI-NEXT: s_or_b32 s7, s7, s9 1890; VI-NEXT: s_and_b32 s7, s7, 0xffff 1891; VI-NEXT: s_or_b32 s7, s7, s8 1892; VI-NEXT: s_lshr_b32 s8, s6, 24 1893; VI-NEXT: s_cmp_lg_u32 s10, 11 1894; VI-NEXT: s_cselect_b32 s8, s8, 5 1895; VI-NEXT: s_lshl_b32 s8, s8, 8 1896; VI-NEXT: s_lshr_b32 s9, s6, 16 1897; VI-NEXT: s_cmp_lg_u32 s10, 10 1898; VI-NEXT: s_cselect_b32 s9, s9, 5 1899; VI-NEXT: s_and_b32 s9, s9, 0xff 1900; VI-NEXT: s_or_b32 s8, s9, s8 1901; VI-NEXT: s_lshl_b32 s8, s8, 16 1902; VI-NEXT: s_lshr_b32 s9, s6, 8 1903; VI-NEXT: s_cmp_lg_u32 s10, 9 1904; VI-NEXT: s_cselect_b32 s9, s9, 5 1905; VI-NEXT: s_lshl_b32 s9, s9, 8 1906; VI-NEXT: s_cmp_lg_u32 
s10, 8 1907; VI-NEXT: s_cselect_b32 s6, s6, 5 1908; VI-NEXT: s_and_b32 s6, s6, 0xff 1909; VI-NEXT: s_or_b32 s6, s6, s9 1910; VI-NEXT: s_and_b32 s6, s6, 0xffff 1911; VI-NEXT: s_or_b32 s6, s6, s8 1912; VI-NEXT: s_lshr_b32 s8, s5, 24 1913; VI-NEXT: s_cmp_lg_u32 s10, 7 1914; VI-NEXT: s_cselect_b32 s8, s8, 5 1915; VI-NEXT: s_lshl_b32 s8, s8, 8 1916; VI-NEXT: s_lshr_b32 s9, s5, 16 1917; VI-NEXT: s_cmp_lg_u32 s10, 6 1918; VI-NEXT: s_cselect_b32 s9, s9, 5 1919; VI-NEXT: s_and_b32 s9, s9, 0xff 1920; VI-NEXT: s_or_b32 s8, s9, s8 1921; VI-NEXT: s_lshl_b32 s8, s8, 16 1922; VI-NEXT: s_lshr_b32 s9, s5, 8 1923; VI-NEXT: s_cmp_lg_u32 s10, 5 1924; VI-NEXT: s_cselect_b32 s9, s9, 5 1925; VI-NEXT: s_lshl_b32 s9, s9, 8 1926; VI-NEXT: s_cmp_lg_u32 s10, 4 1927; VI-NEXT: s_cselect_b32 s5, s5, 5 1928; VI-NEXT: s_and_b32 s5, s5, 0xff 1929; VI-NEXT: s_or_b32 s5, s5, s9 1930; VI-NEXT: s_and_b32 s5, s5, 0xffff 1931; VI-NEXT: s_or_b32 s5, s5, s8 1932; VI-NEXT: s_lshr_b32 s8, s4, 24 1933; VI-NEXT: s_cmp_lg_u32 s10, 3 1934; VI-NEXT: s_cselect_b32 s8, s8, 5 1935; VI-NEXT: s_lshl_b32 s8, s8, 8 1936; VI-NEXT: s_lshr_b32 s9, s4, 16 1937; VI-NEXT: s_cmp_lg_u32 s10, 2 1938; VI-NEXT: s_cselect_b32 s9, s9, 5 1939; VI-NEXT: s_and_b32 s9, s9, 0xff 1940; VI-NEXT: s_or_b32 s8, s9, s8 1941; VI-NEXT: s_lshl_b32 s8, s8, 16 1942; VI-NEXT: s_lshr_b32 s9, s4, 8 1943; VI-NEXT: s_cmp_lg_u32 s10, 1 1944; VI-NEXT: s_cselect_b32 s9, s9, 5 1945; VI-NEXT: s_lshl_b32 s9, s9, 8 1946; VI-NEXT: s_cmp_lg_u32 s10, 0 1947; VI-NEXT: s_cselect_b32 s4, s4, 5 1948; VI-NEXT: s_and_b32 s4, s4, 0xff 1949; VI-NEXT: s_or_b32 s4, s4, s9 1950; VI-NEXT: s_and_b32 s4, s4, 0xffff 1951; VI-NEXT: s_or_b32 s4, s4, s8 1952; VI-NEXT: v_mov_b32_e32 v0, s4 1953; VI-NEXT: v_mov_b32_e32 v1, s5 1954; VI-NEXT: v_mov_b32_e32 v2, s6 1955; VI-NEXT: v_mov_b32_e32 v3, s7 1956; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1957; VI-NEXT: s_endpgm 1958 %vecins = insertelement <16 x i8> %a, i8 5, i32 %b 1959 store <16 x i8> %vecins, ptr addrspace(1) %out, align 16 1960 ret void 1961} 1962 1963; This test requires handling INSERT_SUBREG in SIFixSGPRCopies. Check that 1964; the compiler doesn't crash. 
1965define amdgpu_kernel void @insert_split_bb(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %a, i32 %b) { 1966; SI-LABEL: insert_split_bb: 1967; SI: ; %bb.0: ; %entry 1968; SI-NEXT: s_load_dword s4, s[8:9], 0x4 1969; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1970; SI-NEXT: s_waitcnt lgkmcnt(0) 1971; SI-NEXT: s_cmp_lg_u32 s4, 0 1972; SI-NEXT: s_cbranch_scc0 .LBB42_4 1973; SI-NEXT: ; %bb.1: ; %else 1974; SI-NEXT: s_load_dword s5, s[2:3], 0x1 1975; SI-NEXT: s_mov_b64 s[6:7], 0 1976; SI-NEXT: s_andn2_b64 vcc, exec, s[6:7] 1977; SI-NEXT: s_waitcnt lgkmcnt(0) 1978; SI-NEXT: s_mov_b64 vcc, vcc 1979; SI-NEXT: s_cbranch_vccnz .LBB42_3 1980; SI-NEXT: .LBB42_2: ; %if 1981; SI-NEXT: s_load_dword s5, s[2:3], 0x0 1982; SI-NEXT: .LBB42_3: ; %endif 1983; SI-NEXT: s_waitcnt lgkmcnt(0) 1984; SI-NEXT: v_mov_b32_e32 v0, s4 1985; SI-NEXT: s_mov_b32 s3, 0x100f000 1986; SI-NEXT: s_mov_b32 s2, -1 1987; SI-NEXT: v_mov_b32_e32 v1, s5 1988; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1989; SI-NEXT: s_endpgm 1990; SI-NEXT: .LBB42_4: 1991; SI-NEXT: s_branch .LBB42_2 1992; 1993; VI-LABEL: insert_split_bb: 1994; VI: ; %bb.0: ; %entry 1995; VI-NEXT: s_load_dword s4, s[8:9], 0x10 1996; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1997; VI-NEXT: s_waitcnt lgkmcnt(0) 1998; VI-NEXT: s_cmp_lg_u32 s4, 0 1999; VI-NEXT: s_cbranch_scc0 .LBB42_4 2000; VI-NEXT: ; %bb.1: ; %else 2001; VI-NEXT: s_load_dword s5, s[2:3], 0x4 2002; VI-NEXT: s_cbranch_execnz .LBB42_3 2003; VI-NEXT: .LBB42_2: ; %if 2004; VI-NEXT: s_waitcnt lgkmcnt(0) 2005; VI-NEXT: s_load_dword s5, s[2:3], 0x0 2006; VI-NEXT: .LBB42_3: ; %endif 2007; VI-NEXT: s_waitcnt lgkmcnt(0) 2008; VI-NEXT: v_mov_b32_e32 v0, s4 2009; VI-NEXT: s_mov_b32 s3, 0x1100f000 2010; VI-NEXT: s_mov_b32 s2, -1 2011; VI-NEXT: v_mov_b32_e32 v1, s5 2012; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2013; VI-NEXT: s_endpgm 2014; VI-NEXT: .LBB42_4: 2015; VI-NEXT: s_branch .LBB42_2 2016entry: 2017 %0 = insertelement <2 x i32> undef, i32 %a, i32 0 2018 %1 = icmp eq i32 %a, 0 2019 br i1 %1, label %if, label %else 2020 2021if: 2022 %2 = load i32, ptr addrspace(1) %in 2023 %3 = insertelement <2 x i32> %0, i32 %2, i32 1 2024 br label %endif 2025 2026else: 2027 %4 = getelementptr i32, ptr addrspace(1) %in, i32 1 2028 %5 = load i32, ptr addrspace(1) %4 2029 %6 = insertelement <2 x i32> %0, i32 %5, i32 1 2030 br label %endif 2031 2032endif: 2033 %7 = phi <2 x i32> [%3, %if], [%6, %else] 2034 store <2 x i32> %7, ptr addrspace(1) %out 2035 ret void 2036} 2037 2038define amdgpu_kernel void @dynamic_insertelement_v2f64(ptr addrspace(1) %out, [8 x i32], <2 x double> %a, [8 x i32], i32 %b) nounwind { 2039; SI-LABEL: dynamic_insertelement_v2f64: 2040; SI: ; %bb.0: 2041; SI-NEXT: s_load_dword s10, s[8:9], 0x18 2042; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0xc 2043; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 2044; SI-NEXT: s_mov_b32 s7, 0x100f000 2045; SI-NEXT: s_mov_b32 s6, -1 2046; SI-NEXT: s_waitcnt lgkmcnt(0) 2047; SI-NEXT: s_cmp_eq_u32 s10, 1 2048; SI-NEXT: s_cselect_b32 s3, 0x40200000, s3 2049; SI-NEXT: s_cselect_b32 s2, 0, s2 2050; SI-NEXT: s_cmp_eq_u32 s10, 0 2051; SI-NEXT: s_cselect_b32 s1, 0x40200000, s1 2052; SI-NEXT: s_cselect_b32 s0, 0, s0 2053; SI-NEXT: v_mov_b32_e32 v0, s0 2054; SI-NEXT: v_mov_b32_e32 v1, s1 2055; SI-NEXT: v_mov_b32_e32 v2, s2 2056; SI-NEXT: v_mov_b32_e32 v3, s3 2057; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 2058; SI-NEXT: s_endpgm 2059; 2060; VI-LABEL: dynamic_insertelement_v2f64: 2061; VI: ; %bb.0: 2062; VI-NEXT: s_load_dword s10, s[8:9], 0x60 2063; 
VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x30 2064; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 2065; VI-NEXT: s_mov_b32 s7, 0x1100f000 2066; VI-NEXT: s_mov_b32 s6, -1 2067; VI-NEXT: s_waitcnt lgkmcnt(0) 2068; VI-NEXT: s_cmp_eq_u32 s10, 1 2069; VI-NEXT: s_cselect_b32 s3, 0x40200000, s3 2070; VI-NEXT: s_cselect_b32 s2, 0, s2 2071; VI-NEXT: s_cmp_eq_u32 s10, 0 2072; VI-NEXT: s_cselect_b32 s1, 0x40200000, s1 2073; VI-NEXT: s_cselect_b32 s0, 0, s0 2074; VI-NEXT: v_mov_b32_e32 v0, s0 2075; VI-NEXT: v_mov_b32_e32 v1, s1 2076; VI-NEXT: v_mov_b32_e32 v2, s2 2077; VI-NEXT: v_mov_b32_e32 v3, s3 2078; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 2079; VI-NEXT: s_endpgm 2080 %vecins = insertelement <2 x double> %a, double 8.0, i32 %b 2081 store <2 x double> %vecins, ptr addrspace(1) %out, align 16 2082 ret void 2083} 2084 2085define amdgpu_kernel void @dynamic_insertelement_v2i64(ptr addrspace(1) %out, <2 x i64> %a, i32 %b) nounwind { 2086; SI-LABEL: dynamic_insertelement_v2i64: 2087; SI: ; %bb.0: 2088; SI-NEXT: s_load_dword s10, s[8:9], 0x8 2089; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4 2090; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 2091; SI-NEXT: s_mov_b32 s7, 0x100f000 2092; SI-NEXT: s_mov_b32 s6, -1 2093; SI-NEXT: s_waitcnt lgkmcnt(0) 2094; SI-NEXT: s_cmp_eq_u32 s10, 1 2095; SI-NEXT: s_cselect_b32 s3, 0, s3 2096; SI-NEXT: s_cselect_b32 s2, 5, s2 2097; SI-NEXT: s_cmp_eq_u32 s10, 0 2098; SI-NEXT: s_cselect_b32 s1, 0, s1 2099; SI-NEXT: s_cselect_b32 s0, 5, s0 2100; SI-NEXT: v_mov_b32_e32 v0, s0 2101; SI-NEXT: v_mov_b32_e32 v1, s1 2102; SI-NEXT: v_mov_b32_e32 v2, s2 2103; SI-NEXT: v_mov_b32_e32 v3, s3 2104; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 2105; SI-NEXT: s_endpgm 2106; 2107; VI-LABEL: dynamic_insertelement_v2i64: 2108; VI: ; %bb.0: 2109; VI-NEXT: s_load_dword s10, s[8:9], 0x20 2110; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10 2111; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 2112; VI-NEXT: s_mov_b32 s7, 0x1100f000 2113; VI-NEXT: s_mov_b32 s6, -1 2114; VI-NEXT: s_waitcnt lgkmcnt(0) 2115; VI-NEXT: s_cmp_eq_u32 s10, 1 2116; VI-NEXT: s_cselect_b32 s3, 0, s3 2117; VI-NEXT: s_cselect_b32 s2, 5, s2 2118; VI-NEXT: s_cmp_eq_u32 s10, 0 2119; VI-NEXT: s_cselect_b32 s1, 0, s1 2120; VI-NEXT: s_cselect_b32 s0, 5, s0 2121; VI-NEXT: v_mov_b32_e32 v0, s0 2122; VI-NEXT: v_mov_b32_e32 v1, s1 2123; VI-NEXT: v_mov_b32_e32 v2, s2 2124; VI-NEXT: v_mov_b32_e32 v3, s3 2125; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 2126; VI-NEXT: s_endpgm 2127 %vecins = insertelement <2 x i64> %a, i64 5, i32 %b 2128 store <2 x i64> %vecins, ptr addrspace(1) %out, align 8 2129 ret void 2130} 2131 2132define amdgpu_kernel void @dynamic_insertelement_v3i64(ptr addrspace(1) %out, <3 x i64> %a, i32 %b) nounwind { 2133; SI-LABEL: dynamic_insertelement_v3i64: 2134; SI: ; %bb.0: 2135; SI-NEXT: s_load_dword s10, s[8:9], 0x10 2136; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 2137; SI-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x8 2138; SI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0xc 2139; SI-NEXT: s_mov_b32 s3, 0x100f000 2140; SI-NEXT: s_waitcnt lgkmcnt(0) 2141; SI-NEXT: s_cmp_eq_u32 s10, 1 2142; SI-NEXT: s_mov_b32 s2, -1 2143; SI-NEXT: s_cselect_b32 s7, 0, s7 2144; SI-NEXT: s_cselect_b32 s6, 5, s6 2145; SI-NEXT: s_cmp_eq_u32 s10, 0 2146; SI-NEXT: s_cselect_b32 s5, 0, s5 2147; SI-NEXT: s_cselect_b32 s4, 5, s4 2148; SI-NEXT: s_cmp_eq_u32 s10, 2 2149; SI-NEXT: s_cselect_b32 s9, 0, s9 2150; SI-NEXT: s_cselect_b32 s8, 5, s8 2151; SI-NEXT: v_mov_b32_e32 v0, s8 2152; SI-NEXT: v_mov_b32_e32 v1, s9 2153; SI-NEXT: 
buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:16 2154; SI-NEXT: v_mov_b32_e32 v0, s4 2155; SI-NEXT: v_mov_b32_e32 v1, s5 2156; SI-NEXT: v_mov_b32_e32 v2, s6 2157; SI-NEXT: v_mov_b32_e32 v3, s7 2158; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 2159; SI-NEXT: s_endpgm 2160; 2161; VI-LABEL: dynamic_insertelement_v3i64: 2162; VI: ; %bb.0: 2163; VI-NEXT: s_load_dword s10, s[8:9], 0x40 2164; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 2165; VI-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x20 2166; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x30 2167; VI-NEXT: s_mov_b32 s3, 0x1100f000 2168; VI-NEXT: s_waitcnt lgkmcnt(0) 2169; VI-NEXT: s_cmp_eq_u32 s10, 1 2170; VI-NEXT: s_mov_b32 s2, -1 2171; VI-NEXT: s_cselect_b32 s7, 0, s7 2172; VI-NEXT: s_cselect_b32 s6, 5, s6 2173; VI-NEXT: s_cmp_eq_u32 s10, 0 2174; VI-NEXT: s_cselect_b32 s5, 0, s5 2175; VI-NEXT: s_cselect_b32 s4, 5, s4 2176; VI-NEXT: s_cmp_eq_u32 s10, 2 2177; VI-NEXT: s_cselect_b32 s9, 0, s9 2178; VI-NEXT: s_cselect_b32 s8, 5, s8 2179; VI-NEXT: v_mov_b32_e32 v0, s8 2180; VI-NEXT: v_mov_b32_e32 v1, s9 2181; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:16 2182; VI-NEXT: v_mov_b32_e32 v0, s4 2183; VI-NEXT: v_mov_b32_e32 v1, s5 2184; VI-NEXT: v_mov_b32_e32 v2, s6 2185; VI-NEXT: v_mov_b32_e32 v3, s7 2186; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 2187; VI-NEXT: s_endpgm 2188 %vecins = insertelement <3 x i64> %a, i64 5, i32 %b 2189 store <3 x i64> %vecins, ptr addrspace(1) %out, align 32 2190 ret void 2191} 2192 2193define amdgpu_kernel void @dynamic_insertelement_v4f64(ptr addrspace(1) %out, <4 x double> %a, i32 %b) nounwind { 2194; SI-LABEL: dynamic_insertelement_v4f64: 2195; SI: ; %bb.0: 2196; SI-NEXT: s_load_dword s12, s[8:9], 0x10 2197; SI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x8 2198; SI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 2199; SI-NEXT: s_mov_b32 s11, 0x100f000 2200; SI-NEXT: s_mov_b32 s10, -1 2201; SI-NEXT: s_waitcnt lgkmcnt(0) 2202; SI-NEXT: s_cmp_eq_u32 s12, 1 2203; SI-NEXT: s_cselect_b32 s3, 0x40200000, s3 2204; SI-NEXT: s_cselect_b32 s2, 0, s2 2205; SI-NEXT: s_cmp_eq_u32 s12, 0 2206; SI-NEXT: s_cselect_b32 s1, 0x40200000, s1 2207; SI-NEXT: s_cselect_b32 s0, 0, s0 2208; SI-NEXT: s_cmp_eq_u32 s12, 3 2209; SI-NEXT: s_cselect_b32 s7, 0x40200000, s7 2210; SI-NEXT: s_cselect_b32 s6, 0, s6 2211; SI-NEXT: s_cmp_eq_u32 s12, 2 2212; SI-NEXT: s_cselect_b32 s5, 0x40200000, s5 2213; SI-NEXT: s_cselect_b32 s4, 0, s4 2214; SI-NEXT: v_mov_b32_e32 v0, s4 2215; SI-NEXT: v_mov_b32_e32 v1, s5 2216; SI-NEXT: v_mov_b32_e32 v2, s6 2217; SI-NEXT: v_mov_b32_e32 v3, s7 2218; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:16 2219; SI-NEXT: s_nop 0 2220; SI-NEXT: v_mov_b32_e32 v0, s0 2221; SI-NEXT: v_mov_b32_e32 v1, s1 2222; SI-NEXT: v_mov_b32_e32 v2, s2 2223; SI-NEXT: v_mov_b32_e32 v3, s3 2224; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 2225; SI-NEXT: s_endpgm 2226; 2227; VI-LABEL: dynamic_insertelement_v4f64: 2228; VI: ; %bb.0: 2229; VI-NEXT: s_load_dword s12, s[8:9], 0x40 2230; VI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x20 2231; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 2232; VI-NEXT: s_mov_b32 s11, 0x1100f000 2233; VI-NEXT: s_mov_b32 s10, -1 2234; VI-NEXT: s_waitcnt lgkmcnt(0) 2235; VI-NEXT: s_cmp_eq_u32 s12, 1 2236; VI-NEXT: s_cselect_b32 s3, 0x40200000, s3 2237; VI-NEXT: s_cselect_b32 s2, 0, s2 2238; VI-NEXT: s_cmp_eq_u32 s12, 0 2239; VI-NEXT: s_cselect_b32 s1, 0x40200000, s1 2240; VI-NEXT: s_cselect_b32 s0, 0, s0 2241; VI-NEXT: s_cmp_eq_u32 s12, 3 2242; VI-NEXT: s_cselect_b32 s7, 0x40200000, s7 
2243; VI-NEXT: s_cselect_b32 s6, 0, s6 2244; VI-NEXT: s_cmp_eq_u32 s12, 2 2245; VI-NEXT: s_cselect_b32 s5, 0x40200000, s5 2246; VI-NEXT: s_cselect_b32 s4, 0, s4 2247; VI-NEXT: v_mov_b32_e32 v0, s4 2248; VI-NEXT: v_mov_b32_e32 v1, s5 2249; VI-NEXT: v_mov_b32_e32 v2, s6 2250; VI-NEXT: v_mov_b32_e32 v3, s7 2251; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:16 2252; VI-NEXT: s_nop 0 2253; VI-NEXT: v_mov_b32_e32 v0, s0 2254; VI-NEXT: v_mov_b32_e32 v1, s1 2255; VI-NEXT: v_mov_b32_e32 v2, s2 2256; VI-NEXT: v_mov_b32_e32 v3, s3 2257; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 2258; VI-NEXT: s_endpgm 2259 %vecins = insertelement <4 x double> %a, double 8.0, i32 %b 2260 store <4 x double> %vecins, ptr addrspace(1) %out, align 16 2261 ret void 2262} 2263 2264define amdgpu_kernel void @dynamic_insertelement_v8f64(ptr addrspace(1) %out, <8 x double> %a, i32 %b) #0 { 2265; SI-LABEL: dynamic_insertelement_v8f64: 2266; SI: ; %bb.0: 2267; SI-NEXT: s_load_dword s4, s[8:9], 0x20 2268; SI-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x10 2269; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 2270; SI-NEXT: v_mov_b32_e32 v16, 0x40200000 2271; SI-NEXT: s_mov_b32 s3, 0x100f000 2272; SI-NEXT: s_waitcnt lgkmcnt(0) 2273; SI-NEXT: s_lshl_b32 s4, s4, 1 2274; SI-NEXT: v_mov_b32_e32 v0, s12 2275; SI-NEXT: v_mov_b32_e32 v1, s13 2276; SI-NEXT: v_mov_b32_e32 v2, s14 2277; SI-NEXT: v_mov_b32_e32 v3, s15 2278; SI-NEXT: v_mov_b32_e32 v4, s16 2279; SI-NEXT: v_mov_b32_e32 v5, s17 2280; SI-NEXT: v_mov_b32_e32 v6, s18 2281; SI-NEXT: v_mov_b32_e32 v7, s19 2282; SI-NEXT: v_mov_b32_e32 v8, s20 2283; SI-NEXT: v_mov_b32_e32 v9, s21 2284; SI-NEXT: v_mov_b32_e32 v10, s22 2285; SI-NEXT: v_mov_b32_e32 v11, s23 2286; SI-NEXT: v_mov_b32_e32 v12, s24 2287; SI-NEXT: v_mov_b32_e32 v13, s25 2288; SI-NEXT: v_mov_b32_e32 v14, s26 2289; SI-NEXT: v_mov_b32_e32 v15, s27 2290; SI-NEXT: s_mov_b32 m0, s4 2291; SI-NEXT: v_movreld_b32_e32 v0, 0 2292; SI-NEXT: s_mov_b32 s2, -1 2293; SI-NEXT: v_movreld_b32_e32 v1, v16 2294; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 2295; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 2296; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 2297; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 2298; SI-NEXT: s_endpgm 2299; 2300; VI-LABEL: dynamic_insertelement_v8f64: 2301; VI: ; %bb.0: 2302; VI-NEXT: s_load_dword s4, s[8:9], 0x80 2303; VI-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x40 2304; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 2305; VI-NEXT: v_mov_b32_e32 v16, 0x40200000 2306; VI-NEXT: s_mov_b32 s3, 0x1100f000 2307; VI-NEXT: s_waitcnt lgkmcnt(0) 2308; VI-NEXT: s_lshl_b32 s4, s4, 1 2309; VI-NEXT: v_mov_b32_e32 v0, s12 2310; VI-NEXT: v_mov_b32_e32 v1, s13 2311; VI-NEXT: v_mov_b32_e32 v2, s14 2312; VI-NEXT: v_mov_b32_e32 v3, s15 2313; VI-NEXT: v_mov_b32_e32 v4, s16 2314; VI-NEXT: v_mov_b32_e32 v5, s17 2315; VI-NEXT: v_mov_b32_e32 v6, s18 2316; VI-NEXT: v_mov_b32_e32 v7, s19 2317; VI-NEXT: v_mov_b32_e32 v8, s20 2318; VI-NEXT: v_mov_b32_e32 v9, s21 2319; VI-NEXT: v_mov_b32_e32 v10, s22 2320; VI-NEXT: v_mov_b32_e32 v11, s23 2321; VI-NEXT: v_mov_b32_e32 v12, s24 2322; VI-NEXT: v_mov_b32_e32 v13, s25 2323; VI-NEXT: v_mov_b32_e32 v14, s26 2324; VI-NEXT: v_mov_b32_e32 v15, s27 2325; VI-NEXT: s_mov_b32 m0, s4 2326; VI-NEXT: v_movreld_b32_e32 v0, 0 2327; VI-NEXT: s_mov_b32 s2, -1 2328; VI-NEXT: v_movreld_b32_e32 v1, v16 2329; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 2330; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 
offset:32 2331; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 2332; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 2333; VI-NEXT: s_endpgm 2334 %vecins = insertelement <8 x double> %a, double 8.0, i32 %b 2335 store <8 x double> %vecins, ptr addrspace(1) %out, align 16 2336 ret void 2337} 2338 2339declare <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 2340 2341attributes #0 = { nounwind } 2342attributes #1 = { nounwind readnone } 2343