; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck -check-prefix=GFX906 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s

; Each test below builds a two-lane 16-bit vector with a pair of insertelement
; instructions and bitcasts it to a 32-bit value. Tests come in pairs:
;   uniform_*   - amdgpu_kernel entry points; the expected output does the
;                 packing with scalar (s_*) instructions.
;   divergent_* - ordinary functions taking their operands as arguments; the
;                 expected output does the packing with vector (v_*)
;                 instructions.
; The assertion lines are generated: regenerate them with the script named on
; the first line instead of editing them by hand.

; Lane 0 = constant 0, lane 1 = kernel argument %a; result stored as i32.
; Expected on every target: one scalar shift left by 16.
define amdgpu_kernel void @uniform_vec_0_i16(ptr addrspace(1) %out, i16 %a) {
; GCN-LABEL: uniform_vec_0_i16:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dword s2, s[4:5], 0xb
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_lshl_b32 s4, s2, 16
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    v_mov_b32_e32 v0, s4
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT:    s_endpgm
;
; GFX9-LABEL: uniform_vec_0_i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x2c
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_lshl_b32 s2, s2, 16
; GFX9-NEXT:    v_mov_b32_e32 v1, s2
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX906-LABEL: uniform_vec_0_i16:
; GFX906:       ; %bb.0:
; GFX906-NEXT:    s_load_dword s2, s[4:5], 0x2c
; GFX906-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX906-NEXT:    v_mov_b32_e32 v0, 0
; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
; GFX906-NEXT:    s_lshl_b32 s2, s2, 16
; GFX906-NEXT:    v_mov_b32_e32 v1, s2
; GFX906-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX906-NEXT:    s_endpgm
;
; GFX11-LABEL: uniform_vec_0_i16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_lshl_b32 s2, s2, 16
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %tmp = insertelement <2 x i16> undef, i16 0, i32 0
  %vec = insertelement <2 x i16> %tmp, i16 %a, i32 1
  %val = bitcast <2 x i16> %vec to i32
  store i32 %val, ptr addrspace(1) %out, align 4
  ret void
}

; Lane 0 = constant 0, lane 1 = function argument %a; result returned as i32.
; Expected on every target: one vector shift left by 16.
define i32 @divergent_vec_0_i16(i16 %a) {
; GCN-LABEL: divergent_vec_0_i16:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: divergent_vec_0_i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX906-LABEL: divergent_vec_0_i16:
; GFX906:       ; %bb.0:
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: divergent_vec_0_i16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %tmp = insertelement <2 x i16> undef, i16 0, i32 0
  %vec = insertelement <2 x i16> %tmp, i16 %a, i32 1
  %val = bitcast <2 x i16> %vec to i32
  ret i32 %val
}

; Lane 0 = kernel argument %a, lane 1 = constant 0; result stored as i32.
; Expected on every target: one scalar AND with 0xffff.
define amdgpu_kernel void @uniform_vec_i16_0(ptr addrspace(1) %out, i16 %a) {
; GCN-LABEL: uniform_vec_i16_0:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dword s2, s[4:5], 0xb
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_and_b32 s4, s2, 0xffff
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    v_mov_b32_e32 v0, s4
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT:    s_endpgm
;
; GFX9-LABEL: uniform_vec_i16_0:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x2c
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_and_b32 s2, 0xffff, s2
; GFX9-NEXT:    v_mov_b32_e32 v1, s2
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX906-LABEL: uniform_vec_i16_0:
; GFX906:       ; %bb.0:
; GFX906-NEXT:    s_load_dword s2, s[4:5], 0x2c
; GFX906-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX906-NEXT:    v_mov_b32_e32 v0, 0
; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
; GFX906-NEXT:    s_and_b32 s2, 0xffff, s2
; GFX906-NEXT:    v_mov_b32_e32 v1, s2
; GFX906-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX906-NEXT:    s_endpgm
;
; GFX11-LABEL: uniform_vec_i16_0:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_and_b32 s2, 0xffff, s2
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %tmp = insertelement <2 x i16> undef, i16 %a, i32 0
  %vec = insertelement <2 x i16> %tmp, i16 0, i32 1
  %val = bitcast <2 x i16> %vec to i32
  store i32 %val, ptr addrspace(1) %out, align 4
  ret void
}

; Lane 0 = function argument %a, lane 1 = constant 0; result returned as i32.
; Expected on every target: one vector AND with 0xffff.
define i32 @divergent_vec_i16_0(i16 %a) {
; GCN-LABEL: divergent_vec_i16_0:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: divergent_vec_i16_0:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX906-LABEL: divergent_vec_i16_0:
; GFX906:       ; %bb.0:
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: divergent_vec_i16_0:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %tmp = insertelement <2 x i16> undef, i16 %a, i32 0
  %vec = insertelement <2 x i16> %tmp, i16 0, i32 1
  %val = bitcast <2 x i16> %vec to i32
  ret i32 %val
}

; Half-precision variant: lane 0 = kernel argument %a, lane 1 = 0.0; the
; <2 x half> is bitcast to float and stored.
; Expected on every target: one scalar AND with 0xffff.
define amdgpu_kernel void @uniform_vec_f16_0(ptr addrspace(1) %out, half %a) {
; GCN-LABEL: uniform_vec_f16_0:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dword s2, s[4:5], 0xb
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_and_b32 s4, s2, 0xffff
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    v_mov_b32_e32 v0, s4
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT:    s_endpgm
;
; GFX9-LABEL: uniform_vec_f16_0:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x2c
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_and_b32 s2, 0xffff, s2
; GFX9-NEXT:    v_mov_b32_e32 v1, s2
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX906-LABEL: uniform_vec_f16_0:
; GFX906:       ; %bb.0:
; GFX906-NEXT:    s_load_dword s2, s[4:5], 0x2c
; GFX906-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX906-NEXT:    v_mov_b32_e32 v0, 0
; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
; GFX906-NEXT:    s_and_b32 s2, 0xffff, s2
; GFX906-NEXT:    v_mov_b32_e32 v1, s2
; GFX906-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX906-NEXT:    s_endpgm
;
; GFX11-LABEL: uniform_vec_f16_0:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_and_b32 s2, 0xffff, s2
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %tmp = insertelement <2 x half> undef, half %a, i32 0
  %vec = insertelement <2 x half> %tmp, half 0.0, i32 1
  %val = bitcast <2 x half> %vec to float
  store float %val, ptr addrspace(1) %out, align 4
  ret void
}

; Half-precision variant with a function argument; result returned as float.
; The default-target run expects only a v_cvt_f16_f32 on the argument; the
; gfx900/gfx906/gfx1100 runs expect a vector AND with 0xffff.
define float @divergent_vec_f16_0(half %a) {
; GCN-LABEL: divergent_vec_f16_0:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: divergent_vec_f16_0:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX906-LABEL: divergent_vec_f16_0:
; GFX906:       ; %bb.0:
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: divergent_vec_f16_0:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %tmp = insertelement <2 x half> undef, half %a, i32 0
  %vec = insertelement <2 x half> %tmp, half 0.0, i32 1
  %val = bitcast <2 x half> %vec to float
  ret float %val
}

; Packs the low 16 bits of two volatile-loaded i32 values (lane 0 from %in0,
; lane 1 from %in1) and hands the result to inline asm through an "s"
; constraint. The gfx900/gfx906/gfx1100 runs expect s_pack_ll_b32_b16; the
; default-target run expects an and / shift / or sequence.
; NOTE(review): the asm call references attribute group #0, whose definition
; is not visible in this part of the file - confirm it exists or is intended
; to be empty.
define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspace(4) %in1) {
; GCN-LABEL: uniform_vec_i16_LL:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_load_dword s0, s[0:1], 0x0
; GCN-NEXT:    s_load_dword s1, s[2:3], 0x0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_and_b32 s0, s0, 0xffff
; GCN-NEXT:    s_lshl_b32 s1, s1, 16
; GCN-NEXT:    s_or_b32 s0, s0, s1
; GCN-NEXT:    ;;#ASMSTART
; GCN-NEXT:    ; use s0
; GCN-NEXT:    ;;#ASMEND
; GCN-NEXT:    s_endpgm
;
; GFX9-LABEL: uniform_vec_i16_LL:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0
; GFX9-NEXT:    s_load_dword s5, s[2:3], 0x0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s4, s5
; GFX9-NEXT:    ;;#ASMSTART
; GFX9-NEXT:    ; use s0
; GFX9-NEXT:    ;;#ASMEND
; GFX9-NEXT:    s_endpgm
;
; GFX906-LABEL: uniform_vec_i16_LL:
; GFX906:       ; %bb.0:
; GFX906-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
; GFX906-NEXT:    s_load_dword s4, s[0:1], 0x0
; GFX906-NEXT:    s_load_dword s5, s[2:3], 0x0
; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
; GFX906-NEXT:    s_pack_ll_b32_b16 s0, s4, s5
; GFX906-NEXT:    ;;#ASMSTART
; GFX906-NEXT:    ; use s0
; GFX906-NEXT:    ;;#ASMEND
; GFX906-NEXT:    s_endpgm
;
; GFX11-LABEL: uniform_vec_i16_LL:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT:    s_load_b32 s1, s[2:3], 0x0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
; GFX11-NEXT:    ;;#ASMSTART
; GFX11-NEXT:    ; use s0
; GFX11-NEXT:    ;;#ASMEND
; GFX11-NEXT:    s_endpgm
  %val0 = load volatile i32, ptr addrspace(4) %in0
  %val1 = load volatile i32, ptr addrspace(4) %in1
  %lo = trunc i32 %val0 to i16
  %hi = trunc i32 %val1 to i16
  %vec.0 = insertelement <2 x i16> undef, i16 %lo, i32 0
  %vec.1 = insertelement <2 x i16> %vec.0, i16 %hi, i32 1
  %vec.i32 = bitcast <2 x i16> %vec.1 to i32
  call void asm sideeffect "; use $0", "s"(i32 %vec.i32) #0
  ret void
}

; Lane 0 = %a, lane 1 = %b, both function arguments; result returned as i32.
; The gfx900/gfx906/gfx1100 runs expect a single v_perm_b32 with selector
; 0x5040100; the default-target run expects shift / and / or.
define i32 @divergent_vec_i16_LL(i16 %a, i16 %b) {
; GCN-LABEL: divergent_vec_i16_LL:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: divergent_vec_i16_LL:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX906-LABEL: divergent_vec_i16_LL:
; GFX906:       ; %bb.0:
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    s_mov_b32 s4, 0x5040100
; GFX906-NEXT:    v_perm_b32 v0, v1, v0, s4
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: divergent_vec_i16_LL:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %tmp = insertelement <2 x i16> undef, i16 %a, i32 0
  %vec = insertelement <2 x i16> %tmp, i16 %b, i32 1
  %val = bitcast <2 x i16> %vec to i32
  ret i32 %val
}

; Lane 0 = kernel argument %a, lane 1 = upper 16 bits of %b (lshr by 16, then
; trunc); result stored as i32. The gfx900/gfx906/gfx1100 runs expect
; s_pack_lh_b32_b16; the default-target run expects two ANDs and an OR.
define amdgpu_kernel void @uniform_vec_i16_LH(ptr addrspace(1) %out, i16 %a, i32 %b) {
; GCN-LABEL: uniform_vec_i16_LH:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN-NEXT:    s_mov_b32 s7, 0xf000
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_and_b32 s3, s3, 0xffff0000
; GCN-NEXT:    s_and_b32 s2, s2, 0xffff
; GCN-NEXT:    s_or_b32 s2, s2, s3
; GCN-NEXT:    s_mov_b32 s6, -1
; GCN-NEXT:    s_mov_b32 s4, s0
; GCN-NEXT:    s_mov_b32 s5, s1
; GCN-NEXT:    v_mov_b32_e32 v0, s2
; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT:    s_endpgm
;
; GFX9-LABEL: uniform_vec_i16_LH:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_pack_lh_b32_b16 s2, s2, s3
; GFX9-NEXT:    v_mov_b32_e32 v1, s2
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX906-LABEL: uniform_vec_i16_LH:
; GFX906:       ; %bb.0:
; GFX906-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX906-NEXT:    v_mov_b32_e32 v0, 0
; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
; GFX906-NEXT:    s_pack_lh_b32_b16 s2, s2, s3
; GFX906-NEXT:    v_mov_b32_e32 v1, s2
; GFX906-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX906-NEXT:    s_endpgm
;
; GFX11-LABEL: uniform_vec_i16_LH:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_pack_lh_b32_b16 s2, s2, s3
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %shift = lshr i32 %b, 16
  %tr = trunc i32 %shift to i16
  %tmp = insertelement <2 x i16> undef, i16 %a, i32 0
  %vec = insertelement <2 x i16> %tmp, i16 %tr, i32 1
  %val = bitcast <2 x i16> %vec to i32
  store i32 %val, ptr addrspace(1) %out, align 4
  ret void
}

; Lane 0 = function argument %a, lane 1 = upper 16 bits of %b; result returned
; as i32. Expected on every target: one v_bfi_b32 with mask 0xffff (the mask is
; an inline literal only in the gfx1100 run).
define i32 @divergent_vec_i16_LH(i16 %a, i32 %b) {
; GCN-LABEL: divergent_vec_i16_LH:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    s_mov_b32 s4, 0xffff
; GCN-NEXT:    v_bfi_b32 v0, s4, v0, v1
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: divergent_vec_i16_LH:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s4, 0xffff
; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX906-LABEL: divergent_vec_i16_LH:
; GFX906:       ; %bb.0:
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    s_mov_b32 s4, 0xffff
; GFX906-NEXT:    v_bfi_b32 v0, s4, v0, v1
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: divergent_vec_i16_LH:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_bfi_b32 v0, 0xffff, v0, v1
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %shift = lshr i32 %b, 16
  %tr = trunc i32 %shift to i16
  %tmp = insertelement <2 x i16> undef, i16 %a, i32 0
  %vec = insertelement <2 x i16> %tmp, i16 %tr, i32 1
  %val = bitcast <2 x i16> %vec to i32
  ret i32 %val
}

; Lane 0 = upper 16 bits of kernel argument %a, lane 1 = upper 16 bits of %b
; (each via lshr by 16, then trunc); result stored as i32. The
; gfx900/gfx906/gfx1100 runs expect s_pack_hh_b32_b16; the default-target run
; expects a scalar shift followed by v_alignbit_b32.
define amdgpu_kernel void @uniform_vec_i16_HH(ptr addrspace(1) %out, i32 %a, i32 %b) {
; GCN-LABEL: uniform_vec_i16_HH:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN-NEXT:    s_mov_b32 s7, 0xf000
; GCN-NEXT:    s_mov_b32 s6, -1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_mov_b32 s4, s0
; GCN-NEXT:    s_mov_b32 s5, s1
; GCN-NEXT:    s_lshr_b32 s0, s3, 16
; GCN-NEXT:    v_mov_b32_e32 v0, s2
; GCN-NEXT:    v_alignbit_b32 v0, s0, v0, 16
; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT:    s_endpgm
;
; GFX9-LABEL: uniform_vec_i16_HH:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_pack_hh_b32_b16 s2, s2, s3
; GFX9-NEXT:    v_mov_b32_e32 v1, s2
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX906-LABEL: uniform_vec_i16_HH:
; GFX906:       ; %bb.0:
; GFX906-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX906-NEXT:    v_mov_b32_e32 v0, 0
; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
; GFX906-NEXT:    s_pack_hh_b32_b16 s2, s2, s3
; GFX906-NEXT:    v_mov_b32_e32 v1, s2
; GFX906-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX906-NEXT:    s_endpgm
;
; GFX11-LABEL: uniform_vec_i16_HH:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_pack_hh_b32_b16 s2, s2, s3
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %shift_a = lshr i32 %a, 16
  %tr_a = trunc i32 %shift_a to i16
  %shift_b = lshr i32 %b, 16
  %tr_b = trunc i32 %shift_b to i16
  %tmp = insertelement <2 x i16> undef, i16 %tr_a, i32 0
  %vec = insertelement <2 x i16> %tmp, i16 %tr_b, i32 1
  %val = bitcast <2 x i16> %vec to i32
  store i32 %val, ptr addrspace(1) %out, align 4
  ret void
}

; Same packing of the two upper halves, with function arguments; result
; returned as i32. The gfx900/gfx906/gfx1100 runs expect a single v_perm_b32
; with selector 0x7060302; the default-target run expects a vector shift
; followed by v_alignbit_b32.
define i32 @divergent_vec_i16_HH(i32 %a, i32 %b) {
; GCN-LABEL: divergent_vec_i16_HH:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
; GCN-NEXT:    v_alignbit_b32 v0, v1, v0, 16
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: divergent_vec_i16_HH:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX906-LABEL: divergent_vec_i16_HH:
; GFX906:       ; %bb.0:
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    s_mov_b32 s4, 0x7060302
; GFX906-NEXT:    v_perm_b32 v0, v1, v0, s4
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: divergent_vec_i16_HH:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %shift_a = lshr i32 %a, 16
  %tr_a = trunc i32 %shift_a to i16
  %shift_b = lshr i32 %b, 16
  %tr_b = trunc i32 %shift_b to i16
  %tmp = insertelement <2 x i16> undef, i16 %tr_a, i32 0
  %vec = insertelement <2 x i16> %tmp, i16 %tr_b, i32 1
  %val = bitcast <2 x i16> %vec to i32
  ret i32 %val
}

; Half-precision variant of the low/low pack: the low 16 bits of two
; volatile-loaded i32 values are bitcast to half, placed in lanes 0 and 1, and
; the packed value is handed to inline asm through an "s" constraint. The
; gfx900/gfx906/gfx1100 runs expect s_pack_ll_b32_b16; the default-target run
; expects an and / shift / or sequence.
; NOTE(review): the asm call references attribute group #0, whose definition
; is not visible in this part of the file - confirm it exists or is intended
; to be empty.
define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspace(4) %in1) {
; GCN-LABEL: uniform_vec_f16_LL:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_load_dword s0, s[0:1], 0x0
; GCN-NEXT:    s_load_dword s1, s[2:3], 0x0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_and_b32 s0, s0, 0xffff
; GCN-NEXT:    s_lshl_b32 s1, s1, 16
; GCN-NEXT:    s_or_b32 s0, s0, s1
; GCN-NEXT:    ;;#ASMSTART
; GCN-NEXT:    ; use s0
; GCN-NEXT:    ;;#ASMEND
; GCN-NEXT:    s_endpgm
;
; GFX9-LABEL: uniform_vec_f16_LL:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0
; GFX9-NEXT:    s_load_dword s5, s[2:3], 0x0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s4, s5
; GFX9-NEXT:    ;;#ASMSTART
; GFX9-NEXT:    ; use s0
; GFX9-NEXT:    ;;#ASMEND
; GFX9-NEXT:    s_endpgm
;
; GFX906-LABEL: uniform_vec_f16_LL:
; GFX906:       ; %bb.0:
; GFX906-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
; GFX906-NEXT:    s_load_dword s4, s[0:1], 0x0
; GFX906-NEXT:    s_load_dword s5, s[2:3], 0x0
; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
; GFX906-NEXT:    s_pack_ll_b32_b16 s0, s4, s5
; GFX906-NEXT:    ;;#ASMSTART
; GFX906-NEXT:    ; use s0
; GFX906-NEXT:    ;;#ASMEND
; GFX906-NEXT:    s_endpgm
;
; GFX11-LABEL: uniform_vec_f16_LL:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT:    s_load_b32 s1, s[2:3], 0x0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
; GFX11-NEXT:    ;;#ASMSTART
; GFX11-NEXT:    ; use s0
; GFX11-NEXT:    ;;#ASMEND
; GFX11-NEXT:    s_endpgm
  %val0 = load volatile i32, ptr addrspace(4) %in0
  %val1 = load volatile i32, ptr addrspace(4) %in1
  %lo.i = trunc i32 %val0 to i16
  %hi.i = trunc i32 %val1 to i16
  %lo = bitcast i16 %lo.i to half
  %hi = bitcast i16 %hi.i to half
  %vec.0 = insertelement <2 x half> undef, half %lo, i32 0
  %vec.1 = insertelement <2 x half> %vec.0, half %hi, i32 1
  %vec.i32 = bitcast <2 x half> %vec.1 to i32

  call void asm sideeffect "; use $0", "s"(i32 %vec.i32) #0
  ret void
}

; Lane 0 = %a, lane 1 = %b, both half function arguments; the vector is
; bitcast to float and returned. The gfx900/gfx906/gfx1100 runs expect a single
; v_perm_b32 with selector 0x5040100; the default-target run expects two
; v_cvt_f16_f32 conversions followed by shift / or.
define float @divergent_vec_f16_LL(half %a, half %b) {
; GCN-LABEL: divergent_vec_f16_LL:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: divergent_vec_f16_LL:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX906-LABEL: divergent_vec_f16_LL:
; GFX906:       ; %bb.0:
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    s_mov_b32 s4, 0x5040100
; GFX906-NEXT:    v_perm_b32 v0, v1, v0, s4
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: divergent_vec_f16_LL:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %tmp = insertelement <2 x half> undef, half %a, i32 0
  %vec = insertelement <2 x half> %tmp, half %b, i32 1
  %val = bitcast <2 x half> %vec to float
  ret float %val
}

; Loads an i16 from the addrspace(3) pointer %in, inserts it into lane 0 of an
; otherwise-undef <2 x i16>, and returns the vector. The gfx900 and gfx1100
; runs expect a d16 form of the 16-bit load; the default-target and gfx906 runs
; expect the plain 16-bit load.
; NOTE(review): the name says "undeflo", but the IR defines lane 0 and leaves
; lane 1 undef - confirm which lane was meant to be undefined.
define <2 x i16> @build_vec_v2i16_undeflo_divergent(ptr addrspace(3) %in) #0 {
; GCN-LABEL: build_vec_v2i16_undeflo_divergent:
; GCN:       ; %bb.0: ; %entry
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    s_mov_b32 m0, -1
; GCN-NEXT:    ds_read_u16 v0, v0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: build_vec_v2i16_undeflo_divergent:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    ds_read_u16_d16 v0, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX906-LABEL: build_vec_v2i16_undeflo_divergent:
; GFX906:       ; %bb.0: ; %entry
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    ds_read_u16 v0, v0
; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: build_vec_v2i16_undeflo_divergent:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    ds_load_u16_d16 v0, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
entry:
  %load = load i16, ptr addrspace(3) %in
  %build = insertelement <2 x i16> undef, i16 %load, i32 0
  ret <2 x i16> %build
}

; Kernel variant: the addrspace(3) pointer %in is a kernel argument, and the
; partially-undef <2 x i16> is bitcast to i32 and stored to %out. Expected
; load forms per target are the same as in the function variant.
; NOTE(review): the name says "undeflo", but the IR defines lane 0 and leaves
; lane 1 undef - confirm which lane was meant to be undefined.
define amdgpu_kernel void @build_vec_v2i16_undeflo_uniform(ptr addrspace(3) %in, ptr addrspace(1) %out) #0 {
; GCN-LABEL: build_vec_v2i16_undeflo_uniform:
; GCN:       ; %bb.0: ; %entry
; GCN-NEXT:    s_load_dword s2, s[4:5], 0x9
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xb
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, s2
; GCN-NEXT:    s_mov_b32 m0, -1
; GCN-NEXT:    ds_read_u16 v0, v0
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT:    s_endpgm
;
; GFX9-LABEL: build_vec_v2i16_undeflo_uniform:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2c
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s2
; GFX9-NEXT:    ds_read_u16_d16 v0, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX906-LABEL: build_vec_v2i16_undeflo_uniform:
; GFX906:       ; %bb.0: ; %entry
; GFX906-NEXT:    s_load_dword s2, s[4:5], 0x24
; GFX906-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2c
; GFX906-NEXT:    v_mov_b32_e32 v1, 0
; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
; GFX906-NEXT:    v_mov_b32_e32 v0, s2
; GFX906-NEXT:    ds_read_u16 v0, v0
; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
; GFX906-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX906-NEXT:    s_endpgm
;
; GFX11-LABEL: build_vec_v2i16_undeflo_uniform:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x24
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x2c
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX11-NEXT:    ds_load_u16_d16 v0, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT:    s_endpgm
entry:
  %load = load i16, ptr addrspace(3) %in
  %build = insertelement <2 x i16> undef, i16 %load, i32 0
  %result = bitcast <2 x i16> %build to i32
  store i32 %result, ptr addrspace(1) %out
  ret void
}