; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=-flat-for-global -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GFX9 %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX803 %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7 %s

; Codegen test for packing two i16 values into a <2 x i16> that is then
; bitcast to i32. Each kernel builds the vector with two insertelement
; instructions; the halves are either truncated from loaded i32 values or are
; immediates. The s_* kernels hand the result to inline asm through an "s"
; constraint, the v_* kernels through a "v" constraint (or a store).
;
; The insertelement chains start from poison rather than undef: both lanes are
; always overwritten, so the base vector never contributes to the result.

; Both halves are truncated from values loaded through addrspace(4) pointers.
define amdgpu_kernel void @s_pack_v2i16(ptr addrspace(4) %in0, ptr addrspace(4) %in1) #0 {
; GFX9-LABEL: s_pack_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0
; GFX9-NEXT:    s_load_dword s5, s[2:3], 0x0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s4, s5
; GFX9-NEXT:    ;;#ASMSTART
; GFX9-NEXT:    ; use s0
; GFX9-NEXT:    ;;#ASMEND
; GFX9-NEXT:    s_endpgm
;
; GFX803-LABEL: s_pack_v2i16:
; GFX803:       ; %bb.0:
; GFX803-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
; GFX803-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX803-NEXT:    s_load_dword s1, s[2:3], 0x0
; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
; GFX803-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX803-NEXT:    s_lshl_b32 s1, s1, 16
; GFX803-NEXT:    s_or_b32 s0, s0, s1
; GFX803-NEXT:    ;;#ASMSTART
; GFX803-NEXT:    ; use s0
; GFX803-NEXT:    ;;#ASMEND
; GFX803-NEXT:    s_endpgm
;
; GFX7-LABEL: s_pack_v2i16:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX7-NEXT:    s_load_dword s1, s[2:3], 0x0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
; GFX7-NEXT:    s_or_b32 s0, s0, s1
; GFX7-NEXT:    ;;#ASMSTART
; GFX7-NEXT:    ; use s0
; GFX7-NEXT:    ;;#ASMEND
; GFX7-NEXT:    s_endpgm
  %val0 = load volatile i32, ptr addrspace(4) %in0
  %val1 = load volatile i32, ptr addrspace(4) %in1
  %lo = trunc i32 %val0 to i16
  %hi = trunc i32 %val1 to i16
  %vec.0 = insertelement <2 x i16> poison, i16 %lo, i32 0
  %vec.1 = insertelement <2 x i16> %vec.0, i16 %hi, i32 1
  %vec.i32 = bitcast <2 x i16> %vec.1 to i32

  call void asm sideeffect "; use $0", "s"(i32 %vec.i32) #0
  ret void
}

; Low half is the immediate 456 (0x1c8); high half is truncated from a load.
define amdgpu_kernel void @s_pack_v2i16_imm_lo(ptr addrspace(4) %in1) #0 {
; GFX9-LABEL: s_pack_v2i16_imm_lo:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_pack_ll_b32_b16 s0, 0x1c8, s0
; GFX9-NEXT:    ;;#ASMSTART
; GFX9-NEXT:    ; use s0
; GFX9-NEXT:    ;;#ASMEND
; GFX9-NEXT:    s_endpgm
;
; GFX803-LABEL: s_pack_v2i16_imm_lo:
; GFX803:       ; %bb.0:
; GFX803-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
; GFX803-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
; GFX803-NEXT:    s_lshl_b32 s0, s0, 16
; GFX803-NEXT:    s_or_b32 s0, s0, 0x1c8
; GFX803-NEXT:    ;;#ASMSTART
; GFX803-NEXT:    ; use s0
; GFX803-NEXT:    ;;#ASMEND
; GFX803-NEXT:    s_endpgm
;
; GFX7-LABEL: s_pack_v2i16_imm_lo:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_lshl_b32 s0, s0, 16
; GFX7-NEXT:    s_or_b32 s0, s0, 0x1c8
; GFX7-NEXT:    ;;#ASMSTART
; GFX7-NEXT:    ; use s0
; GFX7-NEXT:    ;;#ASMEND
; GFX7-NEXT:    s_endpgm
  %val1 = load i32, ptr addrspace(4) %in1
  %hi = trunc i32 %val1 to i16
  %vec.0 = insertelement <2 x i16> poison, i16 456, i32 0
  %vec.1 = insertelement <2 x i16> %vec.0, i16 %hi, i32 1
  %vec.i32 = bitcast <2 x i16> %vec.1 to i32

  call void asm sideeffect "; use $0", "s"(i32 %vec.i32) #0
  ret void
}

; High half is the immediate 456 (0x1c8); low half is truncated from a load.
define amdgpu_kernel void @s_pack_v2i16_imm_hi(ptr addrspace(4) %in0) #0 {
; GFX9-LABEL: s_pack_v2i16_imm_hi:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, 0x1c8
; GFX9-NEXT:    ;;#ASMSTART
; GFX9-NEXT:    ; use s0
; GFX9-NEXT:    ;;#ASMEND
; GFX9-NEXT:    s_endpgm
;
; GFX803-LABEL: s_pack_v2i16_imm_hi:
; GFX803:       ; %bb.0:
; GFX803-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
; GFX803-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
; GFX803-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX803-NEXT:    s_or_b32 s0, s0, 0x1c80000
; GFX803-NEXT:    ;;#ASMSTART
; GFX803-NEXT:    ; use s0
; GFX803-NEXT:    ;;#ASMEND
; GFX803-NEXT:    s_endpgm
;
; GFX7-LABEL: s_pack_v2i16_imm_hi:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX7-NEXT:    s_or_b32 s0, s0, 0x1c80000
; GFX7-NEXT:    ;;#ASMSTART
; GFX7-NEXT:    ; use s0
; GFX7-NEXT:    ;;#ASMEND
; GFX7-NEXT:    s_endpgm
  %val0 = load i32, ptr addrspace(4) %in0
  %lo = trunc i32 %val0 to i16
  %vec.0 = insertelement <2 x i16> poison, i16 %lo, i32 0
  %vec.1 = insertelement <2 x i16> %vec.0, i16 456, i32 1
  %vec.i32 = bitcast <2 x i16> %vec.1 to i32

  call void asm sideeffect "; use $0", "s"(i32 %vec.i32) #0
  ret void
}

; Both halves are truncated from addrspace(1) loads indexed by the workitem id.
define amdgpu_kernel void @v_pack_v2i16(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
; GFX9-LABEL: v_pack_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[0:1] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_mov_b32 s0, 0x5040100
; GFX9-NEXT:    v_perm_b32 v0, v2, v1, s0
; GFX9-NEXT:    ;;#ASMSTART
; GFX9-NEXT:    ; use v0
; GFX9-NEXT:    ;;#ASMEND
; GFX9-NEXT:    s_endpgm
;
; GFX803-LABEL: v_pack_v2i16:
; GFX803:       ; %bb.0:
; GFX803-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX803-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
; GFX803-NEXT:    v_mov_b32_e32 v1, s1
; GFX803-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX803-NEXT:    v_mov_b32_e32 v3, s3
; GFX803-NEXT:    v_add_u32_e32 v2, vcc, s2, v2
; GFX803-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX803-NEXT:    flat_load_dword v0, v[0:1] glc
; GFX803-NEXT:    s_waitcnt vmcnt(0)
; GFX803-NEXT:    flat_load_dword v1, v[2:3] glc
; GFX803-NEXT:    s_waitcnt vmcnt(0)
; GFX803-NEXT:    s_mov_b32 s0, 0x1000504
; GFX803-NEXT:    v_perm_b32 v0, v0, v1, s0
; GFX803-NEXT:    ;;#ASMSTART
; GFX803-NEXT:    ; use v0
; GFX803-NEXT:    ;;#ASMEND
; GFX803-NEXT:    s_endpgm
;
; GFX7-LABEL: v_pack_v2i16:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX7-NEXT:    s_mov_b32 s7, 0x100f000
; GFX7-NEXT:    s_mov_b32 s6, 0
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_mov_b64 s[4:5], s[0:1]
; GFX7-NEXT:    s_mov_b64 s[0:1], s[2:3]
; GFX7-NEXT:    s_mov_b64 s[2:3], s[6:7]
; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v2
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
; GFX7-NEXT:    ;;#ASMSTART
; GFX7-NEXT:    ; use v0
; GFX7-NEXT:    ;;#ASMEND
; GFX7-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in0.gep = getelementptr inbounds i32, ptr addrspace(1) %in0, i64 %tid.ext
  %in1.gep = getelementptr inbounds i32, ptr addrspace(1) %in1, i64 %tid.ext
  %val0 = load volatile i32, ptr addrspace(1) %in0.gep
  %val1 = load volatile i32, ptr addrspace(1) %in1.gep
  %lo = trunc i32 %val0 to i16
  %hi = trunc i32 %val1 to i16
  %vec.0 = insertelement <2 x i16> poison, i16 %lo, i32 0
  %vec.1 = insertelement <2 x i16> %vec.0, i16 %hi, i32 1
  %vec.i32 = bitcast <2 x i16> %vec.1 to i32
  call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
  ret void
}

; Same packing as v_pack_v2i16, but the packed i32 feeds an add and a volatile
; store instead of inline asm.
define amdgpu_kernel void @v_pack_v2i16_user(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
; GFX9-LABEL: v_pack_v2i16_user:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[0:1] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_mov_b32 s0, 0x5040100
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    v_perm_b32 v0, v2, v1, s0
; GFX9-NEXT:    v_add_u32_e32 v0, 9, v0
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX803-LABEL: v_pack_v2i16_user:
; GFX803:       ; %bb.0:
; GFX803-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX803-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
; GFX803-NEXT:    v_mov_b32_e32 v1, s1
; GFX803-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX803-NEXT:    v_mov_b32_e32 v3, s3
; GFX803-NEXT:    v_add_u32_e32 v2, vcc, s2, v2
; GFX803-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX803-NEXT:    flat_load_dword v0, v[0:1] glc
; GFX803-NEXT:    s_waitcnt vmcnt(0)
; GFX803-NEXT:    flat_load_dword v1, v[2:3] glc
; GFX803-NEXT:    s_waitcnt vmcnt(0)
; GFX803-NEXT:    s_mov_b32 s0, 0x1000504
; GFX803-NEXT:    s_mov_b32 s3, 0x1100f000
; GFX803-NEXT:    s_mov_b32 s2, -1
; GFX803-NEXT:    v_perm_b32 v0, v0, v1, s0
; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 9, v0
; GFX803-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX803-NEXT:    s_waitcnt vmcnt(0)
; GFX803-NEXT:    s_endpgm
;
; GFX7-LABEL: v_pack_v2i16_user:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX7-NEXT:    s_mov_b32 s6, 0
; GFX7-NEXT:    s_mov_b32 s7, 0x100f000
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_mov_b64 s[4:5], s[0:1]
; GFX7-NEXT:    s_mov_b64 s[0:1], s[2:3]
; GFX7-NEXT:    s_mov_b64 s[2:3], s[6:7]
; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    s_mov_b32 s6, -1
; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v2
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 9, v0
; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in0.gep = getelementptr inbounds i32, ptr addrspace(1) %in0, i64 %tid.ext
  %in1.gep = getelementptr inbounds i32, ptr addrspace(1) %in1, i64 %tid.ext
  %val0 = load volatile i32, ptr addrspace(1) %in0.gep
  %val1 = load volatile i32, ptr addrspace(1) %in1.gep
  %lo = trunc i32 %val0 to i16
  %hi = trunc i32 %val1 to i16
  %vec.0 = insertelement <2 x i16> poison, i16 %lo, i32 0
  %vec.1 = insertelement <2 x i16> %vec.0, i16 %hi, i32 1
  %vec.i32 = bitcast <2 x i16> %vec.1 to i32
  %foo = add i32 %vec.i32, 9
  ; The destination address is a don't-care placeholder.
  store volatile i32 %foo, ptr addrspace(1) poison
  ret void
}

; Low half is the immediate 123 (0x7b); high half is truncated from a load.
define amdgpu_kernel void @v_pack_v2i16_imm_lo(ptr addrspace(1) %in1) #0 {
; GFX9-LABEL: v_pack_v2i16_imm_lo:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0x5040100
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v0, s[0:1] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_movk_i32 s0, 0x7b
; GFX9-NEXT:    v_perm_b32 v0, v0, s0, v1
; GFX9-NEXT:    ;;#ASMSTART
; GFX9-NEXT:    ; use v0
; GFX9-NEXT:    ;;#ASMEND
; GFX9-NEXT:    s_endpgm
;
; GFX803-LABEL: v_pack_v2i16_imm_lo:
; GFX803:       ; %bb.0:
; GFX803-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
; GFX803-NEXT:    v_mov_b32_e32 v1, s1
; GFX803-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX803-NEXT:    flat_load_dword v0, v[0:1] glc
; GFX803-NEXT:    s_waitcnt vmcnt(0)
; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; GFX803-NEXT:    v_or_b32_e32 v0, 0x7b, v0
; GFX803-NEXT:    ;;#ASMSTART
; GFX803-NEXT:    ; use v0
; GFX803-NEXT:    ;;#ASMEND
; GFX803-NEXT:    s_endpgm
;
; GFX7-LABEL: v_pack_v2i16_imm_lo:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX7-NEXT:    s_mov_b32 s3, 0x100f000
; GFX7-NEXT:    s_mov_b32 s2, 0
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT:    v_or_b32_e32 v0, 0x7b, v0
; GFX7-NEXT:    ;;#ASMSTART
; GFX7-NEXT:    ; use v0
; GFX7-NEXT:    ;;#ASMEND
; GFX7-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in1.gep = getelementptr inbounds i32, ptr addrspace(1) %in1, i64 %tid.ext
  %val1 = load volatile i32, ptr addrspace(1) %in1.gep
  %hi = trunc i32 %val1 to i16
  %vec.0 = insertelement <2 x i16> poison, i16 123, i32 0
  %vec.1 = insertelement <2 x i16> %vec.0, i16 %hi, i32 1
  %vec.i32 = bitcast <2 x i16> %vec.1 to i32
  call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
  ret void
}

; Low half is the immediate 64, which appears directly as an instruction
; operand in the expected output; high half is truncated from a load.
define amdgpu_kernel void @v_pack_v2i16_inline_imm_lo(ptr addrspace(1) %in1) #0 {
; GFX9-LABEL: v_pack_v2i16_inline_imm_lo:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0x5040100
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v0, s[0:1] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_perm_b32 v0, v0, 64, v1
; GFX9-NEXT:    ;;#ASMSTART
; GFX9-NEXT:    ; use v0
; GFX9-NEXT:    ;;#ASMEND
; GFX9-NEXT:    s_endpgm
;
; GFX803-LABEL: v_pack_v2i16_inline_imm_lo:
; GFX803:       ; %bb.0:
; GFX803-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
; GFX803-NEXT:    v_mov_b32_e32 v1, s1
; GFX803-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX803-NEXT:    flat_load_dword v0, v[0:1] glc
; GFX803-NEXT:    s_waitcnt vmcnt(0)
; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; GFX803-NEXT:    v_or_b32_e32 v0, 64, v0
; GFX803-NEXT:    ;;#ASMSTART
; GFX803-NEXT:    ; use v0
; GFX803-NEXT:    ;;#ASMEND
; GFX803-NEXT:    s_endpgm
;
; GFX7-LABEL: v_pack_v2i16_inline_imm_lo:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX7-NEXT:    s_mov_b32 s3, 0x100f000
; GFX7-NEXT:    s_mov_b32 s2, 0
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT:    v_or_b32_e32 v0, 64, v0
; GFX7-NEXT:    ;;#ASMSTART
; GFX7-NEXT:    ; use v0
; GFX7-NEXT:    ;;#ASMEND
; GFX7-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in1.gep = getelementptr inbounds i32, ptr addrspace(1) %in1, i64 %tid.ext
  %val1 = load volatile i32, ptr addrspace(1) %in1.gep
  %hi = trunc i32 %val1 to i16
  %vec.0 = insertelement <2 x i16> poison, i16 64, i32 0
  %vec.1 = insertelement <2 x i16> %vec.0, i16 %hi, i32 1
  %vec.i32 = bitcast <2 x i16> %vec.1 to i32
  call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
  ret void
}

; High half is the immediate 123 (0x7b); low half is truncated from a load.
define amdgpu_kernel void @v_pack_v2i16_imm_hi(ptr addrspace(1) %in0) #0 {
; GFX9-LABEL: v_pack_v2i16_imm_hi:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0x5040100
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v0, s[0:1] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_movk_i32 s0, 0x7b
; GFX9-NEXT:    v_perm_b32 v0, s0, v0, v1
; GFX9-NEXT:    ;;#ASMSTART
; GFX9-NEXT:    ; use v0
; GFX9-NEXT:    ;;#ASMEND
; GFX9-NEXT:    s_endpgm
;
; GFX803-LABEL: v_pack_v2i16_imm_hi:
; GFX803:       ; %bb.0:
; GFX803-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
; GFX803-NEXT:    v_mov_b32_e32 v1, s1
; GFX803-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX803-NEXT:    flat_load_dword v0, v[0:1] glc
; GFX803-NEXT:    s_waitcnt vmcnt(0)
; GFX803-NEXT:    v_mov_b32_e32 v1, 0x7b0000
; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT:    ;;#ASMSTART
; GFX803-NEXT:    ; use v0
; GFX803-NEXT:    ;;#ASMEND
; GFX803-NEXT:    s_endpgm
;
; GFX7-LABEL: v_pack_v2i16_imm_hi:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX7-NEXT:    s_mov_b32 s3, 0x100f000
; GFX7-NEXT:    s_mov_b32 s2, 0
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT:    v_or_b32_e32 v0, 0x7b0000, v0
; GFX7-NEXT:    ;;#ASMSTART
; GFX7-NEXT:    ; use v0
; GFX7-NEXT:    ;;#ASMEND
; GFX7-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in0.gep = getelementptr inbounds i32, ptr addrspace(1) %in0, i64 %tid.ext
  %val0 = load volatile i32, ptr addrspace(1) %in0.gep
  %lo = trunc i32 %val0 to i16
  %vec.0 = insertelement <2 x i16> poison, i16 %lo, i32 0
  %vec.1 = insertelement <2 x i16> %vec.0, i16 123, i32 1
  %vec.i32 = bitcast <2 x i16> %vec.1 to i32
  call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
  ret void
}

; High half is the immediate 7; low half is truncated from a load.
define amdgpu_kernel void @v_pack_v2i16_inline_imm_hi(ptr addrspace(1) %in0) #0 {
; GFX9-LABEL: v_pack_v2i16_inline_imm_hi:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0x5040100
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v0, s[0:1] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_perm_b32 v0, 7, v0, v1
; GFX9-NEXT:    ;;#ASMSTART
; GFX9-NEXT:    ; use v0
; GFX9-NEXT:    ;;#ASMEND
; GFX9-NEXT:    s_endpgm
;
; GFX803-LABEL: v_pack_v2i16_inline_imm_hi:
; GFX803:       ; %bb.0:
; GFX803-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
; GFX803-NEXT:    v_mov_b32_e32 v1, s1
; GFX803-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX803-NEXT:    flat_load_dword v0, v[0:1] glc
; GFX803-NEXT:    s_waitcnt vmcnt(0)
; GFX803-NEXT:    v_mov_b32_e32 v1, 0x70000
; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT:    ;;#ASMSTART
; GFX803-NEXT:    ; use v0
; GFX803-NEXT:    ;;#ASMEND
; GFX803-NEXT:    s_endpgm
;
; GFX7-LABEL: v_pack_v2i16_inline_imm_hi:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX7-NEXT:    s_mov_b32 s3, 0x100f000
; GFX7-NEXT:    s_mov_b32 s2, 0
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT:    v_or_b32_e32 v0, 0x70000, v0
; GFX7-NEXT:    ;;#ASMSTART
; GFX7-NEXT:    ; use v0
; GFX7-NEXT:    ;;#ASMEND
; GFX7-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in0.gep = getelementptr inbounds i32, ptr addrspace(1) %in0, i64 %tid.ext
  %val0 = load volatile i32, ptr addrspace(1) %in0.gep
  %lo = trunc i32 %val0 to i16
  %vec.0 = insertelement <2 x i16> poison, i16 %lo, i32 0
  %vec.1 = insertelement <2 x i16> %vec.0, i16 7, i32 1
  %vec.i32 = bitcast <2 x i16> %vec.1 to i32
  call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }