; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=-flat-for-global -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9 %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX7 %s

; Codegen test for assembling a <2 x half> value out of two 16-bit pieces and
; reinterpreting it as an i32.  Each piece is either the truncation of a loaded
; i32 or a half constant.  The s_* kernels load through addrspace(4) pointers
; and consume the result in an inline asm with an "s" constraint; the v_*
; kernels index addrspace(1) pointers by the workitem id and consume the result
; with a "v" constraint (or, in one case, an add plus a volatile store).
; The expected assembly is checked for three -mcpu values: gfx900, fiji and
; kaveri.  The check lines are generated by the script named on the first
; line; after changing any IR below, regenerate them instead of editing them
; by hand.

; Both halves come from volatile i32 loads through addrspace(4) pointers.
; Per the checks, gfx900 packs them with one s_pack_ll_b32_b16, while fiji and
; kaveri use s_and_b32 0xffff / s_lshl_b32 16 / s_or_b32.
define amdgpu_kernel void @s_pack_v2f16(ptr addrspace(4) %in0, ptr addrspace(4) %in1) #0 {
; GFX9-LABEL: s_pack_v2f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0
; GFX9-NEXT:    s_load_dword s5, s[2:3], 0x0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s4, s5
; GFX9-NEXT:    ;;#ASMSTART
; GFX9-NEXT:    ; use s0
; GFX9-NEXT:    ;;#ASMEND
; GFX9-NEXT:    s_endpgm
;
; GFX8-LABEL: s_pack_v2f16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX8-NEXT:    s_load_dword s1, s[2:3], 0x0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
; GFX8-NEXT:    s_or_b32 s0, s0, s1
; GFX8-NEXT:    ;;#ASMSTART
; GFX8-NEXT:    ; use s0
; GFX8-NEXT:    ;;#ASMEND
; GFX8-NEXT:    s_endpgm
;
; GFX7-LABEL: s_pack_v2f16:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX7-NEXT:    s_load_dword s1, s[2:3], 0x0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
; GFX7-NEXT:    s_or_b32 s0, s0, s1
; GFX7-NEXT:    ;;#ASMSTART
; GFX7-NEXT:    ; use s0
; GFX7-NEXT:    ;;#ASMEND
; GFX7-NEXT:    s_endpgm
  %val0 = load volatile i32, ptr addrspace(4) %in0
  %val1 = load volatile i32, ptr addrspace(4) %in1
  %lo.i = trunc i32 %val0 to i16
  %hi.i = trunc i32 %val1 to i16
  %lo = bitcast i16 %lo.i to half
  %hi = bitcast i16 %hi.i to half
  %vec.0 = insertelement <2 x half> undef, half %lo, i32 0
  %vec.1 = insertelement <2 x half> %vec.0, half %hi, i32 1
  %vec.i32 = bitcast <2 x half> %vec.1 to i32

  call void asm sideeffect "; use $0", "s"(i32 %vec.i32) #0
  ret void
}

; Element 0 is the constant 0xH1234; element 1 comes from a (non-volatile) load.
; Per the checks, gfx900 passes 0x1234 directly to s_pack_ll_b32_b16, while
; fiji and kaveri shift the loaded value left by 16 and OR in 0x1234.
define amdgpu_kernel void @s_pack_v2f16_imm_lo(ptr addrspace(4) %in1) #0 {
; GFX9-LABEL: s_pack_v2f16_imm_lo:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_pack_ll_b32_b16 s0, 0x1234, s0
; GFX9-NEXT:    ;;#ASMSTART
; GFX9-NEXT:    ; use s0
; GFX9-NEXT:    ;;#ASMEND
; GFX9-NEXT:    s_endpgm
;
; GFX8-LABEL: s_pack_v2f16_imm_lo:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_lshl_b32 s0, s0, 16
; GFX8-NEXT:    s_or_b32 s0, s0, 0x1234
; GFX8-NEXT:    ;;#ASMSTART
; GFX8-NEXT:    ; use s0
; GFX8-NEXT:    ;;#ASMEND
; GFX8-NEXT:    s_endpgm
;
; GFX7-LABEL: s_pack_v2f16_imm_lo:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_lshl_b32 s0, s0, 16
; GFX7-NEXT:    s_or_b32 s0, s0, 0x1234
; GFX7-NEXT:    ;;#ASMSTART
; GFX7-NEXT:    ; use s0
; GFX7-NEXT:    ;;#ASMEND
; GFX7-NEXT:    s_endpgm
  %val1 = load i32, ptr addrspace(4) %in1
  %hi.i = trunc i32 %val1 to i16
  %hi = bitcast i16 %hi.i to half
  %vec.0 = insertelement <2 x half> undef, half 0xH1234, i32 0
  %vec.1 = insertelement <2 x half> %vec.0, half %hi, i32 1
  %vec.i32 = bitcast <2 x half> %vec.1 to i32

  call void asm sideeffect "; use $0", "s"(i32 %vec.i32) #0
  ret void
}

; Element 0 comes from a (non-volatile) load; element 1 is the constant 0xH1234.
; Per the checks, gfx900 passes 0x1234 as the last s_pack_ll_b32_b16 operand,
; while fiji and kaveri mask the loaded value with 0xffff and OR in 0x12340000.
define amdgpu_kernel void @s_pack_v2f16_imm_hi(ptr addrspace(4) %in0) #0 {
; GFX9-LABEL: s_pack_v2f16_imm_hi:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, 0x1234
; GFX9-NEXT:    ;;#ASMSTART
; GFX9-NEXT:    ; use s0
; GFX9-NEXT:    ;;#ASMEND
; GFX9-NEXT:    s_endpgm
;
; GFX8-LABEL: s_pack_v2f16_imm_hi:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX8-NEXT:    s_or_b32 s0, s0, 0x12340000
; GFX8-NEXT:    ;;#ASMSTART
; GFX8-NEXT:    ; use s0
; GFX8-NEXT:    ;;#ASMEND
; GFX8-NEXT:    s_endpgm
;
; GFX7-LABEL: s_pack_v2f16_imm_hi:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX7-NEXT:    s_or_b32 s0, s0, 0x12340000
; GFX7-NEXT:    ;;#ASMSTART
; GFX7-NEXT:    ; use s0
; GFX7-NEXT:    ;;#ASMEND
; GFX7-NEXT:    s_endpgm
  %val0 = load i32, ptr addrspace(4) %in0
  %lo.i = trunc i32 %val0 to i16
  %lo = bitcast i16 %lo.i to half
  %vec.0 = insertelement <2 x half> undef, half %lo, i32 0
  %vec.1 = insertelement <2 x half> %vec.0, half 0xH1234, i32 1
  %vec.i32 = bitcast <2 x half> %vec.1 to i32

  call void asm sideeffect "; use $0", "s"(i32 %vec.i32) #0
  ret void
}

; Both halves come from volatile loads of addrspace(1) elements indexed by the
; workitem id; the packed value feeds an inline asm with a "v" constraint.
; Per the checks, gfx900 and fiji each emit one v_perm_b32 (with constants
; 0x5040100 and 0x1000504 respectively), while kaveri emits
; v_and_b32 / v_lshlrev_b32 / v_or_b32.
define amdgpu_kernel void @v_pack_v2f16(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
; GFX9-LABEL: v_pack_v2f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[0:1] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_mov_b32 s0, 0x5040100
; GFX9-NEXT:    v_perm_b32 v0, v2, v1, s0
; GFX9-NEXT:    ;;#ASMSTART
; GFX9-NEXT:    ; use v0
; GFX9-NEXT:    ;;#ASMEND
; GFX9-NEXT:    s_endpgm
;
; GFX8-LABEL: v_pack_v2f16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    v_mov_b32_e32 v3, s3
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s2, v2
; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT:    flat_load_dword v0, v[0:1] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    flat_load_dword v1, v[2:3] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    s_mov_b32 s0, 0x1000504
; GFX8-NEXT:    v_perm_b32 v0, v0, v1, s0
; GFX8-NEXT:    ;;#ASMSTART
; GFX8-NEXT:    ; use v0
; GFX8-NEXT:    ;;#ASMEND
; GFX8-NEXT:    s_endpgm
;
; GFX7-LABEL: v_pack_v2f16:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX7-NEXT:    s_mov_b32 s7, 0x100f000
; GFX7-NEXT:    s_mov_b32 s6, 0
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_mov_b64 s[4:5], s[0:1]
; GFX7-NEXT:    s_mov_b64 s[0:1], s[2:3]
; GFX7-NEXT:    s_mov_b64 s[2:3], s[6:7]
; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v2
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
; GFX7-NEXT:    ;;#ASMSTART
; GFX7-NEXT:    ; use v0
; GFX7-NEXT:    ;;#ASMEND
; GFX7-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in0.gep = getelementptr inbounds i32, ptr addrspace(1) %in0, i64 %tid.ext
  %in1.gep = getelementptr inbounds i32, ptr addrspace(1) %in1, i64 %tid.ext
  %val0 = load volatile i32, ptr addrspace(1) %in0.gep
  %val1 = load volatile i32, ptr addrspace(1) %in1.gep
  %lo.i = trunc i32 %val0 to i16
  %hi.i = trunc i32 %val1 to i16
  %lo = bitcast i16 %lo.i to half
  %hi = bitcast i16 %hi.i to half
  %vec.0 = insertelement <2 x half> undef, half %lo, i32 0
  %vec.1 = insertelement <2 x half> %vec.0, half %hi, i32 1
  %vec.i32 = bitcast <2 x half> %vec.1 to i32
  call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
  ret void
}

; Same packing as @v_pack_v2f16, but the packed i32 is consumed by ordinary IR
; (add 9, then a volatile store) instead of an inline asm.  The store address
; is an undef addrspace(1) pointer, so only the value path is meaningful here.
define amdgpu_kernel void @v_pack_v2f16_user(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
; GFX9-LABEL: v_pack_v2f16_user:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[0:1] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_mov_b32 s0, 0x5040100
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    v_perm_b32 v0, v2, v1, s0
; GFX9-NEXT:    v_add_u32_e32 v0, 9, v0
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX8-LABEL: v_pack_v2f16_user:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    v_mov_b32_e32 v3, s3
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s2, v2
; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT:    flat_load_dword v0, v[0:1] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    flat_load_dword v1, v[2:3] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    s_mov_b32 s0, 0x1000504
; GFX8-NEXT:    s_mov_b32 s3, 0x1100f000
; GFX8-NEXT:    s_mov_b32 s2, -1
; GFX8-NEXT:    v_perm_b32 v0, v0, v1, s0
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 9, v0
; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    s_endpgm
;
; GFX7-LABEL: v_pack_v2f16_user:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX7-NEXT:    s_mov_b32 s6, 0
; GFX7-NEXT:    s_mov_b32 s7, 0x100f000
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_mov_b64 s[4:5], s[0:1]
; GFX7-NEXT:    s_mov_b64 s[0:1], s[2:3]
; GFX7-NEXT:    s_mov_b64 s[2:3], s[6:7]
; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    s_mov_b32 s6, -1
; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v2
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 9, v0
; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in0.gep = getelementptr inbounds i32, ptr addrspace(1) %in0, i64 %tid.ext
  %in1.gep = getelementptr inbounds i32, ptr addrspace(1) %in1, i64 %tid.ext
  %val0 = load volatile i32, ptr addrspace(1) %in0.gep
  %val1 = load volatile i32, ptr addrspace(1) %in1.gep
  %lo.i = trunc i32 %val0 to i16
  %hi.i = trunc i32 %val1 to i16
  %lo = bitcast i16 %lo.i to half
  %hi = bitcast i16 %hi.i to half
  %vec.0 = insertelement <2 x half> undef, half %lo, i32 0
  %vec.1 = insertelement <2 x half> %vec.0, half %hi, i32 1
  %vec.i32 = bitcast <2 x half> %vec.1 to i32
  %foo = add i32 %vec.i32, 9
  store volatile i32 %foo, ptr addrspace(1) undef
  ret void
}

; Element 0 is the constant 0xH1234; element 1 comes from a per-workitem
; volatile load.  Per the checks, gfx900 materializes 0x1234 with s_movk_i32
; and uses v_perm_b32, while fiji and kaveri shift the loaded value left by 16
; and OR in 0x1234.
define amdgpu_kernel void @v_pack_v2f16_imm_lo(ptr addrspace(1) %in1) #0 {
; GFX9-LABEL: v_pack_v2f16_imm_lo:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0x5040100
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v0, s[0:1] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_movk_i32 s0, 0x1234
; GFX9-NEXT:    v_perm_b32 v0, v0, s0, v1
; GFX9-NEXT:    ;;#ASMSTART
; GFX9-NEXT:    ; use v0
; GFX9-NEXT:    ;;#ASMEND
; GFX9-NEXT:    s_endpgm
;
; GFX8-LABEL: v_pack_v2f16_imm_lo:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v0, v[0:1] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT:    v_or_b32_e32 v0, 0x1234, v0
; GFX8-NEXT:    ;;#ASMSTART
; GFX8-NEXT:    ; use v0
; GFX8-NEXT:    ;;#ASMEND
; GFX8-NEXT:    s_endpgm
;
; GFX7-LABEL: v_pack_v2f16_imm_lo:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX7-NEXT:    s_mov_b32 s3, 0x100f000
; GFX7-NEXT:    s_mov_b32 s2, 0
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT:    v_or_b32_e32 v0, 0x1234, v0
; GFX7-NEXT:    ;;#ASMSTART
; GFX7-NEXT:    ; use v0
; GFX7-NEXT:    ;;#ASMEND
; GFX7-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in1.gep = getelementptr inbounds i32, ptr addrspace(1) %in1, i64 %tid.ext
  %val1 = load volatile i32, ptr addrspace(1) %in1.gep
  %hi.i = trunc i32 %val1 to i16
  %hi = bitcast i16 %hi.i to half
  %vec.0 = insertelement <2 x half> undef, half 0xH1234, i32 0
  %vec.1 = insertelement <2 x half> %vec.0, half %hi, i32 1
  %vec.i32 = bitcast <2 x half> %vec.1 to i32
  call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
  ret void
}

; Like @v_pack_v2f16_imm_lo, but element 0 is `half 4.0`.  In the expected
; output that constant appears as 0x4400 on all three targets.
define amdgpu_kernel void @v_pack_v2f16_inline_imm_lo(ptr addrspace(1) %in1) #0 {
; GFX9-LABEL: v_pack_v2f16_inline_imm_lo:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0x5040100
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v0, s[0:1] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_movk_i32 s0, 0x4400
; GFX9-NEXT:    v_perm_b32 v0, v0, s0, v1
; GFX9-NEXT:    ;;#ASMSTART
; GFX9-NEXT:    ; use v0
; GFX9-NEXT:    ;;#ASMEND
; GFX9-NEXT:    s_endpgm
;
; GFX8-LABEL: v_pack_v2f16_inline_imm_lo:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v0, v[0:1] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT:    v_or_b32_e32 v0, 0x4400, v0
; GFX8-NEXT:    ;;#ASMSTART
; GFX8-NEXT:    ; use v0
; GFX8-NEXT:    ;;#ASMEND
; GFX8-NEXT:    s_endpgm
;
; GFX7-LABEL: v_pack_v2f16_inline_imm_lo:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX7-NEXT:    s_mov_b32 s3, 0x100f000
; GFX7-NEXT:    s_mov_b32 s2, 0
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT:    v_or_b32_e32 v0, 0x4400, v0
; GFX7-NEXT:    ;;#ASMSTART
; GFX7-NEXT:    ; use v0
; GFX7-NEXT:    ;;#ASMEND
; GFX7-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in1.gep = getelementptr inbounds i32, ptr addrspace(1) %in1, i64 %tid.ext
  %val1 = load volatile i32, ptr addrspace(1) %in1.gep
  %hi.i = trunc i32 %val1 to i16
  %hi = bitcast i16 %hi.i to half
  %vec.0 = insertelement <2 x half> undef, half 4.0, i32 0
  %vec.1 = insertelement <2 x half> %vec.0, half %hi, i32 1
  %vec.i32 = bitcast <2 x half> %vec.1 to i32
  call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
  ret void
}

; Element 0 comes from a per-workitem volatile load; element 1 is the constant
; 0xH1234.  Per the checks, gfx900 uses v_perm_b32 with 0x1234 in an SGPR, fiji
; uses v_or_b32_sdwa (src0_sel:WORD_0) against 0x12340000, and kaveri masks
; with 0xffff before OR-ing in 0x12340000.
define amdgpu_kernel void @v_pack_v2f16_imm_hi(ptr addrspace(1) %in0) #0 {
; GFX9-LABEL: v_pack_v2f16_imm_hi:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0x5040100
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v0, s[0:1] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_movk_i32 s0, 0x1234
; GFX9-NEXT:    v_perm_b32 v0, s0, v0, v1
; GFX9-NEXT:    ;;#ASMSTART
; GFX9-NEXT:    ; use v0
; GFX9-NEXT:    ;;#ASMEND
; GFX9-NEXT:    s_endpgm
;
; GFX8-LABEL: v_pack_v2f16_imm_hi:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v0, v[0:1] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, 0x12340000
; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT:    ;;#ASMSTART
; GFX8-NEXT:    ; use v0
; GFX8-NEXT:    ;;#ASMEND
; GFX8-NEXT:    s_endpgm
;
; GFX7-LABEL: v_pack_v2f16_imm_hi:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX7-NEXT:    s_mov_b32 s3, 0x100f000
; GFX7-NEXT:    s_mov_b32 s2, 0
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT:    v_or_b32_e32 v0, 0x12340000, v0
; GFX7-NEXT:    ;;#ASMSTART
; GFX7-NEXT:    ; use v0
; GFX7-NEXT:    ;;#ASMEND
; GFX7-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in0.gep = getelementptr inbounds i32, ptr addrspace(1) %in0, i64 %tid.ext
  %val0 = load volatile i32, ptr addrspace(1) %in0.gep
  %lo.i = trunc i32 %val0 to i16
  %lo = bitcast i16 %lo.i to half
  %vec.0 = insertelement <2 x half> undef, half %lo, i32 0
  %vec.1 = insertelement <2 x half> %vec.0, half 0xH1234, i32 1
  %vec.i32 = bitcast <2 x half> %vec.1 to i32
  call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
  ret void
}

; Like @v_pack_v2f16_imm_hi, but element 1 is `half 1.0`.  In the expected
; output it appears as 0x3c00 (gfx900), as `v_bfrev_b32 v1, 60` (fiji) and as
; the literal 0x3c000000 (kaveri).
define amdgpu_kernel void @v_pack_v2f16_inline_f16imm_hi(ptr addrspace(1) %in0) #0 {
; GFX9-LABEL: v_pack_v2f16_inline_f16imm_hi:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0x5040100
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v0, s[0:1] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_movk_i32 s0, 0x3c00
; GFX9-NEXT:    v_perm_b32 v0, s0, v0, v1
; GFX9-NEXT:    ;;#ASMSTART
; GFX9-NEXT:    ; use v0
; GFX9-NEXT:    ;;#ASMEND
; GFX9-NEXT:    s_endpgm
;
; GFX8-LABEL: v_pack_v2f16_inline_f16imm_hi:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v0, v[0:1] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_bfrev_b32_e32 v1, 60
; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT:    ;;#ASMSTART
; GFX8-NEXT:    ; use v0
; GFX8-NEXT:    ;;#ASMEND
; GFX8-NEXT:    s_endpgm
;
; GFX7-LABEL: v_pack_v2f16_inline_f16imm_hi:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX7-NEXT:    s_mov_b32 s3, 0x100f000
; GFX7-NEXT:    s_mov_b32 s2, 0
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT:    v_or_b32_e32 v0, 0x3c000000, v0
; GFX7-NEXT:    ;;#ASMSTART
; GFX7-NEXT:    ; use v0
; GFX7-NEXT:    ;;#ASMEND
; GFX7-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in0.gep = getelementptr inbounds i32, ptr addrspace(1) %in0, i64 %tid.ext
  %val0 = load volatile i32, ptr addrspace(1) %in0.gep
  %lo.i = trunc i32 %val0 to i16
  %lo = bitcast i16 %lo.i to half
  %vec.0 = insertelement <2 x half> undef, half %lo, i32 0
  %vec.1 = insertelement <2 x half> %vec.0, half 1.0, i32 1
  %vec.i32 = bitcast <2 x half> %vec.1 to i32
  call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
  ret void
}

; Like @v_pack_v2f16_imm_hi, but element 1 is the bit pattern 0xH0040.  Per the
; checks, gfx900 passes it as the plain operand 64 of v_perm_b32 with no
; separate move, while fiji and kaveri use the literal 0x400000.
define amdgpu_kernel void @v_pack_v2f16_inline_imm_hi(ptr addrspace(1) %in0) #0 {
; GFX9-LABEL: v_pack_v2f16_inline_imm_hi:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0x5040100
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v0, s[0:1] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_perm_b32 v0, 64, v0, v1
; GFX9-NEXT:    ;;#ASMSTART
; GFX9-NEXT:    ; use v0
; GFX9-NEXT:    ;;#ASMEND
; GFX9-NEXT:    s_endpgm
;
; GFX8-LABEL: v_pack_v2f16_inline_imm_hi:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v0, v[0:1] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, 0x400000
; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT:    ;;#ASMSTART
; GFX8-NEXT:    ; use v0
; GFX8-NEXT:    ;;#ASMEND
; GFX8-NEXT:    s_endpgm
;
; GFX7-LABEL: v_pack_v2f16_inline_imm_hi:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX7-NEXT:    s_mov_b32 s3, 0x100f000
; GFX7-NEXT:    s_mov_b32 s2, 0
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT:    v_or_b32_e32 v0, 0x400000, v0
; GFX7-NEXT:    ;;#ASMSTART
; GFX7-NEXT:    ; use v0
; GFX7-NEXT:    ;;#ASMEND
; GFX7-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in0.gep = getelementptr inbounds i32, ptr addrspace(1) %in0, i64 %tid.ext
  %val0 = load volatile i32, ptr addrspace(1) %in0.gep
  %lo.i = trunc i32 %val0 to i16
  %lo = bitcast i16 %lo.i to half
  %vec.0 = insertelement <2 x half> undef, half %lo, i32 0
  %vec.1 = insertelement <2 x half> %vec.0, half 0xH0040, i32 1
  %vec.i32 = bitcast <2 x half> %vec.1 to i32
  call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
  ret void
}

; Intrinsic the v_* kernels call to obtain %tid, the per-workitem element index.
declare i32 @llvm.amdgcn.workitem.id.x() #1

; #0 is attached to every kernel and inline-asm call above; #1 to the intrinsic.
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }