; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=CYPRESS %s

; Intrinsic under test. It is declared as taking a float and returning an
; i16; the name indicates a conversion to fp16.
declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone

; Kernel that loads one float from %in, passes it through
; @llvm.convert.to.fp16.f32, and stores the resulting i16 to %out.
;
; The autogenerated assertions below expect a v_cvt_f16_f32_e32 for every
; amdgcn invocation and a FLT32_TO_FLT16 for the r600/cypress invocation.
; The two gfx1100 invocations differ only in the conversion destination:
; with +real-true16 it is written as v0.l, with -real-true16 as v0.
;
; Do not edit the assertion lines by hand; regenerate them with
; utils/update_llc_test_checks.py (see the NOTE at the top of the file).
define amdgpu_kernel void @test_convert_fp32_to_fp16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
; GFX6-LABEL: test_convert_fp32_to_fp16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_mov_b32 s10, s6
; GFX6-NEXT: s_mov_b32 s11, s7
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s8, s2
; GFX6-NEXT: s_mov_b32 s9, s3
; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0
; GFX6-NEXT: s_mov_b32 s4, s0
; GFX6-NEXT: s_mov_b32 s5, s1
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: test_convert_fp32_to_fp16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_mov_b32 s7, 0xf000
; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: s_mov_b32 s10, s6
; GFX8-NEXT: s_mov_b32 s11, s7
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_mov_b32 s8, s2
; GFX8-NEXT: s_mov_b32 s9, s3
; GFX8-NEXT: buffer_load_dword v0, off, s[8:11], 0
; GFX8-NEXT: s_mov_b32 s4, s0
; GFX8-NEXT: s_mov_b32 s5, s1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX8-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX11-TRUE16-LABEL: test_convert_fp32_to_fp16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0
; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: test_convert_fp32_to_fp16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0
; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-FAKE16-NEXT: s_endpgm
;
; CYPRESS-LABEL: test_convert_fp32_to_fp16:
; CYPRESS: ; %bb.0:
; CYPRESS-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; CYPRESS-NEXT: TEX 0 @6
; CYPRESS-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[]
; CYPRESS-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; CYPRESS-NEXT: CF_END
; CYPRESS-NEXT: PAD
; CYPRESS-NEXT: Fetch clause starting at 6:
; CYPRESS-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; CYPRESS-NEXT: ALU clause starting at 8:
; CYPRESS-NEXT: MOV * T0.X, KC0[2].Z,
; CYPRESS-NEXT: ALU clause starting at 9:
; CYPRESS-NEXT: AND_INT * T0.W, KC0[2].Y, literal.x,
; CYPRESS-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; CYPRESS-NEXT: FLT32_TO_FLT16 T1.W, T0.X,
; CYPRESS-NEXT: LSHL * T0.W, PV.W, literal.x,
; CYPRESS-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; CYPRESS-NEXT: LSHL T0.X, PV.W, PS,
; CYPRESS-NEXT: LSHL * T0.W, literal.x, PS,
; CYPRESS-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; CYPRESS-NEXT: MOV T0.Y, 0.0,
; CYPRESS-NEXT: MOV * T0.Z, 0.0,
; CYPRESS-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CYPRESS-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  ; Read the 32-bit float input (4-byte aligned) from the %in pointer.
  %val = load float, ptr addrspace(1) %in, align 4
  ; Convert via the intrinsic; the result is carried as an i16.
  %cvt = call i16 @llvm.convert.to.fp16.f32(float %val) nounwind readnone
  ; Write the 16-bit result (2-byte aligned) to the %out pointer.
  store i16 %cvt, ptr addrspace(1) %out, align 2
  ret void
}
