; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefixes=SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX89,VI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX89,GFX9 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX12 %s

; Exercises the llvm.rint intrinsic on half and <2 x half> values that are
; loaded and stored through ptr addrspace(1) kernel arguments. Every llc
; invocation above passes -amdgpu-scalarize-global-loads=false, and all but the
; tahiti one also pass -mattr=-flat-for-global. The fiji and gfx900 invocations
; share the GFX89 prefix for output they have in common and each carry one
; further prefix of their own for output that differs.

declare half @llvm.rint.f16(half %a)
declare <2 x half> @llvm.rint.v2f16(<2 x half> %a)

; Scalar case: load one half from %a, round it with llvm.rint.f16 and store the
; result to %r. In the assertions below the tahiti output converts to f32,
; applies v_rndne_f32 and converts back to f16, whereas every other target
; emits a single v_rndne_f16.
define amdgpu_kernel void @rint_f16(
; SI-LABEL: rint_f16:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s10, s6
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_rndne_f32_e32 v0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; GFX89-LABEL: rint_f16:
; GFX89: ; %bb.0: ; %entry
; GFX89-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX89-NEXT: s_mov_b32 s7, 0xf000
; GFX89-NEXT: s_mov_b32 s6, -1
; GFX89-NEXT: s_mov_b32 s10, s6
; GFX89-NEXT: s_mov_b32 s11, s7
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: s_mov_b32 s8, s2
; GFX89-NEXT: s_mov_b32 s9, s3
; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0
; GFX89-NEXT: s_mov_b32 s4, s0
; GFX89-NEXT: s_mov_b32 s5, s1
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: v_rndne_f16_e32 v0, v0
; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: rint_f16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-NEXT: s_mov_b32 s10, s6
; GFX11-NEXT: s_mov_b32 s11, s7
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s8, s2
; GFX11-NEXT: s_mov_b32 s9, s3
; GFX11-NEXT: s_mov_b32 s4, s0
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
; GFX11-NEXT: s_mov_b32 s5, s1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_rndne_f16_e32 v0, v0
; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: rint_f16:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_mov_b32 s6, -1
; GFX12-NEXT: s_mov_b32 s7, 0x31016000
; GFX12-NEXT: s_mov_b32 s10, s6
; GFX12-NEXT: s_mov_b32 s11, s7
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mov_b32 s8, s2
; GFX12-NEXT: s_mov_b32 s9, s3
; GFX12-NEXT: s_mov_b32 s4, s0
; GFX12-NEXT: buffer_load_u16 v0, off, s[8:11], null
; GFX12-NEXT: s_mov_b32 s5, s1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_rndne_f16_e32 v0, v0
; GFX12-NEXT: buffer_store_b16 v0, off, s[4:7], null
; GFX12-NEXT: s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %a) {
entry:
  %a.val = load half, ptr addrspace(1) %a
  %r.val = call half @llvm.rint.f16(half %a.val)
  store half %r.val, ptr addrspace(1) %r
  ret void
}

; Vector case: the same load / llvm.rint / store sequence on <2 x half>. The
; assertions below show the two halves being rounded separately and then
; recombined into one dword: v_or_b32 on tahiti and fiji, v_pack_b32_f16 on
; gfx900, gfx1100 and gfx1200.
;
; The original test with manual checks also had these NOT directives:
; (They are kept under the COM prefix purely as a historical record, so
; FileCheck does not act on them.)
; COM: SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; COM: SI-NOT: v_and_b32
; COM: SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; COM: VI-DAG: v_rndne_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
; COM: VI-DAG: v_rndne_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; COM: VI-NOT: v_and_b32
; COM: VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
define amdgpu_kernel void @rint_v2f16(
; SI-LABEL: rint_v2f16:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s10, s6
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_rndne_f32_e32 v1, v1
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_rndne_f32_e32 v0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: rint_v2f16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_mov_b32 s10, s6
; VI-NEXT: s_mov_b32 s11, s7
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s8, s2
; VI-NEXT: s_mov_b32 s9, s3
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_rndne_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_rndne_f16_e32 v0, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: rint_v2f16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_mov_b32 s10, s6
; GFX9-NEXT: s_mov_b32 s11, s7
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s8, s2
; GFX9-NEXT: s_mov_b32 s9, s3
; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0
; GFX9-NEXT: s_mov_b32 s4, s0
; GFX9-NEXT: s_mov_b32 s5, s1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_rndne_f16_e32 v1, v0
; GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: rint_v2f16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-NEXT: s_mov_b32 s10, s6
; GFX11-NEXT: s_mov_b32 s11, s7
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s8, s2
; GFX11-NEXT: s_mov_b32 s9, s3
; GFX11-NEXT: s_mov_b32 s4, s0
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
; GFX11-NEXT: s_mov_b32 s5, s1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_rndne_f16_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rndne_f16_e32 v1, v1
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: rint_v2f16:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_mov_b32 s6, -1
; GFX12-NEXT: s_mov_b32 s7, 0x31016000
; GFX12-NEXT: s_mov_b32 s10, s6
; GFX12-NEXT: s_mov_b32 s11, s7
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mov_b32 s8, s2
; GFX12-NEXT: s_mov_b32 s9, s3
; GFX12-NEXT: s_mov_b32 s4, s0
; GFX12-NEXT: buffer_load_b32 v0, off, s[8:11], null
; GFX12-NEXT: s_mov_b32 s5, s1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX12-NEXT: v_rndne_f16_e32 v0, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_rndne_f16_e32 v1, v1
; GFX12-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX12-NEXT: buffer_store_b32 v0, off, s[4:7], null
; GFX12-NEXT: s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %a) {
entry:
  %a.val = load <2 x half>, ptr addrspace(1) %a
  %r.val = call <2 x half> @llvm.rint.v2f16(<2 x half> %a.val)
  store <2 x half> %r.val, ptr addrspace(1) %r
  ret void
}