; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s

; Codegen test for the llvm.floor intrinsic on half-precision operands: one
; kernel exercises the scalar half form and one the <2 x half> form. Each
; kernel is checked against the four llc configurations invoked above
; (tahiti, fiji, and gfx1100 with real-true16 enabled and with it disabled).
; The assertion comments inside the kernels are produced by
; utils/update_llc_test_checks.py (see the note on the first line), so they
; should be regenerated with that script instead of being edited by hand.

declare half @llvm.floor.f16(half %a)
declare <2 x half> @llvm.floor.v2f16(<2 x half> %a)

; Scalar case: load one half from %a, apply llvm.floor.f16, and store the
; result to %r. Per the assertions below, the tahiti run is expected to convert
; to f32, apply v_floor_f32 and convert back, while the fiji and gfx1100 runs
; are expected to use v_floor_f16 directly (operating on v0.l when real-true16
; is enabled, and on the full v0 register otherwise).
define amdgpu_kernel void @floor_f16(
; SI-LABEL: floor_f16:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s10, s6
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_floor_f32_e32 v0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: floor_f16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_mov_b32 s10, s6
; VI-NEXT: s_mov_b32 s11, s7
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s8, s2
; VI-NEXT: s_mov_b32 s9, s3
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_floor_f16_e32 v0, v0
; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: floor_f16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-NEXT: s_mov_b32 s10, s6
; GFX11-NEXT: s_mov_b32 s11, s7
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s8, s2
; GFX11-NEXT: s_mov_b32 s9, s3
; GFX11-NEXT: s_mov_b32 s4, s0
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
; GFX11-NEXT: s_mov_b32 s5, s1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_floor_f16_e32 v0.l, v0.l
; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: floor_f16:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_floor_f16_e32 v0, v0
; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-FAKE16-NEXT: s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %a) {
entry:
  %a.val = load half, ptr addrspace(1) %a
  %r.val = call half @llvm.floor.f16(half %a.val)
  store half %r.val, ptr addrspace(1) %r
  ret void
}

; Vector case: load a <2 x half> from %a, apply llvm.floor.v2f16, and store the
; result to %r. Per the assertions below, the tahiti run is expected to floor
; each half through f32 and recombine the halves with a shift and v_or_b32; the
; fiji run is expected to floor the high half with the SDWA form of v_floor_f16
; and v_or_b32 the two results; both gfx1100 runs are expected to shift out the
; high half, floor each half with v_floor_f16, and combine the results with
; v_pack_b32_f16.
;
; The original test with manual checks also had these NOT directives:
; COM: SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; COM: SI-NOT: and
; COM: SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; COM: VI-DAG: v_floor_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
; COM: VI-DAG: v_floor_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; COM: VI-NOT: and
; COM: VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
define amdgpu_kernel void @floor_v2f16(
; SI-LABEL: floor_v2f16:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s10, s6
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_floor_f32_e32 v1, v1
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_floor_f32_e32 v0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: floor_v2f16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_mov_b32 s10, s6
; VI-NEXT: s_mov_b32 s11, s7
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s8, s2
; VI-NEXT: s_mov_b32 s9, s3
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_floor_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_floor_f16_e32 v0, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: floor_v2f16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-NEXT: s_mov_b32 s10, s6
; GFX11-NEXT: s_mov_b32 s11, s7
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s8, s2
; GFX11-NEXT: s_mov_b32 s9, s3
; GFX11-NEXT: s_mov_b32 s4, s0
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
; GFX11-NEXT: s_mov_b32 s5, s1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_floor_f16_e32 v0.l, v0.l
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_floor_f16_e32 v0.h, v1.l
; GFX11-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: floor_v2f16:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0
; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-FAKE16-NEXT: v_floor_f16_e32 v0, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_floor_f16_e32 v1, v1
; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-FAKE16-NEXT: s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %a) {
entry:
  %a.val = load <2 x half>, ptr addrspace(1) %a
  %r.val = call <2 x half> @llvm.floor.v2f16(<2 x half> %a.val)
  store <2 x half> %r.val, ptr addrspace(1) %r
  ret void
}