; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=CHECK,SDAG %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=CHECK,SDAG %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=CHECK,GISEL %s

; Tests lowering of the llvm.fptrunc.round intrinsic (fptrunc with an explicit
; rounding-mode metadata operand) for both SelectionDAG and GlobalISel.
; As the assertions below show, non-default rounding modes are selected by
; writing the relevant bit(s) of HW_REG_MODE with s_setreg_imm32_b32 before the
; conversion instruction, while the "round.tonearest" cases emit no mode write
; at all. NOTE(review): there is no GlobalISel run line for gfx1010 —
; presumably intentional, but worth confirming.

; --- Scalar f32 -> f16, one conversion per rounding mode. ---

define amdgpu_gs half @v_fptrunc_round_f32_to_f16_tonearest(float %a) {
; CHECK-LABEL: v_fptrunc_round_f32_to_f16_tonearest:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    v_cvt_f16_f32_e32 v0, v0
; CHECK-NEXT:    ; return to shader part epilog
  %res = call half @llvm.fptrunc.round.f16.f32(float %a, metadata !"round.tonearest")
  ret half %res
}

define amdgpu_gs half @v_fptrunc_round_f32_to_f16_upward(float %a) {
; CHECK-LABEL: v_fptrunc_round_f32_to_f16_upward:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
; CHECK-NEXT:    v_cvt_f16_f32_e32 v0, v0
; CHECK-NEXT:    ; return to shader part epilog
  %res = call half @llvm.fptrunc.round.f16.f32(float %a, metadata !"round.upward")
  ret half %res
}

define amdgpu_gs half @v_fptrunc_round_f32_to_f16_downward(float %a) {
; CHECK-LABEL: v_fptrunc_round_f32_to_f16_downward:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
; CHECK-NEXT:    v_cvt_f16_f32_e32 v0, v0
; CHECK-NEXT:    ; return to shader part epilog
  %res = call half @llvm.fptrunc.round.f16.f32(float %a, metadata !"round.downward")
  ret half %res
}

define amdgpu_gs half @v_fptrunc_round_f32_to_f16_towardzero(float %a) {
; CHECK-LABEL: v_fptrunc_round_f32_to_f16_towardzero:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 3
; CHECK-NEXT:    v_cvt_f16_f32_e32 v0, v0
; CHECK-NEXT:    ; return to shader part epilog
  %res = call half @llvm.fptrunc.round.f16.f32(float %a, metadata !"round.towardzero")
  ret half %res
}

; --- Multiple conversions with mixed modes: the mode register is switched
; --- between conversions and the changed bits are written back to 0 at the
; --- end, as the s_setreg sequences below show.

define amdgpu_gs void @v_fptrunc_round_f32_to_f16_upward_multiple_calls(float %a, float %b, ptr addrspace(1) %out) {
; CHECK-LABEL: v_fptrunc_round_f32_to_f16_upward_multiple_calls:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
; CHECK-NEXT:    v_cvt_f16_f32_e32 v0, v0
; CHECK-NEXT:    v_cvt_f16_f32_e32 v4, v1
; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
; CHECK-NEXT:    v_cvt_f16_f32_e32 v1, v1
; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0
; CHECK-NEXT:    v_add_f16_e32 v0, v0, v4
; CHECK-NEXT:    v_add_f16_e32 v0, v1, v0
; CHECK-NEXT:    global_store_short v[2:3], v0, off
; CHECK-NEXT:    s_endpgm
  %res1 = call half @llvm.fptrunc.round.f16.f32(float %a, metadata !"round.upward")
  %res2 = call half @llvm.fptrunc.round.f16.f32(float %b, metadata !"round.upward")
  %res3 = call half @llvm.fptrunc.round.f16.f32(float %b, metadata !"round.downward")
  %res4 = fadd half %res1, %res2
  %res5 = fadd half %res3, %res4
  store half %res5, ptr addrspace(1) %out, align 4
  ret void
}

define amdgpu_gs void @v_fptrunc_round_f32_to_f16_downward_multiple_calls(float %a, float %b, ptr addrspace(1) %out) {
; CHECK-LABEL: v_fptrunc_round_f32_to_f16_downward_multiple_calls:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
; CHECK-NEXT:    v_cvt_f16_f32_e32 v4, v0
; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
; CHECK-NEXT:    v_cvt_f16_f32_e32 v0, v0
; CHECK-NEXT:    v_cvt_f16_f32_e32 v1, v1
; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0
; CHECK-NEXT:    v_add_f16_e32 v0, v4, v0
; CHECK-NEXT:    v_add_f16_e32 v0, v1, v0
; CHECK-NEXT:    global_store_short v[2:3], v0, off
; CHECK-NEXT:    s_endpgm
  %res1 = call half @llvm.fptrunc.round.f16.f32(float %a, metadata !"round.upward")
  %res2 = call half @llvm.fptrunc.round.f16.f32(float %a, metadata !"round.downward")
  %res3 = call half @llvm.fptrunc.round.f16.f32(float %b, metadata !"round.downward")
  %res4 = fadd half %res1, %res2
  %res5 = fadd half %res3, %res4
  store half %res5, ptr addrspace(1) %out, align 4
  ret void
}

define amdgpu_gs void @v_fptrunc_round_f32_to_f16_towardzero_multiple_calls(float %a, float %b, ptr addrspace(1) %out) {
; CHECK-LABEL: v_fptrunc_round_f32_to_f16_towardzero_multiple_calls:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 3
; CHECK-NEXT:    v_cvt_f16_f32_e32 v0, v0
; CHECK-NEXT:    v_cvt_f16_f32_e32 v4, v1
; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 1
; CHECK-NEXT:    v_cvt_f16_f32_e32 v1, v1
; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 0
; CHECK-NEXT:    v_add_f16_e32 v0, v0, v4
; CHECK-NEXT:    v_add_f16_e32 v0, v1, v0
; CHECK-NEXT:    global_store_short v[2:3], v0, off
; CHECK-NEXT:    s_endpgm
  %res1 = call half @llvm.fptrunc.round.f16.f32(float %a, metadata !"round.towardzero")
  %res2 = call half @llvm.fptrunc.round.f16.f32(float %b, metadata !"round.towardzero")
  %res3 = call half @llvm.fptrunc.round.f16.f32(float %b, metadata !"round.upward")
  %res4 = fadd half %res1, %res2
  %res5 = fadd half %res3, %res4
  store half %res5, ptr addrspace(1) %out, align 4
  ret void
}

; --- SGPR (inreg) inputs: the value is moved to a VGPR for the conversion
; --- and the result is read back with v_readfirstlane_b32.

define amdgpu_gs i32 @s_fptrunc_round_f32_to_f16_upward(float inreg %a, ptr addrspace(1) %out) {
; CHECK-LABEL: s_fptrunc_round_f32_to_f16_upward:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    v_mov_b32_e32 v0, s0
; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
; CHECK-NEXT:    v_cvt_f16_f32_e32 v0, v0
; CHECK-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
; CHECK-NEXT:    ; return to shader part epilog
  %res = call half @llvm.fptrunc.round.f16.f32(float %a, metadata !"round.upward")
  %bitcast = bitcast half %res to i16
  %ret = zext i16 %bitcast to i32
  ret i32 %ret
}

define amdgpu_gs i32 @s_fptrunc_round_f32_to_f16_downward(float inreg %a, ptr addrspace(1) %out) {
; CHECK-LABEL: s_fptrunc_round_f32_to_f16_downward:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    v_mov_b32_e32 v0, s0
; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
; CHECK-NEXT:    v_cvt_f16_f32_e32 v0, v0
; CHECK-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
; CHECK-NEXT:    ; return to shader part epilog
  %res = call half @llvm.fptrunc.round.f16.f32(float %a, metadata !"round.downward")
  %bitcast = bitcast half %res to i16
  %ret = zext i16 %bitcast to i32
  ret i32 %ret
}

define amdgpu_gs void @s_fptrunc_round_f32_to_f16_upward_multiple_calls(float inreg %a, float inreg %b, ptr addrspace(1) %out) {
; CHECK-LABEL: s_fptrunc_round_f32_to_f16_upward_multiple_calls:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    v_mov_b32_e32 v2, s0
; CHECK-NEXT:    v_mov_b32_e32 v3, s1
; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
; CHECK-NEXT:    v_cvt_f16_f32_e32 v2, v2
; CHECK-NEXT:    v_cvt_f16_f32_e32 v4, v3
; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
; CHECK-NEXT:    v_cvt_f16_f32_e32 v3, v3
; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0
; CHECK-NEXT:    v_add_f16_e32 v2, v2, v4
; CHECK-NEXT:    v_add_f16_e32 v2, v3, v2
; CHECK-NEXT:    global_store_short v[0:1], v2, off
; CHECK-NEXT:    s_endpgm
  %res1 = call half @llvm.fptrunc.round.f16.f32(float %a, metadata !"round.upward")
  %res2 = call half @llvm.fptrunc.round.f16.f32(float %b, metadata !"round.upward")
  %res3 = call half @llvm.fptrunc.round.f16.f32(float %b, metadata !"round.downward")
  %res4 = fadd half %res1, %res2
  %res5 = fadd half %res3, %res4
  store half %res5, ptr addrspace(1) %out, align 4
  ret void
}

; --- Vector variants. SelectionDAG and GlobalISel differ only in how the
; --- two f16 halves are repacked (v_perm_b32 vs v_pack_b32_f16), so these
; --- use separate assertion prefixes.

define amdgpu_gs <2 x half> @v_fptrunc_round_v2f32_to_v2f16_upward(<2 x float> %a) {
; SDAG-LABEL: v_fptrunc_round_v2f32_to_v2f16_upward:
; SDAG:       ; %bb.0:
; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
; SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SDAG-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
; SDAG-NEXT:    ; return to shader part epilog
;
; GISEL-LABEL: v_fptrunc_round_v2f32_to_v2f16_upward:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
; GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
; GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
; GISEL-NEXT:    ; return to shader part epilog
  %res = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %a, metadata !"round.upward")
  ret <2 x half> %res
}

define amdgpu_gs <2 x half> @v_fptrunc_round_v2f32_to_v2f16_downward(<2 x float> %a) {
; SDAG-LABEL: v_fptrunc_round_v2f32_to_v2f16_downward:
; SDAG:       ; %bb.0:
; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
; SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SDAG-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
; SDAG-NEXT:    ; return to shader part epilog
;
; GISEL-LABEL: v_fptrunc_round_v2f32_to_v2f16_downward:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
; GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
; GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
; GISEL-NEXT:    ; return to shader part epilog
  %res = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %a, metadata !"round.downward")
  ret <2 x half> %res
}

define amdgpu_gs void @v_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls(<2 x float> %a, <2 x float> %b, ptr addrspace(1) %out) {
; SDAG-LABEL: v_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls:
; SDAG:       ; %bb.0:
; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
; SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SDAG-NEXT:    v_cvt_f16_f32_e32 v6, v2
; SDAG-NEXT:    v_cvt_f16_f32_e32 v7, v3
; SDAG-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
; SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v2
; SDAG-NEXT:    v_cvt_f16_f32_e32 v2, v3
; SDAG-NEXT:    v_perm_b32 v3, v7, v6, 0x5040100
; SDAG-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0
; SDAG-NEXT:    v_pk_add_f16 v0, v0, v3
; SDAG-NEXT:    v_pk_add_f16 v0, v1, v0
; SDAG-NEXT:    global_store_dword v[4:5], v0, off
; SDAG-NEXT:    s_endpgm
;
; GISEL-LABEL: v_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
; GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
; GISEL-NEXT:    v_cvt_f16_f32_e32 v6, v2
; GISEL-NEXT:    v_cvt_f16_f32_e32 v7, v3
; GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
; GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v2
; GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v3
; GISEL-NEXT:    v_pack_b32_f16 v3, v6, v7
; GISEL-NEXT:    v_pack_b32_f16 v1, v1, v2
; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0
; GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
; GISEL-NEXT:    v_pk_add_f16 v0, v1, v0
; GISEL-NEXT:    global_store_dword v[4:5], v0, off
; GISEL-NEXT:    s_endpgm
  %res1 = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %a, metadata !"round.upward")
  %res2 = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %b, metadata !"round.upward")
  %res3 = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %b, metadata !"round.downward")
  %res4 = fadd <2 x half> %res1, %res2
  %res5 = fadd <2 x half> %res3, %res4
  store <2 x half> %res5, ptr addrspace(1) %out, align 4
  ret void
}

define amdgpu_gs <2 x i32> @s_fptrunc_round_v2f32_to_v2f16_upward(<2 x float> inreg %a, ptr addrspace(1) %out) {
; CHECK-LABEL: s_fptrunc_round_v2f32_to_v2f16_upward:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    v_mov_b32_e32 v0, s0
; CHECK-NEXT:    v_mov_b32_e32 v1, s1
; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
; CHECK-NEXT:    v_cvt_f16_f32_e32 v0, v0
; CHECK-NEXT:    v_cvt_f16_f32_e32 v1, v1
; CHECK-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; CHECK-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
; CHECK-NEXT:    v_readfirstlane_b32 s1, v1
; CHECK-NEXT:    ; return to shader part epilog
  %res = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %a, metadata !"round.upward")
  %bitcast = bitcast <2 x half> %res to <2 x i16>
  %ret = zext <2 x i16> %bitcast to <2 x i32>
  ret <2 x i32> %ret
}

define amdgpu_gs <2 x i32> @s_fptrunc_round_v2f32_to_v2f16_downward(<2 x float> inreg %a, ptr addrspace(1) %out) {
; CHECK-LABEL: s_fptrunc_round_v2f32_to_v2f16_downward:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    v_mov_b32_e32 v0, s0
; CHECK-NEXT:    v_mov_b32_e32 v1, s1
; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
; CHECK-NEXT:    v_cvt_f16_f32_e32 v0, v0
; CHECK-NEXT:    v_cvt_f16_f32_e32 v1, v1
; CHECK-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; CHECK-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
; CHECK-NEXT:    v_readfirstlane_b32 s1, v1
; CHECK-NEXT:    ; return to shader part epilog
  %res = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %a, metadata !"round.downward")
  %bitcast = bitcast <2 x half> %res to <2 x i16>
  %ret = zext <2 x i16> %bitcast to <2 x i32>
  ret <2 x i32> %ret
}

define amdgpu_gs void @s_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls(<2 x float> inreg %a, <2 x float> inreg %b, ptr addrspace(1) %out) {
; SDAG-LABEL: s_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls:
; SDAG:       ; %bb.0:
; SDAG-NEXT:    v_mov_b32_e32 v2, s0
; SDAG-NEXT:    v_mov_b32_e32 v3, s2
; SDAG-NEXT:    v_mov_b32_e32 v4, s1
; SDAG-NEXT:    v_mov_b32_e32 v5, s3
; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
; SDAG-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SDAG-NEXT:    v_cvt_f16_f32_e32 v6, v3
; SDAG-NEXT:    v_cvt_f16_f32_e32 v4, v4
; SDAG-NEXT:    v_cvt_f16_f32_e32 v7, v5
; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
; SDAG-NEXT:    v_cvt_f16_f32_e32 v3, v3
; SDAG-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; SDAG-NEXT:    v_and_b32_e32 v6, 0xffff, v6
; SDAG-NEXT:    v_and_b32_e32 v3, 0xffff, v3
; SDAG-NEXT:    v_lshl_or_b32 v2, v4, 16, v2
; SDAG-NEXT:    v_cvt_f16_f32_e32 v4, v5
; SDAG-NEXT:    v_lshl_or_b32 v5, v7, 16, v6
; SDAG-NEXT:    v_lshl_or_b32 v3, v4, 16, v3
; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0
; SDAG-NEXT:    v_pk_add_f16 v2, v2, v5
; SDAG-NEXT:    v_pk_add_f16 v2, v3, v2
; SDAG-NEXT:    global_store_dword v[0:1], v2, off
; SDAG-NEXT:    s_endpgm
;
; GISEL-LABEL: s_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    v_mov_b32_e32 v2, s0
; GISEL-NEXT:    v_mov_b32_e32 v3, s1
; GISEL-NEXT:    v_mov_b32_e32 v4, s2
; GISEL-NEXT:    v_mov_b32_e32 v5, s3
; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
; GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
; GISEL-NEXT:    v_cvt_f16_f32_e32 v3, v3
; GISEL-NEXT:    v_cvt_f16_f32_e32 v6, v4
; GISEL-NEXT:    v_cvt_f16_f32_e32 v7, v5
; GISEL-NEXT:    v_pack_b32_f16 v2, v2, v3
; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
; GISEL-NEXT:    v_cvt_f16_f32_e32 v3, v4
; GISEL-NEXT:    v_cvt_f16_f32_e32 v4, v5
; GISEL-NEXT:    v_pack_b32_f16 v5, v6, v7
; GISEL-NEXT:    v_pack_b32_f16 v3, v3, v4
; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0
; GISEL-NEXT:    v_pk_add_f16 v2, v2, v5
; GISEL-NEXT:    v_pk_add_f16 v2, v3, v2
; GISEL-NEXT:    global_store_dword v[0:1], v2, off
; GISEL-NEXT:    s_endpgm
  %res1 = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %a, metadata !"round.upward")
  %res2 = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %b, metadata !"round.upward")
  %res3 = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %b, metadata !"round.downward")
  %res4 = fadd <2 x half> %res1, %res2
  %res5 = fadd <2 x half> %res3, %res4
  store <2 x half> %res5, ptr addrspace(1) %out, align 4
  ret void
}

; --- Wider vectors (v3, v4, v8): one mode write covers all the lanes'
; --- conversions.

define amdgpu_gs <3 x half> @v_fptrunc_round_v3f32_to_v3f16_upward(<3 x float> %a) {
; SDAG-LABEL: v_fptrunc_round_v3f32_to_v3f16_upward:
; SDAG:       ; %bb.0:
; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
; SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SDAG-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
; SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v2
; SDAG-NEXT:    ; return to shader part epilog
;
; GISEL-LABEL: v_fptrunc_round_v3f32_to_v3f16_upward:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
; GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
; GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
; GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v2
; GISEL-NEXT:    ; return to shader part epilog
  %res = call <3 x half> @llvm.fptrunc.round.v3f16.v3f32(<3 x float> %a, metadata !"round.upward")
  ret <3 x half> %res
}

define amdgpu_gs <3 x half> @v_fptrunc_round_v3f32_to_v3f16_downward(<3 x float> %a) {
; SDAG-LABEL: v_fptrunc_round_v3f32_to_v3f16_downward:
; SDAG:       ; %bb.0:
; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
; SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SDAG-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
; SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v2
; SDAG-NEXT:    ; return to shader part epilog
;
; GISEL-LABEL: v_fptrunc_round_v3f32_to_v3f16_downward:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
; GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
; GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
; GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v2
; GISEL-NEXT:    ; return to shader part epilog
  %res = call <3 x half> @llvm.fptrunc.round.v3f16.v3f32(<3 x float> %a, metadata !"round.downward")
  ret <3 x half> %res
}

define amdgpu_gs <4 x half> @v_fptrunc_round_v4f32_to_v4f16_upward(<4 x float> %a) {
; SDAG-LABEL: v_fptrunc_round_v4f32_to_v4f16_upward:
; SDAG:       ; %bb.0:
; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
; SDAG-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SDAG-NEXT:    v_cvt_f16_f32_e32 v3, v3
; SDAG-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
; SDAG-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
; SDAG-NEXT:    ; return to shader part epilog
;
; GISEL-LABEL: v_fptrunc_round_v4f32_to_v4f16_upward:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
; GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
; GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
; GISEL-NEXT:    v_cvt_f16_f32_e32 v3, v3
; GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
; GISEL-NEXT:    v_pack_b32_f16 v1, v2, v3
; GISEL-NEXT:    ; return to shader part epilog
  %res = call <4 x half> @llvm.fptrunc.round.v4f16.v4f32(<4 x float> %a, metadata !"round.upward")
  ret <4 x half> %res
}

define amdgpu_gs <4 x half> @v_fptrunc_round_v4f32_to_v4f16_downward(<4 x float> %a) {
; SDAG-LABEL: v_fptrunc_round_v4f32_to_v4f16_downward:
; SDAG:       ; %bb.0:
; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
; SDAG-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SDAG-NEXT:    v_cvt_f16_f32_e32 v3, v3
; SDAG-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
; SDAG-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
; SDAG-NEXT:    ; return to shader part epilog
;
; GISEL-LABEL: v_fptrunc_round_v4f32_to_v4f16_downward:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
; GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
; GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
; GISEL-NEXT:    v_cvt_f16_f32_e32 v3, v3
; GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
; GISEL-NEXT:    v_pack_b32_f16 v1, v2, v3
; GISEL-NEXT:    ; return to shader part epilog
  %res = call <4 x half> @llvm.fptrunc.round.v4f16.v4f32(<4 x float> %a, metadata !"round.downward")
  ret <4 x half> %res
}

define amdgpu_gs <8 x half> @v_fptrunc_round_v8f32_to_v8f16_upward(<8 x float> %a) {
; SDAG-LABEL: v_fptrunc_round_v8f32_to_v8f16_upward:
; SDAG:       ; %bb.0:
; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
; SDAG-NEXT:    v_cvt_f16_f32_e32 v6, v6
; SDAG-NEXT:    v_cvt_f16_f32_e32 v4, v4
; SDAG-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SDAG-NEXT:    v_cvt_f16_f32_e32 v3, v3
; SDAG-NEXT:    v_cvt_f16_f32_e32 v5, v5
; SDAG-NEXT:    v_cvt_f16_f32_e32 v7, v7
; SDAG-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
; SDAG-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
; SDAG-NEXT:    v_perm_b32 v2, v5, v4, 0x5040100
; SDAG-NEXT:    v_perm_b32 v3, v7, v6, 0x5040100
; SDAG-NEXT:    ; return to shader part epilog
;
; GISEL-LABEL: v_fptrunc_round_v8f32_to_v8f16_upward:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
; GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
; GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
; GISEL-NEXT:    v_cvt_f16_f32_e32 v3, v3
; GISEL-NEXT:    v_cvt_f16_f32_e32 v4, v4
; GISEL-NEXT:    v_cvt_f16_f32_e32 v5, v5
; GISEL-NEXT:    v_cvt_f16_f32_e32 v6, v6
; GISEL-NEXT:    v_cvt_f16_f32_e32 v7, v7
; GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
; GISEL-NEXT:    v_pack_b32_f16 v1, v2, v3
; GISEL-NEXT:    v_pack_b32_f16 v2, v4, v5
; GISEL-NEXT:    v_pack_b32_f16 v3, v6, v7
; GISEL-NEXT:    ; return to shader part epilog
  %res = call <8 x half> @llvm.fptrunc.round.v8f16.v8f32(<8 x float> %a, metadata !"round.upward")
  ret <8 x half> %res
}

define amdgpu_gs <8 x half> @v_fptrunc_round_v8f32_to_v8f16_downward(<8 x float> %a) {
; SDAG-LABEL: v_fptrunc_round_v8f32_to_v8f16_downward:
; SDAG:       ; %bb.0:
; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
; SDAG-NEXT:    v_cvt_f16_f32_e32 v6, v6
; SDAG-NEXT:    v_cvt_f16_f32_e32 v4, v4
; SDAG-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SDAG-NEXT:    v_cvt_f16_f32_e32 v3, v3
; SDAG-NEXT:    v_cvt_f16_f32_e32 v5, v5
; SDAG-NEXT:    v_cvt_f16_f32_e32 v7, v7
; SDAG-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
; SDAG-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
; SDAG-NEXT:    v_perm_b32 v2, v5, v4, 0x5040100
; SDAG-NEXT:    v_perm_b32 v3, v7, v6, 0x5040100
; SDAG-NEXT:    ; return to shader part epilog
;
; GISEL-LABEL: v_fptrunc_round_v8f32_to_v8f16_downward:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
; GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
; GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
; GISEL-NEXT:    v_cvt_f16_f32_e32 v3, v3
; GISEL-NEXT:    v_cvt_f16_f32_e32 v4, v4
; GISEL-NEXT:    v_cvt_f16_f32_e32 v5, v5
; GISEL-NEXT:    v_cvt_f16_f32_e32 v6, v6
; GISEL-NEXT:    v_cvt_f16_f32_e32 v7, v7
; GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
; GISEL-NEXT:    v_pack_b32_f16 v1, v2, v3
; GISEL-NEXT:    v_pack_b32_f16 v2, v4, v5
; GISEL-NEXT:    v_pack_b32_f16 v3, v6, v7
; GISEL-NEXT:    ; return to shader part epilog
  %res = call <8 x half> @llvm.fptrunc.round.v8f16.v8f32(<8 x float> %a, metadata !"round.downward")
  ret <8 x half> %res
}

; --- f64 -> f32: same mode-bit handling, using v_cvt_f32_f64. ---

define amdgpu_gs float @v_fptrunc_round_f64_to_f32_tonearest(double %a) {
; CHECK-LABEL: v_fptrunc_round_f64_to_f32_tonearest:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
; CHECK-NEXT:    ; return to shader part epilog
  %res = call float @llvm.fptrunc.round.f32.f64(double %a, metadata !"round.tonearest")
  ret float %res
}

define amdgpu_gs float @v_fptrunc_round_f64_to_f32_upward(double %a) {
; CHECK-LABEL: v_fptrunc_round_f64_to_f32_upward:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
; CHECK-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
; CHECK-NEXT:    ; return to shader part epilog
  %res = call float @llvm.fptrunc.round.f32.f64(double %a, metadata !"round.upward")
  ret float %res
}

define amdgpu_gs float @v_fptrunc_round_f64_to_f32_downward(double %a) {
; CHECK-LABEL: v_fptrunc_round_f64_to_f32_downward:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
; CHECK-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
; CHECK-NEXT:    ; return to shader part epilog
  %res = call float @llvm.fptrunc.round.f32.f64(double %a, metadata !"round.downward")
  ret float %res
}

define amdgpu_gs float @v_fptrunc_round_f64_to_f32_towardzero(double %a) {
; CHECK-LABEL: v_fptrunc_round_f64_to_f32_towardzero:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 3
; CHECK-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
; CHECK-NEXT:    ; return to shader part epilog
  %res = call float @llvm.fptrunc.round.f32.f64(double %a, metadata !"round.towardzero")
  ret float %res
}