; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s

; Code generation checks for llvm.experimental.constrained.fadd on half,
; <2 x half>, <3 x half> and <4 x half> in strictfp functions. Every call uses
; round.tonearest; the scalar and <2 x half> cases are repeated for the
; fpexcept.strict, fpexcept.ignore and fpexcept.maytrap exception behaviors.

; FIXME: promotion not handled without f16 insts

define half @v_constained_fadd_f16_fpexcept_strict(half %x, half %y) #0 {
; GCN-LABEL: v_constained_fadd_f16_fpexcept_strict:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_add_f16_e32 v0, v0, v1
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_constained_fadd_f16_fpexcept_strict:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    v_add_f16_e32 v0, v0, v1
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %val = call half @llvm.experimental.constrained.fadd.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
  ret half %val
}

define half @v_constained_fadd_f16_fpexcept_ignore(half %x, half %y) #0 {
; GCN-LABEL: v_constained_fadd_f16_fpexcept_ignore:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_add_f16_e32 v0, v0, v1
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_constained_fadd_f16_fpexcept_ignore:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    v_add_f16_e32 v0, v0, v1
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %val = call half @llvm.experimental.constrained.fadd.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.ignore")
  ret half %val
}

define half @v_constained_fadd_f16_fpexcept_maytrap(half %x, half %y) #0 {
; GCN-LABEL: v_constained_fadd_f16_fpexcept_maytrap:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_add_f16_e32 v0, v0, v1
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_constained_fadd_f16_fpexcept_maytrap:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    v_add_f16_e32 v0, v0, v1
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %val = call half @llvm.experimental.constrained.fadd.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap")
  ret half %val
}

define <2 x half> @v_constained_fadd_v2f16_fpexcept_strict(<2 x half> %x, <2 x half> %y) #0 {
; GFX9-LABEL: v_constained_fadd_v2f16_fpexcept_strict:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_pk_add_f16 v0, v0, v1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_constained_fadd_v2f16_fpexcept_strict:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_add_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_add_f16_e32 v0, v0, v1
; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_constained_fadd_v2f16_fpexcept_strict:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    v_pk_add_f16 v0, v0, v1
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %val = call <2 x half> @llvm.experimental.constrained.fadd.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
  ret <2 x half> %val
}

define <2 x half> @v_constained_fadd_v2f16_fpexcept_ignore(<2 x half> %x, <2 x half> %y) #0 {
; GFX9-LABEL: v_constained_fadd_v2f16_fpexcept_ignore:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_pk_add_f16 v0, v0, v1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_constained_fadd_v2f16_fpexcept_ignore:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_add_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_add_f16_e32 v0, v0, v1
; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_constained_fadd_v2f16_fpexcept_ignore:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    v_pk_add_f16 v0, v0, v1
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %val = call <2 x half> @llvm.experimental.constrained.fadd.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.ignore")
  ret <2 x half> %val
}

define <2 x half> @v_constained_fadd_v2f16_fpexcept_maytrap(<2 x half> %x, <2 x half> %y) #0 {
; GFX9-LABEL: v_constained_fadd_v2f16_fpexcept_maytrap:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_pk_add_f16 v0, v0, v1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_constained_fadd_v2f16_fpexcept_maytrap:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_add_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_add_f16_e32 v0, v0, v1
; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_constained_fadd_v2f16_fpexcept_maytrap:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    v_pk_add_f16 v0, v0, v1
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %val = call <2 x half> @llvm.experimental.constrained.fadd.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap")
  ret <2 x half> %val
}

define <3 x half> @v_constained_fadd_v3f16_fpexcept_strict(<3 x half> %x, <3 x half> %y) #0 {
; GFX9-LABEL: v_constained_fadd_v3f16_fpexcept_strict:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_pk_add_f16 v0, v0, v2
; GFX9-NEXT:    v_add_f16_e32 v1, v1, v3
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_constained_fadd_v3f16_fpexcept_strict:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_add_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_add_f16_e32 v0, v0, v2
; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
; GFX8-NEXT:    v_add_f16_e32 v1, v1, v3
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_constained_fadd_v3f16_fpexcept_strict:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    v_pk_add_f16 v0, v0, v2
; GFX10PLUS-NEXT:    v_add_f16_e32 v1, v1, v3
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %val = call <3 x half> @llvm.experimental.constrained.fadd.v3f16(<3 x half> %x, <3 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
  ret <3 x half> %val
}

; FIXME: Scalarized
define <4 x half> @v_constained_fadd_v4f16_fpexcept_strict(<4 x half> %x, <4 x half> %y) #0 {
; GFX9-LABEL: v_constained_fadd_v4f16_fpexcept_strict:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_f16_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NEXT:    v_add_f16_sdwa v5, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NEXT:    v_add_f16_e32 v1, v1, v3
; GFX9-NEXT:    v_add_f16_e32 v0, v0, v2
; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
; GFX9-NEXT:    v_perm_b32 v0, v5, v0, s4
; GFX9-NEXT:    v_perm_b32 v1, v4, v1, s4
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_constained_fadd_v4f16_fpexcept_strict:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_add_f16_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_add_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_add_f16_e32 v1, v1, v3
; GFX8-NEXT:    v_add_f16_e32 v0, v0, v2
; GFX8-NEXT:    v_or_b32_e32 v0, v0, v5
; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_constained_fadd_v4f16_fpexcept_strict:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_add_f16_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-NEXT:    v_add_f16_sdwa v5, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-NEXT:    v_add_f16_e32 v0, v0, v2
; GFX10-NEXT:    v_add_f16_e32 v1, v1, v3
; GFX10-NEXT:    v_perm_b32 v0, v5, v0, 0x5040100
; GFX10-NEXT:    v_perm_b32 v1, v4, v1, 0x5040100
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_constained_fadd_v4f16_fpexcept_strict:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
; GFX11-NEXT:    v_add_f16_e32 v1, v1, v3
; GFX11-NEXT:    v_add_f16_e32 v0, v0, v2
; GFX11-NEXT:    v_add_f16_e32 v2, v6, v5
; GFX11-NEXT:    v_add_f16_e32 v3, v7, v4
; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
; GFX11-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %val = call <4 x half> @llvm.experimental.constrained.fadd.v4f16(<4 x half> %x, <4 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
  ret <4 x half> %val
}

; The remaining functions repeat the strict scalar and <2 x half> cases in
; amdgpu_ps functions whose arguments are marked inreg.
define amdgpu_ps half @s_constained_fadd_f16_fpexcept_strict(half inreg %x, half inreg %y) #0 {
; GCN-LABEL: s_constained_fadd_f16_fpexcept_strict:
; GCN:       ; %bb.0:
; GCN-NEXT:    v_mov_b32_e32 v0, s3
; GCN-NEXT:    v_add_f16_e32 v0, s2, v0
; GCN-NEXT:    ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_constained_fadd_f16_fpexcept_strict:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    v_add_f16_e64 v0, s2, s3
; GFX10PLUS-NEXT:    ; return to shader part epilog
  %val = call half @llvm.experimental.constrained.fadd.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
  ret half %val
}

define amdgpu_ps <2 x half> @s_constained_fadd_v2f16_fpexcept_strict(<2 x half> inreg %x, <2 x half> inreg %y) #0 {
; GFX9-LABEL: s_constained_fadd_v2f16_fpexcept_strict:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v0, s3
; GFX9-NEXT:    v_pk_add_f16 v0, s2, v0
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_constained_fadd_v2f16_fpexcept_strict:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_lshr_b32 s0, s3, 16
; GFX8-NEXT:    s_lshr_b32 s1, s2, 16
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_add_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_add_f16_e32 v1, s2, v1
; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_constained_fadd_v2f16_fpexcept_strict:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    v_pk_add_f16 v0, s2, s3
; GFX10PLUS-NEXT:    ; return to shader part epilog
  %val = call <2 x half> @llvm.experimental.constrained.fadd.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
  ret <2 x half> %val
}

declare half @llvm.experimental.constrained.fadd.f16(half, half, metadata, metadata) #1
declare <2 x half> @llvm.experimental.constrained.fadd.v2f16(<2 x half>, <2 x half>, metadata, metadata) #1
declare <3 x half> @llvm.experimental.constrained.fadd.v3f16(<3 x half>, <3 x half>, metadata, metadata) #1
declare <4 x half> @llvm.experimental.constrained.fadd.v4f16(<4 x half>, <4 x half>, metadata, metadata) #1

attributes #0 = { strictfp }
attributes #1 = { inaccessiblememonly nounwind willreturn }