; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-GISEL %s

; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8,GFX8-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8,GFX8-GISEL %s

; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10-GISEL %s

; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10PLUS-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10PLUS-GISEL %s

; This file checks AMDGPU code generation for the
; llvm.experimental.constrained.fsub intrinsic on half, <2 x half>,
; <3 x half> and <4 x half> operands. Each llc invocation above selects one
; -mcpu value together with -global-isel=0 or -global-isel=1 and lists the
; FileCheck prefixes its output is matched against. Note that the
; GFX10PLUS-SDAG and GFX10PLUS-GISEL prefixes are listed only by the gfx1100
; invocations, while the plain GFX10PLUS prefix is shared by gfx1010 and
; gfx1100.
;
; FIXME: promotion not handled without f16 insts
; Reviewer note (unconfirmed): this is presumably why no invocation above uses
; a subtarget without f16 instructions; verify before adding one.

; Scalar half operands passed as ordinary arguments (v0 and v1 in the checks),
; rounding mode round.tonearest, exception behavior fpexcept.strict.
define half @v_constained_fsub_f16_fpexcept_strict(half %x, half %y) #0 {
; GCN-LABEL: v_constained_fsub_f16_fpexcept_strict:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_sub_f16_e32 v0, v0, v1
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_constained_fsub_f16_fpexcept_strict:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    v_sub_f16_e32 v0, v0, v1
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %val = call half @llvm.experimental.constrained.fsub.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
  ret half %val
}

; Scalar half operands, exception behavior fpexcept.ignore. The expected
; output is the same instruction sequence as for fpexcept.strict.
define half @v_constained_fsub_f16_fpexcept_ignore(half %x, half %y) #0 {
; GCN-LABEL: v_constained_fsub_f16_fpexcept_ignore:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_sub_f16_e32 v0, v0, v1
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_constained_fsub_f16_fpexcept_ignore:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    v_sub_f16_e32 v0, v0, v1
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %val = call half @llvm.experimental.constrained.fsub.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.ignore")
  ret half %val
}

; Scalar half operands, exception behavior fpexcept.maytrap. The expected
; output is the same instruction sequence as for fpexcept.strict.
define half @v_constained_fsub_f16_fpexcept_maytrap(half %x, half %y) #0 {
; GCN-LABEL: v_constained_fsub_f16_fpexcept_maytrap:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_sub_f16_e32 v0, v0, v1
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_constained_fsub_f16_fpexcept_maytrap:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    v_sub_f16_e32 v0, v0, v1
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %val = call half @llvm.experimental.constrained.fsub.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap")
  ret half %val
}

; <2 x half> operands, exception behavior fpexcept.strict. In the checks
; below the -global-isel=1 runs for gfx900, gfx1010 and gfx1100 expect a
; single v_pk_add_f16 with neg_lo/neg_hi set on the second operand; every
; other configuration expects two 16-bit operations whose results are then
; combined into one register.
define <2 x half> @v_constained_fsub_v2f16_fpexcept_strict(<2 x half> %x, <2 x half> %y) #0 {
; GFX9-SDAG-LABEL: v_constained_fsub_v2f16_fpexcept_strict:
; GFX9-SDAG:       ; %bb.0:
; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-SDAG-NEXT:    v_sub_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-SDAG-NEXT:    v_sub_f16_e32 v0, v0, v1
; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0x5040100
; GFX9-SDAG-NEXT:    v_perm_b32 v0, v2, v0, s4
; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: v_constained_fsub_v2f16_fpexcept_strict:
; GFX9-GISEL:       ; %bb.0:
; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-GISEL-NEXT:    v_pk_add_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-SDAG-LABEL: v_constained_fsub_v2f16_fpexcept_strict:
; GFX8-SDAG:       ; %bb.0:
; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-SDAG-NEXT:    v_sub_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-SDAG-NEXT:    v_sub_f16_e32 v0, v0, v1
; GFX8-SDAG-NEXT:    v_or_b32_e32 v0, v0, v2
; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-GISEL-LABEL: v_constained_fsub_v2f16_fpexcept_strict:
; GFX8-GISEL:       ; %bb.0:
; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-GISEL-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
; GFX8-GISEL-NEXT:    v_add_f16_e32 v2, v0, v1
; GFX8-GISEL-NEXT:    v_add_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: v_constained_fsub_v2f16_fpexcept_strict:
; GFX10-SDAG:       ; %bb.0:
; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SDAG-NEXT:    v_sub_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-SDAG-NEXT:    v_sub_f16_e32 v0, v0, v1
; GFX10-SDAG-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-GISEL-LABEL: v_constained_fsub_v2f16_fpexcept_strict:
; GFX10-GISEL:       ; %bb.0:
; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-GISEL-NEXT:    v_pk_add_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-SDAG-LABEL: v_constained_fsub_v2f16_fpexcept_strict:
; GFX10PLUS-SDAG:       ; %bb.0:
; GFX10PLUS-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-SDAG-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
; GFX10PLUS-SDAG-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
; GFX10PLUS-SDAG-NEXT:    v_sub_f16_e32 v0, v0, v1
; GFX10PLUS-SDAG-NEXT:    v_sub_f16_e32 v2, v3, v2
; GFX10PLUS-SDAG-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
; GFX10PLUS-SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-GISEL-LABEL: v_constained_fsub_v2f16_fpexcept_strict:
; GFX10PLUS-GISEL:       ; %bb.0:
; GFX10PLUS-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-GISEL-NEXT:    v_pk_add_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
; GFX10PLUS-GISEL-NEXT:    s_setpc_b64 s[30:31]
  %val = call <2 x half> @llvm.experimental.constrained.fsub.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
  ret <2 x half> %val
}

; <2 x half> operands, exception behavior fpexcept.ignore. The expected
; instruction sequences match the fpexcept.strict <2 x half> test.
define <2 x half> @v_constained_fsub_v2f16_fpexcept_ignore(<2 x half> %x, <2 x half> %y) #0 {
; GFX9-SDAG-LABEL: v_constained_fsub_v2f16_fpexcept_ignore:
; GFX9-SDAG:       ; %bb.0:
; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-SDAG-NEXT:    v_sub_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-SDAG-NEXT:    v_sub_f16_e32 v0, v0, v1
; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0x5040100
; GFX9-SDAG-NEXT:    v_perm_b32 v0, v2, v0, s4
; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: v_constained_fsub_v2f16_fpexcept_ignore:
; GFX9-GISEL:       ; %bb.0:
; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-GISEL-NEXT:    v_pk_add_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-SDAG-LABEL: v_constained_fsub_v2f16_fpexcept_ignore:
; GFX8-SDAG:       ; %bb.0:
; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-SDAG-NEXT:    v_sub_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-SDAG-NEXT:    v_sub_f16_e32 v0, v0, v1
; GFX8-SDAG-NEXT:    v_or_b32_e32 v0, v0, v2
; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-GISEL-LABEL: v_constained_fsub_v2f16_fpexcept_ignore:
; GFX8-GISEL:       ; %bb.0:
; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-GISEL-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
; GFX8-GISEL-NEXT:    v_add_f16_e32 v2, v0, v1
; GFX8-GISEL-NEXT:    v_add_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: v_constained_fsub_v2f16_fpexcept_ignore:
; GFX10-SDAG:       ; %bb.0:
; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SDAG-NEXT:    v_sub_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-SDAG-NEXT:    v_sub_f16_e32 v0, v0, v1
; GFX10-SDAG-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-GISEL-LABEL: v_constained_fsub_v2f16_fpexcept_ignore:
; GFX10-GISEL:       ; %bb.0:
; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-GISEL-NEXT:    v_pk_add_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-SDAG-LABEL: v_constained_fsub_v2f16_fpexcept_ignore:
; GFX10PLUS-SDAG:       ; %bb.0:
; GFX10PLUS-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-SDAG-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
; GFX10PLUS-SDAG-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
; GFX10PLUS-SDAG-NEXT:    v_sub_f16_e32 v0, v0, v1
; GFX10PLUS-SDAG-NEXT:    v_sub_f16_e32 v2, v3, v2
; GFX10PLUS-SDAG-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
; GFX10PLUS-SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-GISEL-LABEL: v_constained_fsub_v2f16_fpexcept_ignore:
; GFX10PLUS-GISEL:       ; %bb.0:
; GFX10PLUS-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-GISEL-NEXT:    v_pk_add_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
; GFX10PLUS-GISEL-NEXT:    s_setpc_b64 s[30:31]
  %val = call <2 x half> @llvm.experimental.constrained.fsub.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.ignore")
  ret <2 x half> %val
}

; <2 x half> operands, exception behavior fpexcept.maytrap. The expected
; instruction sequences match the fpexcept.strict <2 x half> test.
define <2 x half> @v_constained_fsub_v2f16_fpexcept_maytrap(<2 x half> %x, <2 x half> %y) #0 {
; GFX9-SDAG-LABEL: v_constained_fsub_v2f16_fpexcept_maytrap:
; GFX9-SDAG:       ; %bb.0:
; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-SDAG-NEXT:    v_sub_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-SDAG-NEXT:    v_sub_f16_e32 v0, v0, v1
; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0x5040100
; GFX9-SDAG-NEXT:    v_perm_b32 v0, v2, v0, s4
; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: v_constained_fsub_v2f16_fpexcept_maytrap:
; GFX9-GISEL:       ; %bb.0:
; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-GISEL-NEXT:    v_pk_add_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-SDAG-LABEL: v_constained_fsub_v2f16_fpexcept_maytrap:
; GFX8-SDAG:       ; %bb.0:
; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-SDAG-NEXT:    v_sub_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-SDAG-NEXT:    v_sub_f16_e32 v0, v0, v1
; GFX8-SDAG-NEXT:    v_or_b32_e32 v0, v0, v2
; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-GISEL-LABEL: v_constained_fsub_v2f16_fpexcept_maytrap:
; GFX8-GISEL:       ; %bb.0:
; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-GISEL-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
; GFX8-GISEL-NEXT:    v_add_f16_e32 v2, v0, v1
; GFX8-GISEL-NEXT:    v_add_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: v_constained_fsub_v2f16_fpexcept_maytrap:
; GFX10-SDAG:       ; %bb.0:
; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SDAG-NEXT:    v_sub_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-SDAG-NEXT:    v_sub_f16_e32 v0, v0, v1
; GFX10-SDAG-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-GISEL-LABEL: v_constained_fsub_v2f16_fpexcept_maytrap:
; GFX10-GISEL:       ; %bb.0:
; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-GISEL-NEXT:    v_pk_add_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-SDAG-LABEL: v_constained_fsub_v2f16_fpexcept_maytrap:
; GFX10PLUS-SDAG:       ; %bb.0:
; GFX10PLUS-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-SDAG-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
; GFX10PLUS-SDAG-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
; GFX10PLUS-SDAG-NEXT:    v_sub_f16_e32 v0, v0, v1
; GFX10PLUS-SDAG-NEXT:    v_sub_f16_e32 v2, v3, v2
; GFX10PLUS-SDAG-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
; GFX10PLUS-SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-GISEL-LABEL: v_constained_fsub_v2f16_fpexcept_maytrap:
; GFX10PLUS-GISEL:       ; %bb.0:
; GFX10PLUS-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-GISEL-NEXT:    v_pk_add_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
; GFX10PLUS-GISEL-NEXT:    s_setpc_b64 s[30:31]
  %val = call <2 x half> @llvm.experimental.constrained.fsub.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap")
  ret <2 x half> %val
}

; <3 x half> operands, exception behavior fpexcept.strict. In the checks
; below each operand occupies two registers (v0/v1 and v2/v3) and every
; configuration expects three separate 16-bit subtracts.
define <3 x half> @v_constained_fsub_v3f16_fpexcept_strict(<3 x half> %x, <3 x half> %y) #0 {
; GFX9-SDAG-LABEL: v_constained_fsub_v3f16_fpexcept_strict:
; GFX9-SDAG:       ; %bb.0:
; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-SDAG-NEXT:    v_sub_f16_sdwa v4, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-SDAG-NEXT:    v_sub_f16_e32 v0, v0, v2
; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0x5040100
; GFX9-SDAG-NEXT:    v_perm_b32 v0, v4, v0, s4
; GFX9-SDAG-NEXT:    v_sub_f16_e32 v1, v1, v3
; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: v_constained_fsub_v3f16_fpexcept_strict:
; GFX9-GISEL:       ; %bb.0:
; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-GISEL-NEXT:    v_sub_f16_e32 v4, v0, v2
; GFX9-GISEL-NEXT:    v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-GISEL-NEXT:    v_sub_f16_e32 v1, v1, v3
; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v4
; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-SDAG-LABEL: v_constained_fsub_v3f16_fpexcept_strict:
; GFX8-SDAG:       ; %bb.0:
; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-SDAG-NEXT:    v_sub_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-SDAG-NEXT:    v_sub_f16_e32 v0, v0, v2
; GFX8-SDAG-NEXT:    v_or_b32_e32 v0, v0, v4
; GFX8-SDAG-NEXT:    v_sub_f16_e32 v1, v1, v3
; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-GISEL-LABEL: v_constained_fsub_v3f16_fpexcept_strict:
; GFX8-GISEL:       ; %bb.0:
; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-GISEL-NEXT:    v_sub_f16_e32 v4, v0, v2
; GFX8-GISEL-NEXT:    v_sub_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-GISEL-NEXT:    v_sub_f16_e32 v1, v1, v3
; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v4, v0
; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: v_constained_fsub_v3f16_fpexcept_strict:
; GFX10-SDAG:       ; %bb.0:
; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SDAG-NEXT:    v_sub_f16_sdwa v4, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-SDAG-NEXT:    v_sub_f16_e32 v0, v0, v2
; GFX10-SDAG-NEXT:    v_sub_f16_e32 v1, v1, v3
; GFX10-SDAG-NEXT:    v_perm_b32 v0, v4, v0, 0x5040100
; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-GISEL-LABEL: v_constained_fsub_v3f16_fpexcept_strict:
; GFX10-GISEL:       ; %bb.0:
; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-GISEL-NEXT:    v_sub_f16_e32 v4, v0, v2
; GFX10-GISEL-NEXT:    v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-GISEL-NEXT:    v_sub_f16_e32 v1, v1, v3
; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v4
; GFX10-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-SDAG-LABEL: v_constained_fsub_v3f16_fpexcept_strict:
; GFX10PLUS-SDAG:       ; %bb.0:
; GFX10PLUS-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-SDAG-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
; GFX10PLUS-SDAG-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
; GFX10PLUS-SDAG-NEXT:    v_sub_f16_e32 v0, v0, v2
; GFX10PLUS-SDAG-NEXT:    v_sub_f16_e32 v1, v1, v3
; GFX10PLUS-SDAG-NEXT:    v_sub_f16_e32 v2, v5, v4
; GFX10PLUS-SDAG-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
; GFX10PLUS-SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-GISEL-LABEL: v_constained_fsub_v3f16_fpexcept_strict:
; GFX10PLUS-GISEL:       ; %bb.0:
; GFX10PLUS-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
; GFX10PLUS-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
; GFX10PLUS-GISEL-NEXT:    v_sub_f16_e32 v0, v0, v2
; GFX10PLUS-GISEL-NEXT:    v_sub_f16_e32 v1, v1, v3
; GFX10PLUS-GISEL-NEXT:    v_sub_f16_e32 v2, v4, v5
; GFX10PLUS-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX10PLUS-GISEL-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
; GFX10PLUS-GISEL-NEXT:    s_setpc_b64 s[30:31]
  %val = call <3 x half> @llvm.experimental.constrained.fsub.v3f16(<3 x half> %x, <3 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
  ret <3 x half> %val
}

; FIXME: Scalarized
; <4 x half> operands, exception behavior fpexcept.strict. The checks below
; record the scalarized form the FIXME refers to: every configuration expects
; four separate 16-bit subtracts and no packed instruction.
define <4 x half> @v_constained_fsub_v4f16_fpexcept_strict(<4 x half> %x, <4 x half> %y) #0 {
; GFX9-SDAG-LABEL: v_constained_fsub_v4f16_fpexcept_strict:
; GFX9-SDAG:       ; %bb.0:
; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-SDAG-NEXT:    v_sub_f16_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-SDAG-NEXT:    v_sub_f16_sdwa v5, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-SDAG-NEXT:    v_sub_f16_e32 v1, v1, v3
; GFX9-SDAG-NEXT:    v_sub_f16_e32 v0, v0, v2
; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0x5040100
; GFX9-SDAG-NEXT:    v_perm_b32 v0, v5, v0, s4
; GFX9-SDAG-NEXT:    v_perm_b32 v1, v4, v1, s4
; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: v_constained_fsub_v4f16_fpexcept_strict:
; GFX9-GISEL:       ; %bb.0:
; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-GISEL-NEXT:    v_sub_f16_e32 v4, v0, v2
; GFX9-GISEL-NEXT:    v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-GISEL-NEXT:    v_sub_f16_e32 v2, v1, v3
; GFX9-GISEL-NEXT:    v_sub_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v4
; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-SDAG-LABEL: v_constained_fsub_v4f16_fpexcept_strict:
; GFX8-SDAG:       ; %bb.0:
; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-SDAG-NEXT:    v_sub_f16_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-SDAG-NEXT:    v_sub_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-SDAG-NEXT:    v_sub_f16_e32 v1, v1, v3
; GFX8-SDAG-NEXT:    v_sub_f16_e32 v0, v0, v2
; GFX8-SDAG-NEXT:    v_or_b32_e32 v0, v0, v5
; GFX8-SDAG-NEXT:    v_or_b32_e32 v1, v1, v4
; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-GISEL-LABEL: v_constained_fsub_v4f16_fpexcept_strict:
; GFX8-GISEL:       ; %bb.0:
; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-GISEL-NEXT:    v_sub_f16_e32 v4, v0, v2
; GFX8-GISEL-NEXT:    v_sub_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-GISEL-NEXT:    v_sub_f16_e32 v2, v1, v3
; GFX8-GISEL-NEXT:    v_sub_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v4, v0
; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: v_constained_fsub_v4f16_fpexcept_strict:
; GFX10-SDAG:       ; %bb.0:
; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SDAG-NEXT:    v_sub_f16_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-SDAG-NEXT:    v_sub_f16_sdwa v5, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-SDAG-NEXT:    v_sub_f16_e32 v0, v0, v2
; GFX10-SDAG-NEXT:    v_sub_f16_e32 v1, v1, v3
; GFX10-SDAG-NEXT:    v_perm_b32 v0, v5, v0, 0x5040100
; GFX10-SDAG-NEXT:    v_perm_b32 v1, v4, v1, 0x5040100
; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-GISEL-LABEL: v_constained_fsub_v4f16_fpexcept_strict:
; GFX10-GISEL:       ; %bb.0:
; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-GISEL-NEXT:    v_sub_f16_e32 v4, v0, v2
; GFX10-GISEL-NEXT:    v_sub_f16_e32 v5, v1, v3
; GFX10-GISEL-NEXT:    v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-GISEL-NEXT:    v_sub_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v4
; GFX10-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v5
; GFX10-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
; GFX10-GISEL-NEXT:    v_lshl_or_b32 v1, v1, 16, v3
; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-SDAG-LABEL: v_constained_fsub_v4f16_fpexcept_strict:
; GFX10PLUS-SDAG:       ; %bb.0:
; GFX10PLUS-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-SDAG-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
; GFX10PLUS-SDAG-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
; GFX10PLUS-SDAG-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
; GFX10PLUS-SDAG-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
; GFX10PLUS-SDAG-NEXT:    v_sub_f16_e32 v1, v1, v3
; GFX10PLUS-SDAG-NEXT:    v_sub_f16_e32 v0, v0, v2
; GFX10PLUS-SDAG-NEXT:    v_sub_f16_e32 v2, v6, v5
; GFX10PLUS-SDAG-NEXT:    v_sub_f16_e32 v3, v7, v4
; GFX10PLUS-SDAG-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
; GFX10PLUS-SDAG-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
; GFX10PLUS-SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-GISEL-LABEL: v_constained_fsub_v4f16_fpexcept_strict:
; GFX10PLUS-GISEL:       ; %bb.0:
; GFX10PLUS-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
; GFX10PLUS-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
; GFX10PLUS-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
; GFX10PLUS-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
; GFX10PLUS-GISEL-NEXT:    v_sub_f16_e32 v0, v0, v2
; GFX10PLUS-GISEL-NEXT:    v_sub_f16_e32 v1, v1, v3
; GFX10PLUS-GISEL-NEXT:    v_sub_f16_e32 v2, v4, v6
; GFX10PLUS-GISEL-NEXT:    v_sub_f16_e32 v3, v5, v7
; GFX10PLUS-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX10PLUS-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; GFX10PLUS-GISEL-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
; GFX10PLUS-GISEL-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
; GFX10PLUS-GISEL-NEXT:    s_setpc_b64 s[30:31]
  %val = call <4 x half> @llvm.experimental.constrained.fsub.v4f16(<4 x half> %x, <4 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
  ret <4 x half> %val
}

; amdgpu_ps calling convention with inreg half operands, exception behavior
; fpexcept.strict. In the checks below the operands are read from s2 and s3;
; the gfx900/fiji output first copies s3 into a VGPR, while the
; gfx1010/gfx1100 output uses both scalar registers directly.
define amdgpu_ps half @s_constained_fsub_f16_fpexcept_strict(half inreg %x, half inreg %y) #0 {
; GCN-LABEL: s_constained_fsub_f16_fpexcept_strict:
; GCN:       ; %bb.0:
; GCN-NEXT:    v_mov_b32_e32 v0, s3
; GCN-NEXT:    v_sub_f16_e32 v0, s2, v0
; GCN-NEXT:    ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_constained_fsub_f16_fpexcept_strict:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    v_sub_f16_e64 v0, s2, s3
; GFX10PLUS-NEXT:    ; return to shader part epilog
  %val = call half @llvm.experimental.constrained.fsub.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
  ret half %val
}

; amdgpu_ps calling convention with inreg <2 x half> operands (s2 and s3 in
; the checks below), exception behavior fpexcept.strict. The -global-isel=1
; runs expect the second operand to be negated with an s_xor_b32 against
; 0x80008000 followed by an add; the -global-isel=0 runs expect subtracts on
; the two halves.
define amdgpu_ps <2 x half> @s_constained_fsub_v2f16_fpexcept_strict(<2 x half> inreg %x, <2 x half> inreg %y) #0 {
; GFX9-SDAG-LABEL: s_constained_fsub_v2f16_fpexcept_strict:
; GFX9-SDAG:       ; %bb.0:
; GFX9-SDAG-NEXT:    s_lshr_b32 s0, s3, 16
; GFX9-SDAG-NEXT:    s_lshr_b32 s1, s2, 16
; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-SDAG-NEXT:    v_sub_f16_e32 v0, s1, v0
; GFX9-SDAG-NEXT:    v_sub_f16_e32 v1, s2, v1
; GFX9-SDAG-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
; GFX9-SDAG-NEXT:    ; return to shader part epilog
;
; GFX9-GISEL-LABEL: s_constained_fsub_v2f16_fpexcept_strict:
; GFX9-GISEL:       ; %bb.0:
; GFX9-GISEL-NEXT:    s_xor_b32 s0, s3, 0x80008000
; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT:    v_pk_add_f16 v0, s2, v0
; GFX9-GISEL-NEXT:    ; return to shader part epilog
;
; GFX8-SDAG-LABEL: s_constained_fsub_v2f16_fpexcept_strict:
; GFX8-SDAG:       ; %bb.0:
; GFX8-SDAG-NEXT:    s_lshr_b32 s0, s3, 16
; GFX8-SDAG-NEXT:    s_lshr_b32 s1, s2, 16
; GFX8-SDAG-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-SDAG-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-SDAG-NEXT:    v_sub_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-SDAG-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-SDAG-NEXT:    v_sub_f16_e32 v1, s2, v1
; GFX8-SDAG-NEXT:    v_or_b32_e32 v0, v1, v0
; GFX8-SDAG-NEXT:    ; return to shader part epilog
;
; GFX8-GISEL-LABEL: s_constained_fsub_v2f16_fpexcept_strict:
; GFX8-GISEL:       ; %bb.0:
; GFX8-GISEL-NEXT:    s_xor_b32 s0, s3, 0x80008000
; GFX8-GISEL-NEXT:    s_lshr_b32 s1, s2, 16
; GFX8-GISEL-NEXT:    s_lshr_b32 s3, s0, 16
; GFX8-GISEL-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-GISEL-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-GISEL-NEXT:    v_mov_b32_e32 v2, s1
; GFX8-GISEL-NEXT:    v_add_f16_e32 v0, s2, v0
; GFX8-GISEL-NEXT:    v_add_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX8-GISEL-NEXT:    ; return to shader part epilog
;
; GFX10-SDAG-LABEL: s_constained_fsub_v2f16_fpexcept_strict:
; GFX10-SDAG:       ; %bb.0:
; GFX10-SDAG-NEXT:    v_sub_f16_e64 v0, s2, s3
; GFX10-SDAG-NEXT:    s_lshr_b32 s0, s3, 16
; GFX10-SDAG-NEXT:    s_lshr_b32 s1, s2, 16
; GFX10-SDAG-NEXT:    v_sub_f16_e64 v1, s1, s0
; GFX10-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX10-SDAG-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
; GFX10-SDAG-NEXT:    ; return to shader part epilog
;
; GFX10-GISEL-LABEL: s_constained_fsub_v2f16_fpexcept_strict:
; GFX10-GISEL:       ; %bb.0:
; GFX10-GISEL-NEXT:    s_xor_b32 s0, s3, 0x80008000
; GFX10-GISEL-NEXT:    v_pk_add_f16 v0, s2, s0
; GFX10-GISEL-NEXT:    ; return to shader part epilog
;
; GFX10PLUS-SDAG-LABEL: s_constained_fsub_v2f16_fpexcept_strict:
; GFX10PLUS-SDAG:       ; %bb.0:
; GFX10PLUS-SDAG-NEXT:    v_sub_f16_e64 v0, s2, s3
; GFX10PLUS-SDAG-NEXT:    s_lshr_b32 s0, s3, 16
; GFX10PLUS-SDAG-NEXT:    s_lshr_b32 s1, s2, 16
; GFX10PLUS-SDAG-NEXT:    v_sub_f16_e64 v1, s1, s0
; GFX10PLUS-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX10PLUS-SDAG-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
; GFX10PLUS-SDAG-NEXT:    ; return to shader part epilog
;
; GFX10PLUS-GISEL-LABEL: s_constained_fsub_v2f16_fpexcept_strict:
; GFX10PLUS-GISEL:       ; %bb.0:
; GFX10PLUS-GISEL-NEXT:    s_xor_b32 s0, s3, 0x80008000
; GFX10PLUS-GISEL-NEXT:    v_pk_add_f16 v0, s2, s0
; GFX10PLUS-GISEL-NEXT:    ; return to shader part epilog
  %val = call <2 x half> @llvm.experimental.constrained.fsub.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
  ret <2 x half> %val
}

; Declarations of the constrained fsub overloads called by the tests above.
; Each takes two operands plus the rounding-mode and exception-behavior
; metadata arguments.
declare half @llvm.experimental.constrained.fsub.f16(half, half, metadata, metadata) #1
declare <2 x half> @llvm.experimental.constrained.fsub.v2f16(<2 x half>, <2 x half>, metadata, metadata) #1
declare <3 x half> @llvm.experimental.constrained.fsub.v3f16(<3 x half>, <3 x half>, metadata, metadata) #1
declare <4 x half> @llvm.experimental.constrained.fsub.v4f16(<4 x half>, <4 x half>, metadata, metadata) #1

; Attribute group #0 is attached to every test function and group #1 to the
; intrinsic declarations.
attributes #0 = { strictfp }
attributes #1 = { inaccessiblememonly nounwind willreturn }
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GFX8: {{.*}}
; GFX9: {{.*}}