; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=hawaii -mattr=+flat-for-global < %s | FileCheck -enable-var-scope --check-prefixes=SI,SI-SAFE %s
; RUN: llc -enable-no-signed-zeros-fp-math -mtriple=amdgcn -mcpu=hawaii -mattr=+flat-for-global < %s | FileCheck -enable-var-scope --check-prefixes=SI,SI-NSZ %s

; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope --check-prefixes=VI,VI-SAFE %s
; RUN: llc -enable-no-signed-zeros-fp-math -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope --check-prefixes=VI,VI-NSZ %s

; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-SAFE %s
; RUN: llc -enable-no-signed-zeros-fp-math -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-NSZ %s

; Checks how an fneg of an f16 fadd/fmul result is combined into the producing
; operation on three subtargets (hawaii, fiji, gfx1100). Every subtarget is
; compiled twice: once with default floating-point semantics (the *-SAFE
; prefixes) and once with -enable-no-signed-zeros-fp-math (the *-NSZ prefixes).
; Functions whose output is identical in both modes are checked with the shared
; per-target prefix only.
;
; The assertion lines are generated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.

; --------------------------------------------------------------------------------
; fadd tests
; --------------------------------------------------------------------------------

; fneg (fadd a, b). The no-signed-zeros runs compute (-a) - b directly; the
; default runs keep the add and flip the sign bit with a v_xor afterwards.
; On hawaii the f16 values are converted to f32 and the math is done in f32.
define half @v_fneg_add_f16(half %a, half %b) #0 {
; SI-SAFE-LABEL: v_fneg_add_f16:
; SI-SAFE:       ; %bb.0:
; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-SAFE-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-SAFE-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-SAFE-NEXT:    v_add_f32_e32 v0, v0, v1
; SI-SAFE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
; SI-SAFE-NEXT:    s_setpc_b64 s[30:31]
;
; SI-NSZ-LABEL: v_fneg_add_f16:
; SI-NSZ:       ; %bb.0:
; SI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NSZ-NEXT:    v_cvt_f16_f32_e64 v0, -v0
; SI-NSZ-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NSZ-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NSZ-NEXT:    v_sub_f32_e32 v0, v0, v1
; SI-NSZ-NEXT:    s_setpc_b64 s[30:31]
;
; VI-SAFE-LABEL: v_fneg_add_f16:
; VI-SAFE:       ; %bb.0:
; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-SAFE-NEXT:    v_add_f16_e32 v0, v0, v1
; VI-SAFE-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
; VI-SAFE-NEXT:    s_setpc_b64 s[30:31]
;
; VI-NSZ-LABEL: v_fneg_add_f16:
; VI-NSZ:       ; %bb.0:
; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NSZ-NEXT:    v_sub_f16_e64 v0, -v0, v1
; VI-NSZ-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-SAFE-LABEL: v_fneg_add_f16:
; GFX11-SAFE:       ; %bb.0:
; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SAFE-NEXT:    v_add_f16_e32 v0, v0, v1
; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-SAFE-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-NSZ-LABEL: v_fneg_add_f16:
; GFX11-NSZ:       ; %bb.0:
; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NSZ-NEXT:    v_sub_f16_e64 v0, -v0, v1
; GFX11-NSZ-NEXT:    s_setpc_b64 s[30:31]
  %add = fadd half %a, %b
  %fneg = fneg half %add
  ret half %fneg
}

; The un-negated fadd result is returned alongside its negation, so the fneg
; is not folded in either mode: every run emits the add followed by a
; sign-bit xor.
define { half, half } @v_fneg_add_store_use_add_f16(half %a, half %b) #0 {
; SI-LABEL: v_fneg_add_store_use_add_f16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_add_f32_e32 v1, v0, v1
; SI-NEXT:    v_xor_b32_e32 v0, 0x80000000, v1
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_add_store_use_add_f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_add_f16_e32 v1, v0, v1
; VI-NEXT:    v_xor_b32_e32 v0, 0x8000, v1
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fneg_add_store_use_add_f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_add_f16_e32 v1, v0, v1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_xor_b32_e32 v0, 0x8000, v1
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %add = fadd half %a, %b
  %fneg = fneg half %add
  %insert.0 = insertvalue { half, half } poison, half %fneg, 0
  %insert.1 = insertvalue { half, half } %insert.0, half %add, 1
  ret { half, half } %insert.1
}

; The fadd result also feeds an fmul by 4.0. The default runs keep add + xor
; and multiply the sum by 4.0; the no-signed-zeros runs compute the negated sum
; directly and multiply that by -4.0 instead.
define { half, half } @v_fneg_add_multi_use_add_f16(half %a, half %b) #0 {
; SI-SAFE-LABEL: v_fneg_add_multi_use_add_f16:
; SI-SAFE:       ; %bb.0:
; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-SAFE-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-SAFE-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-SAFE-NEXT:    v_add_f32_e32 v1, v0, v1
; SI-SAFE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v1
; SI-SAFE-NEXT:    v_mul_f32_e32 v1, 4.0, v1
; SI-SAFE-NEXT:    s_setpc_b64 s[30:31]
;
; SI-NSZ-LABEL: v_fneg_add_multi_use_add_f16:
; SI-NSZ:       ; %bb.0:
; SI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NSZ-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NSZ-NEXT:    v_cvt_f32_f16_e64 v0, -v0
; SI-NSZ-NEXT:    v_sub_f32_e32 v0, v0, v1
; SI-NSZ-NEXT:    v_mul_f32_e32 v1, -4.0, v0
; SI-NSZ-NEXT:    s_setpc_b64 s[30:31]
;
; VI-SAFE-LABEL: v_fneg_add_multi_use_add_f16:
; VI-SAFE:       ; %bb.0:
; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-SAFE-NEXT:    v_add_f16_e32 v1, v0, v1
; VI-SAFE-NEXT:    v_xor_b32_e32 v0, 0x8000, v1
; VI-SAFE-NEXT:    v_mul_f16_e32 v1, 4.0, v1
; VI-SAFE-NEXT:    s_setpc_b64 s[30:31]
;
; VI-NSZ-LABEL: v_fneg_add_multi_use_add_f16:
; VI-NSZ:       ; %bb.0:
; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NSZ-NEXT:    v_sub_f16_e64 v0, -v0, v1
; VI-NSZ-NEXT:    v_mul_f16_e32 v1, -4.0, v0
; VI-NSZ-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-SAFE-LABEL: v_fneg_add_multi_use_add_f16:
; GFX11-SAFE:       ; %bb.0:
; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SAFE-NEXT:    v_add_f16_e32 v1, v0, v1
; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-SAFE-NEXT:    v_xor_b32_e32 v0, 0x8000, v1
; GFX11-SAFE-NEXT:    v_mul_f16_e32 v1, 4.0, v1
; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-NSZ-LABEL: v_fneg_add_multi_use_add_f16:
; GFX11-NSZ:       ; %bb.0:
; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NSZ-NEXT:    v_sub_f16_e64 v0, -v0, v1
; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NSZ-NEXT:    v_mul_f16_e32 v1, -4.0, v0
; GFX11-NSZ-NEXT:    s_setpc_b64 s[30:31]
  %add = fadd half %a, %b
  %fneg = fneg half %add
  %use1 = fmul half %add, 4.0

  %insert.0 = insertvalue { half, half } poison, half %fneg, 0
  %insert.1 = insertvalue { half, half } %insert.0, half %use1, 1
  ret { half, half } %insert.1
}

; fneg (fadd (fneg a), b). The no-signed-zeros runs reduce this to a - b; the
; default runs emit b - a followed by a sign-bit xor.
define half @v_fneg_add_fneg_x_f16(half %a, half %b) #0 {
; SI-SAFE-LABEL: v_fneg_add_fneg_x_f16:
; SI-SAFE:       ; %bb.0:
; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-SAFE-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-SAFE-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-SAFE-NEXT:    v_sub_f32_e32 v0, v1, v0
; SI-SAFE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
; SI-SAFE-NEXT:    s_setpc_b64 s[30:31]
;
; SI-NSZ-LABEL: v_fneg_add_fneg_x_f16:
; SI-NSZ:       ; %bb.0:
; SI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NSZ-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NSZ-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NSZ-NEXT:    v_sub_f32_e32 v0, v0, v1
; SI-NSZ-NEXT:    s_setpc_b64 s[30:31]
;
; VI-SAFE-LABEL: v_fneg_add_fneg_x_f16:
; VI-SAFE:       ; %bb.0:
; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-SAFE-NEXT:    v_sub_f16_e32 v0, v1, v0
; VI-SAFE-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
; VI-SAFE-NEXT:    s_setpc_b64 s[30:31]
;
; VI-NSZ-LABEL: v_fneg_add_fneg_x_f16:
; VI-NSZ:       ; %bb.0:
; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NSZ-NEXT:    v_sub_f16_e32 v0, v0, v1
; VI-NSZ-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-SAFE-LABEL: v_fneg_add_fneg_x_f16:
; GFX11-SAFE:       ; %bb.0:
; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SAFE-NEXT:    v_sub_f16_e32 v0, v1, v0
; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-SAFE-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-NSZ-LABEL: v_fneg_add_fneg_x_f16:
; GFX11-NSZ:       ; %bb.0:
; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NSZ-NEXT:    v_sub_f16_e32 v0, v0, v1
; GFX11-NSZ-NEXT:    s_setpc_b64 s[30:31]
  %fneg.a = fneg half %a
  %add = fadd half %fneg.a, %b
  %fneg = fneg half %add
  ret half %fneg
}

; fneg (fadd a, (fneg b)). The no-signed-zeros runs reduce this to b - a; the
; default runs emit a - b followed by a sign-bit xor.
define half @v_fneg_add_x_fneg_f16(half %a, half %b) #0 {
; SI-SAFE-LABEL: v_fneg_add_x_fneg_f16:
; SI-SAFE:       ; %bb.0:
; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-SAFE-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-SAFE-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-SAFE-NEXT:    v_sub_f32_e32 v0, v0, v1
; SI-SAFE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
; SI-SAFE-NEXT:    s_setpc_b64 s[30:31]
;
; SI-NSZ-LABEL: v_fneg_add_x_fneg_f16:
; SI-NSZ:       ; %bb.0:
; SI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NSZ-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NSZ-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NSZ-NEXT:    v_sub_f32_e32 v0, v1, v0
; SI-NSZ-NEXT:    s_setpc_b64 s[30:31]
;
; VI-SAFE-LABEL: v_fneg_add_x_fneg_f16:
; VI-SAFE:       ; %bb.0:
; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-SAFE-NEXT:    v_sub_f16_e32 v0, v0, v1
; VI-SAFE-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
; VI-SAFE-NEXT:    s_setpc_b64 s[30:31]
;
; VI-NSZ-LABEL: v_fneg_add_x_fneg_f16:
; VI-NSZ:       ; %bb.0:
; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NSZ-NEXT:    v_sub_f16_e32 v0, v1, v0
; VI-NSZ-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-SAFE-LABEL: v_fneg_add_x_fneg_f16:
; GFX11-SAFE:       ; %bb.0:
; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SAFE-NEXT:    v_sub_f16_e32 v0, v0, v1
; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-SAFE-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-NSZ-LABEL: v_fneg_add_x_fneg_f16:
; GFX11-NSZ:       ; %bb.0:
; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NSZ-NEXT:    v_sub_f16_e32 v0, v1, v0
; GFX11-NSZ-NEXT:    s_setpc_b64 s[30:31]
  %fneg.b = fneg half %b
  %add = fadd half %a, %fneg.b
  %fneg = fneg half %add
  ret half %fneg
}

; fneg (fadd (fneg a), (fneg b)). The no-signed-zeros runs reduce this to a
; plain a + b; the default runs emit (-a) - b followed by a sign-bit xor.
define half @v_fneg_add_fneg_fneg_f16(half %a, half %b) #0 {
; SI-SAFE-LABEL: v_fneg_add_fneg_fneg_f16:
; SI-SAFE:       ; %bb.0:
; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-SAFE-NEXT:    v_cvt_f16_f32_e64 v0, -v0
; SI-SAFE-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-SAFE-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-SAFE-NEXT:    v_sub_f32_e32 v0, v0, v1
; SI-SAFE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
; SI-SAFE-NEXT:    s_setpc_b64 s[30:31]
;
; SI-NSZ-LABEL: v_fneg_add_fneg_fneg_f16:
; SI-NSZ:       ; %bb.0:
; SI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NSZ-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NSZ-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NSZ-NEXT:    v_add_f32_e32 v0, v0, v1
; SI-NSZ-NEXT:    s_setpc_b64 s[30:31]
;
; VI-SAFE-LABEL: v_fneg_add_fneg_fneg_f16:
; VI-SAFE:       ; %bb.0:
; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-SAFE-NEXT:    v_sub_f16_e64 v0, -v0, v1
; VI-SAFE-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
; VI-SAFE-NEXT:    s_setpc_b64 s[30:31]
;
; VI-NSZ-LABEL: v_fneg_add_fneg_fneg_f16:
; VI-NSZ:       ; %bb.0:
; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NSZ-NEXT:    v_add_f16_e32 v0, v0, v1
; VI-NSZ-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-SAFE-LABEL: v_fneg_add_fneg_fneg_f16:
; GFX11-SAFE:       ; %bb.0:
; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SAFE-NEXT:    v_sub_f16_e64 v0, -v0, v1
; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-SAFE-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-NSZ-LABEL: v_fneg_add_fneg_fneg_f16:
; GFX11-NSZ:       ; %bb.0:
; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NSZ-NEXT:    v_add_f16_e32 v0, v0, v1
; GFX11-NSZ-NEXT:    s_setpc_b64 s[30:31]
  %fneg.a = fneg half %a
  %fneg.b = fneg half %b
  %add = fadd half %fneg.a, %fneg.b
  %fneg = fneg half %add
  ret half %fneg
}

; Same pattern as v_fneg_add_fneg_x_f16, but (fneg a) is also returned, so the
; negated input has to be produced as a value of its own in addition to the
; combined subtract.
define { half, half } @v_fneg_add_store_use_fneg_x_f16(half %a, half %b) #0 {
; SI-SAFE-LABEL: v_fneg_add_store_use_fneg_x_f16:
; SI-SAFE:       ; %bb.0:
; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-SAFE-NEXT:    v_cvt_f32_f16_e32 v2, v0
; SI-SAFE-NEXT:    v_cvt_f32_f16_e32 v3, v1
; SI-SAFE-NEXT:    v_cvt_f32_f16_e64 v1, -v0
; SI-SAFE-NEXT:    v_sub_f32_e32 v0, v3, v2
; SI-SAFE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
; SI-SAFE-NEXT:    s_setpc_b64 s[30:31]
;
; SI-NSZ-LABEL: v_fneg_add_store_use_fneg_x_f16:
; SI-NSZ:       ; %bb.0:
; SI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NSZ-NEXT:    v_cvt_f32_f16_e32 v2, v1
; SI-NSZ-NEXT:    v_cvt_f32_f16_e32 v3, v0
; SI-NSZ-NEXT:    v_cvt_f32_f16_e64 v1, -v0
; SI-NSZ-NEXT:    v_sub_f32_e32 v0, v3, v2
; SI-NSZ-NEXT:    s_setpc_b64 s[30:31]
;
; VI-SAFE-LABEL: v_fneg_add_store_use_fneg_x_f16:
; VI-SAFE:       ; %bb.0:
; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-SAFE-NEXT:    v_xor_b32_e32 v2, 0x8000, v0
; VI-SAFE-NEXT:    v_sub_f16_e32 v0, v1, v0
; VI-SAFE-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
; VI-SAFE-NEXT:    v_mov_b32_e32 v1, v2
; VI-SAFE-NEXT:    s_setpc_b64 s[30:31]
;
; VI-NSZ-LABEL: v_fneg_add_store_use_fneg_x_f16:
; VI-NSZ:       ; %bb.0:
; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NSZ-NEXT:    v_xor_b32_e32 v2, 0x8000, v0
; VI-NSZ-NEXT:    v_sub_f16_e32 v0, v0, v1
; VI-NSZ-NEXT:    v_mov_b32_e32 v1, v2
; VI-NSZ-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-SAFE-LABEL: v_fneg_add_store_use_fneg_x_f16:
; GFX11-SAFE:       ; %bb.0:
; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SAFE-NEXT:    v_sub_f16_e32 v1, v1, v0
; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-SAFE-NEXT:    v_xor_b32_e32 v2, 0x8000, v1
; GFX11-SAFE-NEXT:    v_xor_b32_e32 v1, 0x8000, v0
; GFX11-SAFE-NEXT:    v_mov_b32_e32 v0, v2
; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-NSZ-LABEL: v_fneg_add_store_use_fneg_x_f16:
; GFX11-NSZ:       ; %bb.0:
; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NSZ-NEXT:    v_sub_f16_e32 v2, v0, v1
; GFX11-NSZ-NEXT:    v_xor_b32_e32 v1, 0x8000, v0
; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX11-NSZ-NEXT:    v_mov_b32_e32 v0, v2
; GFX11-NSZ-NEXT:    s_setpc_b64 s[30:31]
  %fneg.a = fneg half %a
  %add = fadd half %fneg.a, %b
  %fneg = fneg half %add
  %insert.0 = insertvalue { half, half } poison, half %fneg, 0
  %insert.1 = insertvalue { half, half } %insert.0, half %fneg.a, 1
  ret { half, half } %insert.1
}

; Same pattern as v_fneg_add_fneg_x_f16, but (fneg a) also feeds an fmul with
; %c. On fiji and gfx1100 that multiply takes the negation as a source modifier
; (-v0); on hawaii the negation is applied in the f16-to-f32 conversion.
define { half, half } @v_fneg_add_multi_use_fneg_x_f16(half %a, half %b, half %c) #0 {
; SI-SAFE-LABEL: v_fneg_add_multi_use_fneg_x_f16:
; SI-SAFE:       ; %bb.0:
; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-SAFE-NEXT:    v_cvt_f32_f16_e32 v3, v0
; SI-SAFE-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-SAFE-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-SAFE-NEXT:    v_cvt_f32_f16_e64 v4, -v0
; SI-SAFE-NEXT:    v_sub_f32_e32 v0, v1, v3
; SI-SAFE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
; SI-SAFE-NEXT:    v_mul_f32_e32 v1, v4, v2
; SI-SAFE-NEXT:    s_setpc_b64 s[30:31]
;
; SI-NSZ-LABEL: v_fneg_add_multi_use_fneg_x_f16:
; SI-NSZ:       ; %bb.0:
; SI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NSZ-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NSZ-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NSZ-NEXT:    v_cvt_f32_f16_e32 v3, v0
; SI-NSZ-NEXT:    v_cvt_f32_f16_e64 v4, -v0
; SI-NSZ-NEXT:    v_sub_f32_e32 v0, v3, v1
; SI-NSZ-NEXT:    v_mul_f32_e32 v1, v4, v2
; SI-NSZ-NEXT:    s_setpc_b64 s[30:31]
;
; VI-SAFE-LABEL: v_fneg_add_multi_use_fneg_x_f16:
; VI-SAFE:       ; %bb.0:
; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-SAFE-NEXT:    v_sub_f16_e32 v1, v1, v0
; VI-SAFE-NEXT:    v_xor_b32_e32 v3, 0x8000, v1
; VI-SAFE-NEXT:    v_mul_f16_e64 v1, -v0, v2
; VI-SAFE-NEXT:    v_mov_b32_e32 v0, v3
; VI-SAFE-NEXT:    s_setpc_b64 s[30:31]
;
; VI-NSZ-LABEL: v_fneg_add_multi_use_fneg_x_f16:
; VI-NSZ:       ; %bb.0:
; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NSZ-NEXT:    v_sub_f16_e32 v3, v0, v1
; VI-NSZ-NEXT:    v_mul_f16_e64 v1, -v0, v2
; VI-NSZ-NEXT:    v_mov_b32_e32 v0, v3
; VI-NSZ-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-SAFE-LABEL: v_fneg_add_multi_use_fneg_x_f16:
; GFX11-SAFE:       ; %bb.0:
; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SAFE-NEXT:    v_sub_f16_e32 v1, v1, v0
; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-SAFE-NEXT:    v_xor_b32_e32 v3, 0x8000, v1
; GFX11-SAFE-NEXT:    v_mul_f16_e64 v1, -v0, v2
; GFX11-SAFE-NEXT:    v_mov_b32_e32 v0, v3
; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-NSZ-LABEL: v_fneg_add_multi_use_fneg_x_f16:
; GFX11-NSZ:       ; %bb.0:
; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NSZ-NEXT:    v_sub_f16_e32 v3, v0, v1
; GFX11-NSZ-NEXT:    v_mul_f16_e64 v1, -v0, v2
; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX11-NSZ-NEXT:    v_mov_b32_e32 v0, v3
; GFX11-NSZ-NEXT:    s_setpc_b64 s[30:31]
  %fneg.a = fneg half %a
  %add = fadd half %fneg.a, %b
  %fneg = fneg half %add
  %use1 = fmul half %fneg.a, %c

  %insert.0 = insertvalue { half, half } poison, half %fneg, 0
  %insert.1 = insertvalue { half, half } %insert.0, half %use1, 1
  ret { half, half } %insert.1
}

; Regression test: this input used to trigger an assertion when compiled with
; -enable-no-signed-zeros-fp-math. The fneg operand is an fadd with 0.0 and the
; negated value is only consumed through a select.
define amdgpu_ps half @fneg_fadd_0_f16(half inreg %tmp2, half inreg %tmp6, <4 x i32> %arg) #0 {
; SI-SAFE-LABEL: fneg_fadd_0_f16:
; SI-SAFE:       ; %bb.0: ; %.entry
; SI-SAFE-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v0, s1
; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v1, s0
; SI-SAFE-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-SAFE-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-SAFE-NEXT:    v_div_scale_f32 v2, s[0:1], v0, v0, 1.0
; SI-SAFE-NEXT:    v_rcp_f32_e32 v3, v2
; SI-SAFE-NEXT:    v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
; SI-SAFE-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; SI-SAFE-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
; SI-SAFE-NEXT:    v_fma_f32 v3, v5, v3, v3
; SI-SAFE-NEXT:    v_mul_f32_e32 v5, v4, v3
; SI-SAFE-NEXT:    v_fma_f32 v6, -v2, v5, v4
; SI-SAFE-NEXT:    v_fma_f32 v5, v6, v3, v5
; SI-SAFE-NEXT:    v_fma_f32 v2, -v2, v5, v4
; SI-SAFE-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; SI-SAFE-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
; SI-SAFE-NEXT:    v_div_fixup_f32 v0, v2, v0, 1.0
; SI-SAFE-NEXT:    v_mad_f32 v0, v0, 0, 0
; SI-SAFE-NEXT:    v_cmp_nlt_f32_e32 vcc, v0, v1
; SI-SAFE-NEXT:    v_cndmask_b32_e64 v0, -v0, v1, vcc
; SI-SAFE-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
; SI-SAFE-NEXT:    v_cmp_nlt_f32_e32 vcc, 0, v0
; SI-SAFE-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
; SI-SAFE-NEXT:    ; return to shader part epilog
;
; SI-NSZ-LABEL: fneg_fadd_0_f16:
; SI-NSZ:       ; %bb.0: ; %.entry
; SI-NSZ-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v0, s1
; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v1, s0
; SI-NSZ-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NSZ-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NSZ-NEXT:    v_div_scale_f32 v2, s[0:1], v0, v0, 1.0
; SI-NSZ-NEXT:    v_rcp_f32_e32 v3, v2
; SI-NSZ-NEXT:    v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
; SI-NSZ-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; SI-NSZ-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
; SI-NSZ-NEXT:    v_fma_f32 v3, v5, v3, v3
; SI-NSZ-NEXT:    v_mul_f32_e32 v5, v4, v3
; SI-NSZ-NEXT:    v_fma_f32 v6, -v2, v5, v4
; SI-NSZ-NEXT:    v_fma_f32 v5, v6, v3, v5
; SI-NSZ-NEXT:    v_fma_f32 v2, -v2, v5, v4
; SI-NSZ-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; SI-NSZ-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
; SI-NSZ-NEXT:    v_div_fixup_f32 v0, v2, v0, 1.0
; SI-NSZ-NEXT:    v_mul_f32_e32 v0, 0x80000000, v0
; SI-NSZ-NEXT:    v_cmp_nlt_f32_e64 vcc, -v0, v1
; SI-NSZ-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; SI-NSZ-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
; SI-NSZ-NEXT:    v_cmp_nlt_f32_e32 vcc, 0, v0
; SI-NSZ-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
; SI-NSZ-NEXT:    ; return to shader part epilog
;
; VI-SAFE-LABEL: fneg_fadd_0_f16:
; VI-SAFE:       ; %bb.0: ; %.entry
; VI-SAFE-NEXT:    v_rcp_f16_e32 v0, s1
; VI-SAFE-NEXT:    v_mov_b32_e32 v1, s0
; VI-SAFE-NEXT:    v_mul_f16_e32 v0, 0, v0
; VI-SAFE-NEXT:    v_add_f16_e32 v0, 0, v0
; VI-SAFE-NEXT:    v_xor_b32_e32 v2, 0x8000, v0
; VI-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc, s0, v0
; VI-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-SAFE-NEXT:    v_mov_b32_e32 v1, 0x7e00
; VI-SAFE-NEXT:    v_cmp_nlt_f16_e32 vcc, 0, v0
; VI-SAFE-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
; VI-SAFE-NEXT:    ; return to shader part epilog
;
; VI-NSZ-LABEL: fneg_fadd_0_f16:
; VI-NSZ:       ; %bb.0: ; %.entry
; VI-NSZ-NEXT:    v_rcp_f16_e32 v0, s1
; VI-NSZ-NEXT:    v_mov_b32_e32 v1, s0
; VI-NSZ-NEXT:    v_mul_f16_e32 v0, 0x8000, v0
; VI-NSZ-NEXT:    v_cmp_nlt_f16_e64 vcc, -v0, s0
; VI-NSZ-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; VI-NSZ-NEXT:    v_mov_b32_e32 v1, 0x7e00
; VI-NSZ-NEXT:    v_cmp_nlt_f16_e32 vcc, 0, v0
; VI-NSZ-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
; VI-NSZ-NEXT:    ; return to shader part epilog
;
; GFX11-SAFE-LABEL: fneg_fadd_0_f16:
; GFX11-SAFE:       ; %bb.0: ; %.entry
; GFX11-SAFE-NEXT:    v_rcp_f16_e32 v0, s1
; GFX11-SAFE-NEXT:    s_waitcnt_depctr 0xfff
; GFX11-SAFE-NEXT:    v_mul_f16_e32 v0, 0, v0
; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SAFE-NEXT:    v_add_f16_e32 v0, 0, v0
; GFX11-SAFE-NEXT:    v_xor_b32_e32 v1, 0x8000, v0
; GFX11-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, s0, v0
; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SAFE-NEXT:    v_cndmask_b32_e64 v0, v1, s0, vcc_lo
; GFX11-SAFE-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, 0, v0
; GFX11-SAFE-NEXT:    v_cndmask_b32_e64 v0, 0x7e00, 0, vcc_lo
; GFX11-SAFE-NEXT:    ; return to shader part epilog
;
; GFX11-NSZ-LABEL: fneg_fadd_0_f16:
; GFX11-NSZ:       ; %bb.0: ; %.entry
; GFX11-NSZ-NEXT:    v_rcp_f16_e32 v0, s1
; GFX11-NSZ-NEXT:    s_waitcnt_depctr 0xfff
; GFX11-NSZ-NEXT:    v_mul_f16_e32 v0, 0x8000, v0
; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NSZ-NEXT:    v_cmp_nlt_f16_e64 s1, -v0, s0
; GFX11-NSZ-NEXT:    v_cndmask_b32_e64 v0, v0, s0, s1
; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NSZ-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, 0, v0
; GFX11-NSZ-NEXT:    v_cndmask_b32_e64 v0, 0x7e00, 0, vcc_lo
; GFX11-NSZ-NEXT:    ; return to shader part epilog
.entry:
  %tmp7 = fdiv half 1.000000e+00, %tmp6
  %tmp8 = fmul half 0.000000e+00, %tmp7
  %tmp9 = fmul reassoc nnan arcp contract half 0.000000e+00, %tmp8
  %.i188 = fadd half %tmp9, 0.000000e+00
  %tmp10 = fcmp uge half %.i188, %tmp2
  %tmp11 = fneg half %.i188
  %.i092 = select i1 %tmp10, half %tmp2, half %tmp11
  %tmp12 = fcmp ule half %.i092, 0.000000e+00
  %.i198 = select i1 %tmp12, half 0.000000e+00, half 0x7FF8000000000000
  ret half %.i198
}

; Workaround: -enable-no-signed-zeros-fp-math does not automatically set the
; unsafe-fp-math function attribute. This variant has the same body as
; fneg_fadd_0_f16 except that the fdiv carries the afn flag and the function
; uses attribute group #2 instead of #0.
; NOTE(review): attribute group #2 is defined outside this excerpt; presumably
; it is the one that sets unsafe-fp-math - confirm.
; Merge this with the previous test once the flag sets the attribute itself.
define amdgpu_ps half @fneg_fadd_0_nsz_f16(half inreg %tmp2, half inreg %tmp6, <4 x i32> %arg) #2 {
; SI-SAFE-LABEL: fneg_fadd_0_nsz_f16:
; SI-SAFE:       ; %bb.0: ; %.entry
; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v0, s0
; SI-SAFE-NEXT:    s_brev_b32 s0, 1
; SI-SAFE-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
; SI-SAFE-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-SAFE-NEXT:    v_min_legacy_f32_e32 v0, 0, v0
; SI-SAFE-NEXT:    v_cmp_ngt_f32_e32 vcc, s0, v0
; SI-SAFE-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
; SI-SAFE-NEXT:    ; return to shader part epilog
;
; SI-NSZ-LABEL: fneg_fadd_0_nsz_f16:
; SI-NSZ:       ; %bb.0: ; %.entry
; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v0, s1
; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v1, s0
; SI-NSZ-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
; SI-NSZ-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NSZ-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NSZ-NEXT:    v_rcp_f32_e32 v0, v0
; SI-NSZ-NEXT:    v_mul_f32_e32 v0, 0x80000000, v0
; SI-NSZ-NEXT:    v_cmp_nlt_f32_e64 vcc, -v0, v1
; SI-NSZ-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; SI-NSZ-NEXT:    v_cmp_nlt_f32_e32 vcc, 0, v0
; SI-NSZ-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
; SI-NSZ-NEXT:    ; return to shader part epilog
;
; VI-SAFE-LABEL: fneg_fadd_0_nsz_f16:
; VI-SAFE:       ; %bb.0: ; %.entry
; VI-SAFE-NEXT:    v_mov_b32_e32 v0, 0x8000
; VI-SAFE-NEXT:    v_mov_b32_e32 v1, s0
; VI-SAFE-NEXT:    v_cmp_ngt_f16_e64 vcc, s0, 0
; VI-SAFE-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; VI-SAFE-NEXT:    v_mov_b32_e32 v1, 0x7e00
; VI-SAFE-NEXT:    v_cmp_nlt_f16_e32 vcc, 0, v0
; VI-SAFE-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
; VI-SAFE-NEXT:    ; return to shader part epilog
;
; VI-NSZ-LABEL: fneg_fadd_0_nsz_f16:
; VI-NSZ:       ; %bb.0: ; %.entry
; VI-NSZ-NEXT:    v_rcp_f16_e32 v0, s1
; VI-NSZ-NEXT:    v_mov_b32_e32 v1, s0
; VI-NSZ-NEXT:    v_mul_f16_e32 v0, 0x8000, v0
; VI-NSZ-NEXT:    v_cmp_nlt_f16_e64 vcc, -v0, s0
; VI-NSZ-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; VI-NSZ-NEXT:    v_mov_b32_e32 v1, 0x7e00
; VI-NSZ-NEXT:    v_cmp_nlt_f16_e32 vcc, 0, v0
; VI-NSZ-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
; VI-NSZ-NEXT:    ; return to shader part epilog
;
; GFX11-SAFE-LABEL: fneg_fadd_0_nsz_f16:
; GFX11-SAFE:       ; %bb.0: ; %.entry
; GFX11-SAFE-NEXT:    v_mov_b32_e32 v0, s0
; GFX11-SAFE-NEXT:    v_cmp_ngt_f16_e64 vcc_lo, s0, 0
; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v0, 0x8000, v0, vcc_lo
; GFX11-SAFE-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, 0, v0
; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_3)
; GFX11-SAFE-NEXT:    v_cndmask_b32_e64 v0, 0x7e00, 0, vcc_lo
; GFX11-SAFE-NEXT:    ; return to shader part epilog
;
; GFX11-NSZ-LABEL: fneg_fadd_0_nsz_f16:
; GFX11-NSZ:       ; %bb.0: ; %.entry
; GFX11-NSZ-NEXT:    v_rcp_f16_e32 v0, s1
; GFX11-NSZ-NEXT:    s_waitcnt_depctr 0xfff
; GFX11-NSZ-NEXT:    v_mul_f16_e32 v0, 0x8000, v0
; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NSZ-NEXT:    v_cmp_nlt_f16_e64 s1, -v0, s0
; GFX11-NSZ-NEXT:    v_cndmask_b32_e64 v0, v0, s0, s1
; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NSZ-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, 0, v0
; GFX11-NSZ-NEXT:    v_cndmask_b32_e64 v0, 0x7e00, 0, vcc_lo
; GFX11-NSZ-NEXT:    ; return to shader part epilog
.entry:
  %tmp7 = fdiv afn half 1.000000e+00, %tmp6
  %tmp8 = fmul half 0.000000e+00, %tmp7
  %tmp9 = fmul reassoc nnan arcp contract half 0.000000e+00, %tmp8
  %.i188 = fadd half %tmp9, 0.000000e+00
  %tmp10 = fcmp uge half %.i188, %tmp2
  %tmp11 = fneg half %.i188
  %.i092 = select i1 %tmp10, half %tmp2, half %tmp11
  %tmp12 = fcmp ule half %.i092, 0.000000e+00
  %.i198 = select i1 %tmp12, half 0.000000e+00, half 0x7FF8000000000000
  ret half %.i198
}

; --------------------------------------------------------------------------------
; fmul tests
; --------------------------------------------------------------------------------

; fneg (fmul a, b). The negation is folded into the multiply by negating one
; operand; the output is the same with and without no-signed-zeros, so only the
; shared per-target prefixes are checked.
define half @v_fneg_mul_f16(half %a, half %b) #0 {
; SI-LABEL: v_fneg_mul_f16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_cvt_f16_f32_e64 v1, -v1
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_mul_f32_e32 v0, v0, v1
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_mul_f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_mul_f16_e64 v0, v0, -v1
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fneg_mul_f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_mul_f16_e64 v0, v0, -v1
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %mul = fmul half %a, %b
  %fneg = fneg half %mul
  ret half %fneg
}

; The un-negated product is returned alongside its negation, so the fneg is
; not folded: every run emits the multiply followed by a sign-bit xor.
define { half, half } @v_fneg_mul_store_use_mul_f16(half %a, half %b) #0 {
; SI-LABEL: v_fneg_mul_store_use_mul_f16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_mul_f32_e32 v1, v0, v1
; SI-NEXT:    v_xor_b32_e32 v0, 0x80000000, v1
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_mul_store_use_mul_f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_mul_f16_e32 v1, v0, v1
; VI-NEXT:    v_xor_b32_e32 v0, 0x8000, v1
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fneg_mul_store_use_mul_f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_mul_f16_e32 v1, v0, v1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_xor_b32_e32 v0, 0x8000, v1
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %mul = fmul half %a, %b
  %fneg = fneg half %mul
  %insert.0 = insertvalue { half, half } poison, half %fneg, 0
  %insert.1 = insertvalue { half, half } %insert.0, half %mul, 1
  ret { half, half } %insert.1
}

; The product also feeds an fmul by 4.0. The negated product is computed
; directly (one operand negated) and the second multiply uses -4.0 on it.
define { half, half } @v_fneg_mul_multi_use_mul_f16(half %a, half %b) #0 {
; SI-LABEL: v_fneg_mul_multi_use_mul_f16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e64 v1, -v1
; SI-NEXT:    v_mul_f32_e32 v0, v0, v1
; SI-NEXT:    v_mul_f32_e32 v1, -4.0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_mul_multi_use_mul_f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_mul_f16_e64 v0, v0, -v1
; VI-NEXT:    v_mul_f16_e32 v1, -4.0, v0
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fneg_mul_multi_use_mul_f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_mul_f16_e64 v0, v0, -v1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_mul_f16_e32 v1, -4.0, v0
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %mul = fmul half %a, %b
  %fneg = fneg half %mul
  %use1 = fmul half %mul, 4.0
  %insert.0 = insertvalue { half, half } poison, half %fneg, 0
  %insert.1 = insertvalue { half, half } %insert.0, half %use1, 1
  ret { half, half } %insert.1
}

; fneg (fmul (fneg a), b): the two negations cancel, leaving a plain a * b.
define half @v_fneg_mul_fneg_x_f16(half %a, half %b) #0 {
; SI-LABEL: v_fneg_mul_fneg_x_f16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_mul_f32_e32 v0, v0, v1
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_mul_fneg_x_f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_mul_f16_e32 v0, v0, v1
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fneg_mul_fneg_x_f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v1
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %fneg.a = fneg half %a
  %mul = fmul half %fneg.a, %b
  %fneg = fneg half %mul
  ret half %fneg
}

; fneg (fmul a, (fneg b)): the two negations cancel, leaving a plain a * b.
define half @v_fneg_mul_x_fneg_f16(half %a, half %b) #0 {
; SI-LABEL: v_fneg_mul_x_fneg_f16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_mul_f32_e32 v0, v0, v1
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_mul_x_fneg_f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_mul_f16_e32 v0, v0, v1
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fneg_mul_x_fneg_f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v1
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %fneg.b = fneg half %b
  %mul = fmul half %a, %fneg.b
  %fneg = fneg half %mul
  ret half %fneg
}

; fneg (fmul (fneg a), (fneg b)): reduces to a single multiply with one
; negated operand.
define half @v_fneg_mul_fneg_fneg_f16(half %a, half %b) #0 {
; SI-LABEL: v_fneg_mul_fneg_fneg_f16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_cvt_f16_f32_e64 v1, -v1
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_mul_f32_e32 v0, v0, v1
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_mul_fneg_fneg_f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_mul_f16_e64 v0, v0, -v1
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fneg_mul_fneg_fneg_f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_mul_f16_e64 v0, v0, -v1
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %fneg.a = fneg half %a
  %fneg.b = fneg half %b
  %mul = fmul half %fneg.a, %fneg.b
  %fneg = fneg half %mul
  ret half %fneg
}

define { half, half } @v_fneg_mul_store_use_fneg_x_f16(half %a, half %b) #0 {
; SI-LABEL: v_fneg_mul_store_use_fneg_x_f16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v0
; SI-NEXT:    v_cvt_f32_f16_e64 v1, -v0
; SI-NEXT:    v_mul_f32_e32 v0, v3, v2
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_mul_store_use_fneg_x_f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_xor_b32_e32 v2, 0x8000, v0
; VI-NEXT:    v_mul_f16_e32 v0, v0, v1
; VI-NEXT:    v_mov_b32_e32 v1, v2
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fneg_mul_store_use_fneg_x_f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_mul_f16_e32 v2, v0, v1
; GFX11-NEXT:    v_xor_b32_e32 v1, 0x8000, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT:    v_mov_b32_e32 v0,
v2 893; GFX11-NEXT: s_setpc_b64 s[30:31] 894 %fneg.a = fneg half %a 895 %mul = fmul half %fneg.a, %b 896 %fneg = fneg half %mul 897 %insert.0 = insertvalue { half, half } poison, half %fneg, 0 898 %insert.1 = insertvalue { half, half } %insert.0, half %fneg.a, 1 899 ret { half, half } %insert.1 900} 901 902define { half, half } @v_fneg_mul_multi_use_fneg_x_f16(half %a, half %b, half %c) #0 { 903; SI-LABEL: v_fneg_mul_multi_use_fneg_x_f16: 904; SI: ; %bb.0: 905; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 906; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 907; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 908; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 909; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 910; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 911; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 912; SI-NEXT: v_cvt_f32_f16_e64 v4, -v0 913; SI-NEXT: v_mul_f32_e32 v0, v3, v1 914; SI-NEXT: v_mul_f32_e32 v1, v4, v2 915; SI-NEXT: s_setpc_b64 s[30:31] 916; 917; VI-LABEL: v_fneg_mul_multi_use_fneg_x_f16: 918; VI: ; %bb.0: 919; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 920; VI-NEXT: v_mul_f16_e32 v3, v0, v1 921; VI-NEXT: v_mul_f16_e64 v1, -v0, v2 922; VI-NEXT: v_mov_b32_e32 v0, v3 923; VI-NEXT: s_setpc_b64 s[30:31] 924; 925; GFX11-LABEL: v_fneg_mul_multi_use_fneg_x_f16: 926; GFX11: ; %bb.0: 927; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 928; GFX11-NEXT: v_mul_f16_e32 v3, v0, v1 929; GFX11-NEXT: v_mul_f16_e64 v1, -v0, v2 930; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 931; GFX11-NEXT: v_mov_b32_e32 v0, v3 932; GFX11-NEXT: s_setpc_b64 s[30:31] 933 %fneg.a = fneg half %a 934 %mul = fmul half %fneg.a, %b 935 %fneg = fneg half %mul 936 %use1 = fmul half %fneg.a, %c 937 %insert.0 = insertvalue { half, half } poison, half %fneg, 0 938 %insert.1 = insertvalue { half, half } %insert.0, half %use1, 1 939 ret { half, half } %insert.1 940} 941 942; -------------------------------------------------------------------------------- 943; fminnum tests 944; 
-------------------------------------------------------------------------------- 945 946define half @v_fneg_minnum_f16_ieee(half %a, half %b) #0 { 947; SI-LABEL: v_fneg_minnum_f16_ieee: 948; SI: ; %bb.0: 949; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 950; SI-NEXT: v_cvt_f16_f32_e64 v1, -v1 951; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 952; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 953; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 954; SI-NEXT: v_max_f32_e32 v0, v0, v1 955; SI-NEXT: s_setpc_b64 s[30:31] 956; 957; VI-LABEL: v_fneg_minnum_f16_ieee: 958; VI: ; %bb.0: 959; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 960; VI-NEXT: v_max_f16_e64 v1, -v1, -v1 961; VI-NEXT: v_max_f16_e64 v0, -v0, -v0 962; VI-NEXT: v_max_f16_e32 v0, v0, v1 963; VI-NEXT: s_setpc_b64 s[30:31] 964; 965; GFX11-LABEL: v_fneg_minnum_f16_ieee: 966; GFX11: ; %bb.0: 967; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 968; GFX11-NEXT: v_max_f16_e64 v1, -v1, -v1 969; GFX11-NEXT: v_max_f16_e64 v0, -v0, -v0 970; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 971; GFX11-NEXT: v_max_f16_e32 v0, v0, v1 972; GFX11-NEXT: s_setpc_b64 s[30:31] 973 %min = call half @llvm.minnum.f16(half %a, half %b) 974 %fneg = fneg half %min 975 ret half %fneg 976} 977 978define half @v_fneg_minnum_f16_no_ieee(half %a, half %b) #4 { 979; SI-LABEL: v_fneg_minnum_f16_no_ieee: 980; SI: ; %bb.0: 981; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 982; SI-NEXT: v_cvt_f16_f32_e64 v1, -v1 983; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 984; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 985; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 986; SI-NEXT: v_max_f32_e32 v0, v0, v1 987; SI-NEXT: s_setpc_b64 s[30:31] 988; 989; VI-LABEL: v_fneg_minnum_f16_no_ieee: 990; VI: ; %bb.0: 991; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 992; VI-NEXT: v_max_f16_e64 v0, -v0, -v1 993; VI-NEXT: s_setpc_b64 s[30:31] 994; 995; GFX11-LABEL: v_fneg_minnum_f16_no_ieee: 996; GFX11: ; %bb.0: 997; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 998; GFX11-NEXT: v_max_f16_e64 v0, -v0, -v1 999; 
GFX11-NEXT: s_setpc_b64 s[30:31] 1000 %min = call half @llvm.minnum.f16(half %a, half %b) 1001 %fneg = fneg half %min 1002 ret half %fneg 1003} 1004 1005define half @v_fneg_self_minnum_f16_ieee(half %a) #0 { 1006; SI-LABEL: v_fneg_self_minnum_f16_ieee: 1007; SI: ; %bb.0: 1008; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1009; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 1010; SI-NEXT: s_setpc_b64 s[30:31] 1011; 1012; VI-LABEL: v_fneg_self_minnum_f16_ieee: 1013; VI: ; %bb.0: 1014; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1015; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v0 1016; VI-NEXT: s_setpc_b64 s[30:31] 1017; 1018; GFX11-LABEL: v_fneg_self_minnum_f16_ieee: 1019; GFX11: ; %bb.0: 1020; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1021; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v0 1022; GFX11-NEXT: s_setpc_b64 s[30:31] 1023 %min = call half @llvm.minnum.f16(half %a, half %a) 1024 %min.fneg = fneg half %min 1025 ret half %min.fneg 1026} 1027 1028define half @v_fneg_self_minnum_f16_no_ieee(half %a) #4 { 1029; SI-LABEL: v_fneg_self_minnum_f16_no_ieee: 1030; SI: ; %bb.0: 1031; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1032; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 1033; SI-NEXT: s_setpc_b64 s[30:31] 1034; 1035; VI-LABEL: v_fneg_self_minnum_f16_no_ieee: 1036; VI: ; %bb.0: 1037; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1038; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v0 1039; VI-NEXT: s_setpc_b64 s[30:31] 1040; 1041; GFX11-LABEL: v_fneg_self_minnum_f16_no_ieee: 1042; GFX11: ; %bb.0: 1043; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1044; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v0 1045; GFX11-NEXT: s_setpc_b64 s[30:31] 1046 %min = call half @llvm.minnum.f16(half %a, half %a) 1047 %min.fneg = fneg half %min 1048 ret half %min.fneg 1049} 1050 1051define half @v_fneg_posk_minnum_f16_ieee(half %a) #0 { 1052; SI-LABEL: v_fneg_posk_minnum_f16_ieee: 1053; SI: ; %bb.0: 1054; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1055; SI-NEXT: v_cvt_f16_f32_e64 v0, 
-v0 1056; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 1057; SI-NEXT: v_max_f32_e32 v0, -4.0, v0 1058; SI-NEXT: s_setpc_b64 s[30:31] 1059; 1060; VI-LABEL: v_fneg_posk_minnum_f16_ieee: 1061; VI: ; %bb.0: 1062; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1063; VI-NEXT: v_max_f16_e64 v0, -v0, -v0 1064; VI-NEXT: v_max_f16_e32 v0, -4.0, v0 1065; VI-NEXT: s_setpc_b64 s[30:31] 1066; 1067; GFX11-LABEL: v_fneg_posk_minnum_f16_ieee: 1068; GFX11: ; %bb.0: 1069; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1070; GFX11-NEXT: v_max_f16_e64 v0, -v0, -v0 1071; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1072; GFX11-NEXT: v_max_f16_e32 v0, -4.0, v0 1073; GFX11-NEXT: s_setpc_b64 s[30:31] 1074 %min = call half @llvm.minnum.f16(half 4.0, half %a) 1075 %fneg = fneg half %min 1076 ret half %fneg 1077} 1078 1079define half @v_fneg_posk_minnum_f16_no_ieee(half %a) #4 { 1080; SI-LABEL: v_fneg_posk_minnum_f16_no_ieee: 1081; SI: ; %bb.0: 1082; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1083; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 1084; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 1085; SI-NEXT: v_max_f32_e32 v0, -4.0, v0 1086; SI-NEXT: s_setpc_b64 s[30:31] 1087; 1088; VI-LABEL: v_fneg_posk_minnum_f16_no_ieee: 1089; VI: ; %bb.0: 1090; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1091; VI-NEXT: v_max_f16_e64 v0, -v0, -4.0 1092; VI-NEXT: s_setpc_b64 s[30:31] 1093; 1094; GFX11-LABEL: v_fneg_posk_minnum_f16_no_ieee: 1095; GFX11: ; %bb.0: 1096; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1097; GFX11-NEXT: v_max_f16_e64 v0, -v0, -4.0 1098; GFX11-NEXT: s_setpc_b64 s[30:31] 1099 %min = call half @llvm.minnum.f16(half 4.0, half %a) 1100 %fneg = fneg half %min 1101 ret half %fneg 1102} 1103 1104define half @v_fneg_negk_minnum_f16_ieee(half %a) #0 { 1105; SI-LABEL: v_fneg_negk_minnum_f16_ieee: 1106; SI: ; %bb.0: 1107; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1108; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 1109; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 1110; SI-NEXT: v_max_f32_e32 v0, 4.0, v0 1111; 
SI-NEXT: s_setpc_b64 s[30:31] 1112; 1113; VI-LABEL: v_fneg_negk_minnum_f16_ieee: 1114; VI: ; %bb.0: 1115; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1116; VI-NEXT: v_max_f16_e64 v0, -v0, -v0 1117; VI-NEXT: v_max_f16_e32 v0, 4.0, v0 1118; VI-NEXT: s_setpc_b64 s[30:31] 1119; 1120; GFX11-LABEL: v_fneg_negk_minnum_f16_ieee: 1121; GFX11: ; %bb.0: 1122; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1123; GFX11-NEXT: v_max_f16_e64 v0, -v0, -v0 1124; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1125; GFX11-NEXT: v_max_f16_e32 v0, 4.0, v0 1126; GFX11-NEXT: s_setpc_b64 s[30:31] 1127 %min = call half @llvm.minnum.f16(half -4.0, half %a) 1128 %fneg = fneg half %min 1129 ret half %fneg 1130} 1131 1132define half @v_fneg_negk_minnum_f16_no_ieee(half %a) #4 { 1133; SI-LABEL: v_fneg_negk_minnum_f16_no_ieee: 1134; SI: ; %bb.0: 1135; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1136; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 1137; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 1138; SI-NEXT: v_max_f32_e32 v0, 4.0, v0 1139; SI-NEXT: s_setpc_b64 s[30:31] 1140; 1141; VI-LABEL: v_fneg_negk_minnum_f16_no_ieee: 1142; VI: ; %bb.0: 1143; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1144; VI-NEXT: v_max_f16_e64 v0, -v0, 4.0 1145; VI-NEXT: s_setpc_b64 s[30:31] 1146; 1147; GFX11-LABEL: v_fneg_negk_minnum_f16_no_ieee: 1148; GFX11: ; %bb.0: 1149; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1150; GFX11-NEXT: v_max_f16_e64 v0, -v0, 4.0 1151; GFX11-NEXT: s_setpc_b64 s[30:31] 1152 %min = call half @llvm.minnum.f16(half -4.0, half %a) 1153 %fneg = fneg half %min 1154 ret half %fneg 1155} 1156 1157define half @v_fneg_0_minnum_f16(half %a) #0 { 1158; SI-LABEL: v_fneg_0_minnum_f16: 1159; SI: ; %bb.0: 1160; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1161; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 1162; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 1163; SI-NEXT: v_min_f32_e32 v0, 0, v0 1164; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 1165; SI-NEXT: s_setpc_b64 s[30:31] 1166; 1167; VI-LABEL: v_fneg_0_minnum_f16: 
1168; VI: ; %bb.0: 1169; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1170; VI-NEXT: v_min_f16_e32 v0, 0, v0 1171; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v0 1172; VI-NEXT: s_setpc_b64 s[30:31] 1173; 1174; GFX11-LABEL: v_fneg_0_minnum_f16: 1175; GFX11: ; %bb.0: 1176; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1177; GFX11-NEXT: v_min_f16_e32 v0, 0, v0 1178; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1179; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v0 1180; GFX11-NEXT: s_setpc_b64 s[30:31] 1181 %min = call nnan half @llvm.minnum.f16(half 0.0, half %a) 1182 %fneg = fneg half %min 1183 ret half %fneg 1184} 1185 1186define half @v_fneg_neg0_minnum_f16_ieee(half %a) #0 { 1187; SI-LABEL: v_fneg_neg0_minnum_f16_ieee: 1188; SI: ; %bb.0: 1189; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1190; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 1191; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 1192; SI-NEXT: v_max_f32_e32 v0, 0, v0 1193; SI-NEXT: s_setpc_b64 s[30:31] 1194; 1195; VI-LABEL: v_fneg_neg0_minnum_f16_ieee: 1196; VI: ; %bb.0: 1197; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1198; VI-NEXT: v_max_f16_e64 v0, -v0, -v0 1199; VI-NEXT: v_max_f16_e32 v0, 0, v0 1200; VI-NEXT: s_setpc_b64 s[30:31] 1201; 1202; GFX11-LABEL: v_fneg_neg0_minnum_f16_ieee: 1203; GFX11: ; %bb.0: 1204; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1205; GFX11-NEXT: v_max_f16_e64 v0, -v0, -v0 1206; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1207; GFX11-NEXT: v_max_f16_e32 v0, 0, v0 1208; GFX11-NEXT: s_setpc_b64 s[30:31] 1209 %min = call half @llvm.minnum.f16(half -0.0, half %a) 1210 %fneg = fneg half %min 1211 ret half %fneg 1212} 1213 1214define half @v_fneg_inv2pi_minnum_f16(half %a) #0 { 1215; SI-LABEL: v_fneg_inv2pi_minnum_f16: 1216; SI: ; %bb.0: 1217; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1218; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 1219; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 1220; SI-NEXT: v_max_f32_e32 v0, 0xbe230000, v0 1221; SI-NEXT: s_setpc_b64 s[30:31] 1222; 1223; VI-LABEL: 
v_fneg_inv2pi_minnum_f16: 1224; VI: ; %bb.0: 1225; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1226; VI-NEXT: v_max_f16_e32 v0, v0, v0 1227; VI-NEXT: v_min_f16_e32 v0, 0.15915494, v0 1228; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v0 1229; VI-NEXT: s_setpc_b64 s[30:31] 1230; 1231; GFX11-LABEL: v_fneg_inv2pi_minnum_f16: 1232; GFX11: ; %bb.0: 1233; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1234; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 1235; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1236; GFX11-NEXT: v_min_f16_e32 v0, 0.15915494, v0 1237; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v0 1238; GFX11-NEXT: s_setpc_b64 s[30:31] 1239 %min = call half @llvm.minnum.f16(half 0xH3118, half %a) 1240 %fneg = fneg half %min 1241 ret half %fneg 1242} 1243; fneg of minnum with the negative constant 0xHB118 (0xH3118 with the sign bit set). NOTE(review): the checks for this function were updated by hand after fixing the operand sign; regenerate with utils/update_llc_test_checks.py to confirm. 1244define half @v_fneg_neg_inv2pi_minnum_f16(half %a) #0 { 1245; SI-LABEL: v_fneg_neg_inv2pi_minnum_f16: 1246; SI: ; %bb.0: 1247; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1248; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 1249; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 1250; SI-NEXT: v_max_f32_e32 v0, 0x3e230000, v0 1251; SI-NEXT: s_setpc_b64 s[30:31] 1252; 1253; VI-LABEL: v_fneg_neg_inv2pi_minnum_f16: 1254; VI: ; %bb.0: 1255; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1256; VI-NEXT: v_max_f16_e64 v0, -v0, -v0 1257; VI-NEXT: v_max_f16_e32 v0, 0.15915494, v0 1259; VI-NEXT: s_setpc_b64 s[30:31] 1260; 1261; GFX11-LABEL: v_fneg_neg_inv2pi_minnum_f16: 1262; GFX11: ; %bb.0: 1263; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1264; GFX11-NEXT: v_max_f16_e64 v0, -v0, -v0 1265; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1266; GFX11-NEXT: v_max_f16_e32 v0, 0.15915494, v0 1268; GFX11-NEXT: s_setpc_b64 s[30:31] 1269 %min = call half @llvm.minnum.f16(half 0xHB118, half %a) 1270 %fneg = fneg half %min 1271 ret half %fneg 1272} 1273 1274define half
@v_fneg_neg0_minnum_f16_no_ieee(half %a) #4 { 1275; SI-LABEL: v_fneg_neg0_minnum_f16_no_ieee: 1276; SI: ; %bb.0: 1277; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1278; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 1279; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 1280; SI-NEXT: v_max_f32_e32 v0, 0, v0 1281; SI-NEXT: s_setpc_b64 s[30:31] 1282; 1283; VI-LABEL: v_fneg_neg0_minnum_f16_no_ieee: 1284; VI: ; %bb.0: 1285; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1286; VI-NEXT: v_max_f16_e64 v0, -v0, 0 1287; VI-NEXT: s_setpc_b64 s[30:31] 1288; 1289; GFX11-LABEL: v_fneg_neg0_minnum_f16_no_ieee: 1290; GFX11: ; %bb.0: 1291; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1292; GFX11-NEXT: v_max_f16_e64 v0, -v0, 0 1293; GFX11-NEXT: s_setpc_b64 s[30:31] 1294 %min = call half @llvm.minnum.f16(half -0.0, half %a) 1295 %fneg = fneg half %min 1296 ret half %fneg 1297} 1298 1299define half @v_fneg_0_minnum_foldable_use_f16_ieee(half %a, half %b) #0 { 1300; SI-LABEL: v_fneg_0_minnum_foldable_use_f16_ieee: 1301; SI: ; %bb.0: 1302; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1303; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 1304; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 1305; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 1306; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 1307; SI-NEXT: v_min_f32_e32 v0, 0, v0 1308; SI-NEXT: v_mul_f32_e64 v0, -v0, v1 1309; SI-NEXT: s_setpc_b64 s[30:31] 1310; 1311; VI-LABEL: v_fneg_0_minnum_foldable_use_f16_ieee: 1312; VI: ; %bb.0: 1313; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1314; VI-NEXT: v_max_f16_e32 v0, v0, v0 1315; VI-NEXT: v_min_f16_e32 v0, 0, v0 1316; VI-NEXT: v_mul_f16_e64 v0, -v0, v1 1317; VI-NEXT: s_setpc_b64 s[30:31] 1318; 1319; GFX11-LABEL: v_fneg_0_minnum_foldable_use_f16_ieee: 1320; GFX11: ; %bb.0: 1321; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1322; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 1323; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1324; GFX11-NEXT: v_min_f16_e32 v0, 0, v0 1325; GFX11-NEXT: v_mul_f16_e64 v0, -v0, 
v1 1326; GFX11-NEXT: s_setpc_b64 s[30:31] 1327 %min = call half @llvm.minnum.f16(half 0.0, half %a) 1328 %fneg = fneg half %min 1329 %mul = fmul half %fneg, %b 1330 ret half %mul 1331} 1332 1333define half @v_fneg_inv2pi_minnum_foldable_use_f16(half %a, half %b) #0 { 1334; SI-LABEL: v_fneg_inv2pi_minnum_foldable_use_f16: 1335; SI: ; %bb.0: 1336; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1337; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 1338; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 1339; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 1340; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 1341; SI-NEXT: v_max_f32_e32 v0, 0xbe230000, v0 1342; SI-NEXT: v_mul_f32_e32 v0, v0, v1 1343; SI-NEXT: s_setpc_b64 s[30:31] 1344; 1345; VI-LABEL: v_fneg_inv2pi_minnum_foldable_use_f16: 1346; VI: ; %bb.0: 1347; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1348; VI-NEXT: v_max_f16_e32 v0, v0, v0 1349; VI-NEXT: v_min_f16_e32 v0, 0.15915494, v0 1350; VI-NEXT: v_mul_f16_e64 v0, -v0, v1 1351; VI-NEXT: s_setpc_b64 s[30:31] 1352; 1353; GFX11-LABEL: v_fneg_inv2pi_minnum_foldable_use_f16: 1354; GFX11: ; %bb.0: 1355; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1356; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 1357; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1358; GFX11-NEXT: v_min_f16_e32 v0, 0.15915494, v0 1359; GFX11-NEXT: v_mul_f16_e64 v0, -v0, v1 1360; GFX11-NEXT: s_setpc_b64 s[30:31] 1361 %min = call half @llvm.minnum.f16(half 0xH3118, half %a) 1362 %fneg = fneg half %min 1363 %mul = fmul half %fneg, %b 1364 ret half %mul 1365} 1366 1367define half @v_fneg_0_minnum_foldable_use_f16_no_ieee(half %a, half %b) #4 { 1368; SI-LABEL: v_fneg_0_minnum_foldable_use_f16_no_ieee: 1369; SI: ; %bb.0: 1370; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1371; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 1372; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 1373; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 1374; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 1375; SI-NEXT: v_min_f32_e32 v0, 0, v0 1376; SI-NEXT: v_mul_f32_e64 v0, -v0, v1 
1377; SI-NEXT: s_setpc_b64 s[30:31] 1378; 1379; VI-LABEL: v_fneg_0_minnum_foldable_use_f16_no_ieee: 1380; VI: ; %bb.0: 1381; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1382; VI-NEXT: v_min_f16_e32 v0, 0, v0 1383; VI-NEXT: v_mul_f16_e64 v0, -v0, v1 1384; VI-NEXT: s_setpc_b64 s[30:31] 1385; 1386; GFX11-LABEL: v_fneg_0_minnum_foldable_use_f16_no_ieee: 1387; GFX11: ; %bb.0: 1388; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1389; GFX11-NEXT: v_min_f16_e32 v0, 0, v0 1390; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1391; GFX11-NEXT: v_mul_f16_e64 v0, -v0, v1 1392; GFX11-NEXT: s_setpc_b64 s[30:31] 1393 %min = call half @llvm.minnum.f16(half 0.0, half %a) 1394 %fneg = fneg half %min 1395 %mul = fmul half %fneg, %b 1396 ret half %mul 1397} 1398 1399define { half, half } @v_fneg_minnum_multi_use_minnum_f16_ieee(half %a, half %b) #0 { 1400; SI-LABEL: v_fneg_minnum_multi_use_minnum_f16_ieee: 1401; SI: ; %bb.0: 1402; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1403; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 1404; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 1405; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1 1406; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 1407; SI-NEXT: v_max_f32_e32 v0, v0, v1 1408; SI-NEXT: v_mul_f32_e32 v1, -4.0, v0 1409; SI-NEXT: s_setpc_b64 s[30:31] 1410; 1411; VI-LABEL: v_fneg_minnum_multi_use_minnum_f16_ieee: 1412; VI: ; %bb.0: 1413; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1414; VI-NEXT: v_max_f16_e64 v1, -v1, -v1 1415; VI-NEXT: v_max_f16_e64 v0, -v0, -v0 1416; VI-NEXT: v_max_f16_e32 v0, v0, v1 1417; VI-NEXT: v_mul_f16_e32 v1, -4.0, v0 1418; VI-NEXT: s_setpc_b64 s[30:31] 1419; 1420; GFX11-LABEL: v_fneg_minnum_multi_use_minnum_f16_ieee: 1421; GFX11: ; %bb.0: 1422; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1423; GFX11-NEXT: v_max_f16_e64 v1, -v1, -v1 1424; GFX11-NEXT: v_max_f16_e64 v0, -v0, -v0 1425; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1426; GFX11-NEXT: v_max_f16_e32 v0, v0, v1 1427; GFX11-NEXT: v_mul_f16_e32 
v1, -4.0, v0 1428; GFX11-NEXT: s_setpc_b64 s[30:31] 1429 %min = call half @llvm.minnum.f16(half %a, half %b) 1430 %fneg = fneg half %min 1431 %use1 = fmul half %min, 4.0 1432 %insert.0 = insertvalue { half, half } poison, half %fneg, 0 1433 %insert.1 = insertvalue { half, half } %insert.0, half %use1, 1 1434 ret { half, half } %insert.1 1435} 1436; Returns fneg(minnum(a, b)) in element 0 and minnum(a, b) * 4.0 in element 1 of a <2 x half>, so the minnum has a second use besides the fneg. 1437define <2 x half> @v_fneg_minnum_multi_use_minnum_f16_no_ieee(half %a, half %b) #4 { 1438; SI-LABEL: v_fneg_minnum_multi_use_minnum_f16_no_ieee: 1439; SI: ; %bb.0: 1440; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1441; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 1442; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 1443; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1 1444; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 1445; SI-NEXT: v_max_f32_e32 v0, v0, v1 1446; SI-NEXT: v_mul_f32_e32 v1, -4.0, v0 1447; SI-NEXT: s_setpc_b64 s[30:31] 1448; 1449; VI-LABEL: v_fneg_minnum_multi_use_minnum_f16_no_ieee: 1450; VI: ; %bb.0: 1451; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1452; VI-NEXT: v_max_f16_e64 v0, -v0, -v1 1453; VI-NEXT: v_mov_b32_e32 v1, 0xc400 1454; VI-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 1455; VI-NEXT: v_or_b32_e32 v0, v0, v1 1456; VI-NEXT: s_setpc_b64 s[30:31] 1457; 1458; GFX11-LABEL: v_fneg_minnum_multi_use_minnum_f16_no_ieee: 1459; GFX11: ; %bb.0: 1460; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1461; GFX11-NEXT: v_min_f16_e32 v0, v0, v1 1462; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1463; GFX11-NEXT: v_mul_f16_e32 v1, 4.0, v0 1464; GFX11-NEXT: v_pack_b32_f16 v0, -v0, v1 1465; GFX11-NEXT: s_setpc_b64 s[30:31] 1466 %min = call half @llvm.minnum.f16(half %a, half %b) 1467 %fneg = fneg half %min 1468 %use1 = fmul half %min, 4.0 1469 %ins0 = insertelement <2 x half> poison, half %fneg, i32 0 1470 %ins1 = insertelement <2 x half> %ins0, half %use1, i32 1 1471 ret <2 x half> %ins1 1472} 1473 1474;
-------------------------------------------------------------------------------- 1475; fmaxnum tests 1476; -------------------------------------------------------------------------------- 1477 1478define half @v_fneg_maxnum_f16_ieee(half %a, half %b) #0 { 1479; SI-LABEL: v_fneg_maxnum_f16_ieee: 1480; SI: ; %bb.0: 1481; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1482; SI-NEXT: v_cvt_f16_f32_e64 v1, -v1 1483; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 1484; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 1485; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 1486; SI-NEXT: v_min_f32_e32 v0, v0, v1 1487; SI-NEXT: s_setpc_b64 s[30:31] 1488; 1489; VI-LABEL: v_fneg_maxnum_f16_ieee: 1490; VI: ; %bb.0: 1491; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1492; VI-NEXT: v_max_f16_e64 v1, -v1, -v1 1493; VI-NEXT: v_max_f16_e64 v0, -v0, -v0 1494; VI-NEXT: v_min_f16_e32 v0, v0, v1 1495; VI-NEXT: s_setpc_b64 s[30:31] 1496; 1497; GFX11-LABEL: v_fneg_maxnum_f16_ieee: 1498; GFX11: ; %bb.0: 1499; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1500; GFX11-NEXT: v_max_f16_e64 v1, -v1, -v1 1501; GFX11-NEXT: v_max_f16_e64 v0, -v0, -v0 1502; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1503; GFX11-NEXT: v_min_f16_e32 v0, v0, v1 1504; GFX11-NEXT: s_setpc_b64 s[30:31] 1505 %max = call half @llvm.maxnum.f16(half %a, half %b) 1506 %fneg = fneg half %max 1507 ret half %fneg 1508} 1509 1510define half @v_fneg_maxnum_f16_no_ieee(half %a, half %b) #4 { 1511; SI-LABEL: v_fneg_maxnum_f16_no_ieee: 1512; SI: ; %bb.0: 1513; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1514; SI-NEXT: v_cvt_f16_f32_e64 v1, -v1 1515; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 1516; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 1517; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 1518; SI-NEXT: v_min_f32_e32 v0, v0, v1 1519; SI-NEXT: s_setpc_b64 s[30:31] 1520; 1521; VI-LABEL: v_fneg_maxnum_f16_no_ieee: 1522; VI: ; %bb.0: 1523; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1524; VI-NEXT: v_min_f16_e64 v0, -v0, -v1 1525; VI-NEXT: s_setpc_b64 s[30:31] 1526; 1527; 
GFX11-LABEL: v_fneg_maxnum_f16_no_ieee: 1528; GFX11: ; %bb.0: 1529; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1530; GFX11-NEXT: v_min_f16_e64 v0, -v0, -v1 1531; GFX11-NEXT: s_setpc_b64 s[30:31] 1532 %max = call half @llvm.maxnum.f16(half %a, half %b) 1533 %fneg = fneg half %max 1534 ret half %fneg 1535} 1536 1537define half @v_fneg_self_maxnum_f16_ieee(half %a) #0 { 1538; SI-LABEL: v_fneg_self_maxnum_f16_ieee: 1539; SI: ; %bb.0: 1540; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1541; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 1542; SI-NEXT: s_setpc_b64 s[30:31] 1543; 1544; VI-LABEL: v_fneg_self_maxnum_f16_ieee: 1545; VI: ; %bb.0: 1546; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1547; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v0 1548; VI-NEXT: s_setpc_b64 s[30:31] 1549; 1550; GFX11-LABEL: v_fneg_self_maxnum_f16_ieee: 1551; GFX11: ; %bb.0: 1552; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1553; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v0 1554; GFX11-NEXT: s_setpc_b64 s[30:31] 1555 %max = call half @llvm.maxnum.f16(half %a, half %a) 1556 %max.fneg = fneg half %max 1557 ret half %max.fneg 1558} 1559 1560define half @v_fneg_self_maxnum_f16_no_ieee(half %a) #4 { 1561; SI-LABEL: v_fneg_self_maxnum_f16_no_ieee: 1562; SI: ; %bb.0: 1563; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1564; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 1565; SI-NEXT: s_setpc_b64 s[30:31] 1566; 1567; VI-LABEL: v_fneg_self_maxnum_f16_no_ieee: 1568; VI: ; %bb.0: 1569; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1570; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v0 1571; VI-NEXT: s_setpc_b64 s[30:31] 1572; 1573; GFX11-LABEL: v_fneg_self_maxnum_f16_no_ieee: 1574; GFX11: ; %bb.0: 1575; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1576; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v0 1577; GFX11-NEXT: s_setpc_b64 s[30:31] 1578 %max = call half @llvm.maxnum.f16(half %a, half %a) 1579 %max.fneg = fneg half %max 1580 ret half %max.fneg 1581} 1582 1583define half 
@v_fneg_posk_maxnum_f16_ieee(half %a) #0 { 1584; SI-LABEL: v_fneg_posk_maxnum_f16_ieee: 1585; SI: ; %bb.0: 1586; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1587; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 1588; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 1589; SI-NEXT: v_min_f32_e32 v0, -4.0, v0 1590; SI-NEXT: s_setpc_b64 s[30:31] 1591; 1592; VI-LABEL: v_fneg_posk_maxnum_f16_ieee: 1593; VI: ; %bb.0: 1594; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1595; VI-NEXT: v_max_f16_e64 v0, -v0, -v0 1596; VI-NEXT: v_min_f16_e32 v0, -4.0, v0 1597; VI-NEXT: s_setpc_b64 s[30:31] 1598; 1599; GFX11-LABEL: v_fneg_posk_maxnum_f16_ieee: 1600; GFX11: ; %bb.0: 1601; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1602; GFX11-NEXT: v_max_f16_e64 v0, -v0, -v0 1603; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1604; GFX11-NEXT: v_min_f16_e32 v0, -4.0, v0 1605; GFX11-NEXT: s_setpc_b64 s[30:31] 1606 %max = call half @llvm.maxnum.f16(half 4.0, half %a) 1607 %fneg = fneg half %max 1608 ret half %fneg 1609} 1610 1611define half @v_fneg_posk_maxnum_f16_no_ieee(half %a) #4 { 1612; SI-LABEL: v_fneg_posk_maxnum_f16_no_ieee: 1613; SI: ; %bb.0: 1614; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1615; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 1616; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 1617; SI-NEXT: v_min_f32_e32 v0, -4.0, v0 1618; SI-NEXT: s_setpc_b64 s[30:31] 1619; 1620; VI-LABEL: v_fneg_posk_maxnum_f16_no_ieee: 1621; VI: ; %bb.0: 1622; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1623; VI-NEXT: v_min_f16_e64 v0, -v0, -4.0 1624; VI-NEXT: s_setpc_b64 s[30:31] 1625; 1626; GFX11-LABEL: v_fneg_posk_maxnum_f16_no_ieee: 1627; GFX11: ; %bb.0: 1628; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1629; GFX11-NEXT: v_min_f16_e64 v0, -v0, -4.0 1630; GFX11-NEXT: s_setpc_b64 s[30:31] 1631 %max = call half @llvm.maxnum.f16(half 4.0, half %a) 1632 %fneg = fneg half %max 1633 ret half %fneg 1634} 1635 1636define half @v_fneg_negk_maxnum_f16_ieee(half %a) #0 { 1637; SI-LABEL: v_fneg_negk_maxnum_f16_ieee: 1638; SI: 
; %bb.0: 1639; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1640; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 1641; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 1642; SI-NEXT: v_min_f32_e32 v0, 4.0, v0 1643; SI-NEXT: s_setpc_b64 s[30:31] 1644; 1645; VI-LABEL: v_fneg_negk_maxnum_f16_ieee: 1646; VI: ; %bb.0: 1647; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1648; VI-NEXT: v_max_f16_e64 v0, -v0, -v0 1649; VI-NEXT: v_min_f16_e32 v0, 4.0, v0 1650; VI-NEXT: s_setpc_b64 s[30:31] 1651; 1652; GFX11-LABEL: v_fneg_negk_maxnum_f16_ieee: 1653; GFX11: ; %bb.0: 1654; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1655; GFX11-NEXT: v_max_f16_e64 v0, -v0, -v0 1656; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1657; GFX11-NEXT: v_min_f16_e32 v0, 4.0, v0 1658; GFX11-NEXT: s_setpc_b64 s[30:31] 1659 %max = call half @llvm.maxnum.f16(half -4.0, half %a) 1660 %fneg = fneg half %max 1661 ret half %fneg 1662} 1663 1664define half @v_fneg_negk_maxnum_f16_no_ieee(half %a) #4 { 1665; SI-LABEL: v_fneg_negk_maxnum_f16_no_ieee: 1666; SI: ; %bb.0: 1667; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1668; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 1669; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 1670; SI-NEXT: v_min_f32_e32 v0, 4.0, v0 1671; SI-NEXT: s_setpc_b64 s[30:31] 1672; 1673; VI-LABEL: v_fneg_negk_maxnum_f16_no_ieee: 1674; VI: ; %bb.0: 1675; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1676; VI-NEXT: v_min_f16_e64 v0, -v0, 4.0 1677; VI-NEXT: s_setpc_b64 s[30:31] 1678; 1679; GFX11-LABEL: v_fneg_negk_maxnum_f16_no_ieee: 1680; GFX11: ; %bb.0: 1681; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1682; GFX11-NEXT: v_min_f16_e64 v0, -v0, 4.0 1683; GFX11-NEXT: s_setpc_b64 s[30:31] 1684 %max = call half @llvm.maxnum.f16(half -4.0, half %a) 1685 %fneg = fneg half %max 1686 ret half %fneg 1687} 1688 1689define half @v_fneg_0_maxnum_f16(half %a) #0 { 1690; SI-LABEL: v_fneg_0_maxnum_f16: 1691; SI: ; %bb.0: 1692; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1693; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 1694; SI-NEXT: 
v_cvt_f32_f16_e32 v0, v0 1695; SI-NEXT: v_max_f32_e32 v0, 0, v0 1696; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 1697; SI-NEXT: s_setpc_b64 s[30:31] 1698; 1699; VI-LABEL: v_fneg_0_maxnum_f16: 1700; VI: ; %bb.0: 1701; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1702; VI-NEXT: v_max_f16_e32 v0, 0, v0 1703; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v0 1704; VI-NEXT: s_setpc_b64 s[30:31] 1705; 1706; GFX11-LABEL: v_fneg_0_maxnum_f16: 1707; GFX11: ; %bb.0: 1708; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1709; GFX11-NEXT: v_max_f16_e32 v0, 0, v0 1710; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1711; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v0 1712; GFX11-NEXT: s_setpc_b64 s[30:31] 1713 %max = call nnan half @llvm.maxnum.f16(half 0.0, half %a) 1714 %fneg = fneg half %max 1715 ret half %fneg 1716} 1717 1718define half @v_fneg_neg0_maxnum_f16_ieee(half %a) #0 { 1719; SI-LABEL: v_fneg_neg0_maxnum_f16_ieee: 1720; SI: ; %bb.0: 1721; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1722; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 1723; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 1724; SI-NEXT: v_min_f32_e32 v0, 0, v0 1725; SI-NEXT: s_setpc_b64 s[30:31] 1726; 1727; VI-LABEL: v_fneg_neg0_maxnum_f16_ieee: 1728; VI: ; %bb.0: 1729; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1730; VI-NEXT: v_max_f16_e64 v0, -v0, -v0 1731; VI-NEXT: v_min_f16_e32 v0, 0, v0 1732; VI-NEXT: s_setpc_b64 s[30:31] 1733; 1734; GFX11-LABEL: v_fneg_neg0_maxnum_f16_ieee: 1735; GFX11: ; %bb.0: 1736; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1737; GFX11-NEXT: v_max_f16_e64 v0, -v0, -v0 1738; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1739; GFX11-NEXT: v_min_f16_e32 v0, 0, v0 1740; GFX11-NEXT: s_setpc_b64 s[30:31] 1741 %max = call half @llvm.maxnum.f16(half -0.0, half %a) 1742 %fneg = fneg half %max 1743 ret half %fneg 1744} 1745 1746define half @v_fneg_neg0_maxnum_f16_no_ieee(half %a) #4 { 1747; SI-LABEL: v_fneg_neg0_maxnum_f16_no_ieee: 1748; SI: ; %bb.0: 1749; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
1750; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 1751; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 1752; SI-NEXT: v_min_f32_e32 v0, 0, v0 1753; SI-NEXT: s_setpc_b64 s[30:31] 1754; 1755; VI-LABEL: v_fneg_neg0_maxnum_f16_no_ieee: 1756; VI: ; %bb.0: 1757; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1758; VI-NEXT: v_min_f16_e64 v0, -v0, 0 1759; VI-NEXT: s_setpc_b64 s[30:31] 1760; 1761; GFX11-LABEL: v_fneg_neg0_maxnum_f16_no_ieee: 1762; GFX11: ; %bb.0: 1763; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1764; GFX11-NEXT: v_min_f16_e64 v0, -v0, 0 1765; GFX11-NEXT: s_setpc_b64 s[30:31] 1766 %max = call half @llvm.maxnum.f16(half -0.0, half %a) 1767 %fneg = fneg half %max 1768 ret half %fneg 1769} 1770 1771define half @v_fneg_0_maxnum_foldable_use_f16_ieee(half %a, half %b) #0 { 1772; SI-LABEL: v_fneg_0_maxnum_foldable_use_f16_ieee: 1773; SI: ; %bb.0: 1774; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1775; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 1776; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 1777; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 1778; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 1779; SI-NEXT: v_max_f32_e32 v0, 0, v0 1780; SI-NEXT: v_mul_f32_e64 v0, -v0, v1 1781; SI-NEXT: s_setpc_b64 s[30:31] 1782; 1783; VI-LABEL: v_fneg_0_maxnum_foldable_use_f16_ieee: 1784; VI: ; %bb.0: 1785; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1786; VI-NEXT: v_max_f16_e32 v0, v0, v0 1787; VI-NEXT: v_max_f16_e32 v0, 0, v0 1788; VI-NEXT: v_mul_f16_e64 v0, -v0, v1 1789; VI-NEXT: s_setpc_b64 s[30:31] 1790; 1791; GFX11-LABEL: v_fneg_0_maxnum_foldable_use_f16_ieee: 1792; GFX11: ; %bb.0: 1793; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1794; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 1795; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1796; GFX11-NEXT: v_max_f16_e32 v0, 0, v0 1797; GFX11-NEXT: v_mul_f16_e64 v0, -v0, v1 1798; GFX11-NEXT: s_setpc_b64 s[30:31] 1799 %max = call half @llvm.maxnum.f16(half 0.0, half %a) 1800 %fneg = fneg half %max 1801 %mul = fmul half %fneg, %b 1802 ret 
half %mul 1803} 1804 1805define half @v_fneg_0_maxnum_foldable_use_f16_no_ieee(half %a, half %b) #4 { 1806; SI-LABEL: v_fneg_0_maxnum_foldable_use_f16_no_ieee: 1807; SI: ; %bb.0: 1808; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1809; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 1810; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 1811; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 1812; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 1813; SI-NEXT: v_max_f32_e32 v0, 0, v0 1814; SI-NEXT: v_mul_f32_e64 v0, -v0, v1 1815; SI-NEXT: s_setpc_b64 s[30:31] 1816; 1817; VI-LABEL: v_fneg_0_maxnum_foldable_use_f16_no_ieee: 1818; VI: ; %bb.0: 1819; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1820; VI-NEXT: v_max_f16_e32 v0, 0, v0 1821; VI-NEXT: v_mul_f16_e64 v0, -v0, v1 1822; VI-NEXT: s_setpc_b64 s[30:31] 1823; 1824; GFX11-LABEL: v_fneg_0_maxnum_foldable_use_f16_no_ieee: 1825; GFX11: ; %bb.0: 1826; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1827; GFX11-NEXT: v_max_f16_e32 v0, 0, v0 1828; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1829; GFX11-NEXT: v_mul_f16_e64 v0, -v0, v1 1830; GFX11-NEXT: s_setpc_b64 s[30:31] 1831 %max = call half @llvm.maxnum.f16(half 0.0, half %a) 1832 %fneg = fneg half %max 1833 %mul = fmul half %fneg, %b 1834 ret half %mul 1835} 1836 1837define { half, half } @v_fneg_maxnum_multi_use_maxnum_f16_ieee(half %a, half %b) #0 { 1838; SI-LABEL: v_fneg_maxnum_multi_use_maxnum_f16_ieee: 1839; SI: ; %bb.0: 1840; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1841; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 1842; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 1843; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1 1844; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 1845; SI-NEXT: v_min_f32_e32 v0, v0, v1 1846; SI-NEXT: v_mul_f32_e32 v1, -4.0, v0 1847; SI-NEXT: s_setpc_b64 s[30:31] 1848; 1849; VI-LABEL: v_fneg_maxnum_multi_use_maxnum_f16_ieee: 1850; VI: ; %bb.0: 1851; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1852; VI-NEXT: v_max_f16_e64 v1, -v1, -v1 1853; VI-NEXT: v_max_f16_e64 v0, -v0, -v0 1854; VI-NEXT: v_min_f16_e32 v0, v0, v1 
1855; VI-NEXT: v_mul_f16_e32 v1, -4.0, v0 1856; VI-NEXT: s_setpc_b64 s[30:31] 1857; 1858; GFX11-LABEL: v_fneg_maxnum_multi_use_maxnum_f16_ieee: 1859; GFX11: ; %bb.0: 1860; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1861; GFX11-NEXT: v_max_f16_e64 v1, -v1, -v1 1862; GFX11-NEXT: v_max_f16_e64 v0, -v0, -v0 1863; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1864; GFX11-NEXT: v_min_f16_e32 v0, v0, v1 1865; GFX11-NEXT: v_mul_f16_e32 v1, -4.0, v0 1866; GFX11-NEXT: s_setpc_b64 s[30:31] 1867 %max = call half @llvm.maxnum.f16(half %a, half %b) 1868 %fneg = fneg half %max 1869 %use1 = fmul half %max, 4.0 1870 %insert.0 = insertvalue { half, half } poison, half %fneg, 0 1871 %insert.1 = insertvalue { half, half } %insert.0, half %use1, 1 1872 ret { half, half } %insert.1 1873} 1874; The maxnum result has a second, non-negated user (fmul by 4.0); both values are returned in a <2 x half>. 1875define <2 x half> @v_fneg_maxnum_multi_use_maxnum_f16_no_ieee(half %a, half %b) #4 { 1876; SI-LABEL: v_fneg_maxnum_multi_use_maxnum_f16_no_ieee: 1877; SI: ; %bb.0: 1878; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1879; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 1880; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 1881; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1 1882; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 1883; SI-NEXT: v_min_f32_e32 v0, v0, v1 1884; SI-NEXT: v_mul_f32_e32 v1, -4.0, v0 1885; SI-NEXT: s_setpc_b64 s[30:31] 1886; 1887; VI-LABEL: v_fneg_maxnum_multi_use_maxnum_f16_no_ieee: 1888; VI: ; %bb.0: 1889; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1890; VI-NEXT: v_min_f16_e64 v0, -v0, -v1 1891; VI-NEXT: v_mov_b32_e32 v1, 0xc400 1892; VI-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 1893; VI-NEXT: v_or_b32_e32 v0, v0, v1 1894; VI-NEXT: s_setpc_b64 s[30:31] 1895; 1896; GFX11-LABEL: v_fneg_maxnum_multi_use_maxnum_f16_no_ieee: 1897; GFX11: ; %bb.0: 1898; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1899; GFX11-NEXT: v_max_f16_e32 v0, v0, v1 1900; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) |
instid1(VALU_DEP_1) 1901; GFX11-NEXT: v_mul_f16_e32 v1, 4.0, v0 1902; GFX11-NEXT: v_pack_b32_f16 v0, -v0, v1 1903; GFX11-NEXT: s_setpc_b64 s[30:31] 1904 %max = call half @llvm.maxnum.f16(half %a, half %b) 1905 %fneg = fneg half %max 1906 %use1 = fmul half %max, 4.0 1907 %ins0 = insertelement <2 x half> poison, half %fneg, i32 0 1908 %ins1 = insertelement <2 x half> %ins0, half %use1, i32 1 1909 ret <2 x half> %ins1 1910} 1911 1912; -------------------------------------------------------------------------------- 1913; fma tests 1914; -------------------------------------------------------------------------------- 1915 1916define half @v_fneg_fma_f16(half %a, half %b, half %c) #0 { 1917; SI-SAFE-LABEL: v_fneg_fma_f16: 1918; SI-SAFE: ; %bb.0: 1919; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1920; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 1921; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 1922; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 1923; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 1924; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 1925; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 1926; SI-SAFE-NEXT: v_fma_f32 v0, v0, v1, v2 1927; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 1928; SI-SAFE-NEXT: s_setpc_b64 s[30:31] 1929; 1930; SI-NSZ-LABEL: v_fneg_fma_f16: 1931; SI-NSZ: ; %bb.0: 1932; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1933; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2 1934; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 1935; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 1936; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 1937; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 1938; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 1939; SI-NSZ-NEXT: v_fma_f32 v0, v0, -v1, -v2 1940; SI-NSZ-NEXT: s_setpc_b64 s[30:31] 1941; 1942; VI-SAFE-LABEL: v_fneg_fma_f16: 1943; VI-SAFE: ; %bb.0: 1944; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1945; VI-SAFE-NEXT: v_fma_f16 v0, v0, v1, v2 1946; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v0 1947; VI-SAFE-NEXT: s_setpc_b64 s[30:31] 1948; 1949; VI-NSZ-LABEL: v_fneg_fma_f16:
1950; VI-NSZ: ; %bb.0: 1951; VI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1952; VI-NSZ-NEXT: v_fma_f16 v0, v0, -v1, -v2 1953; VI-NSZ-NEXT: s_setpc_b64 s[30:31] 1954; 1955; GFX11-SAFE-LABEL: v_fneg_fma_f16: 1956; GFX11-SAFE: ; %bb.0: 1957; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1958; GFX11-SAFE-NEXT: v_fmac_f16_e32 v2, v0, v1 1959; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) 1960; GFX11-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v2 1961; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] 1962; 1963; GFX11-NSZ-LABEL: v_fneg_fma_f16: 1964; GFX11-NSZ: ; %bb.0: 1965; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1966; GFX11-NSZ-NEXT: v_fma_f16 v0, v0, -v1, -v2 1967; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31] 1968 %fma = call half @llvm.fma.f16(half %a, half %b, half %c) 1969 %fneg = fneg half %fma 1970 ret half %fneg 1971} 1972 1973define { half, half } @v_fneg_fma_store_use_fma_f16(half %a, half %b, half %c) #0 { 1974; SI-LABEL: v_fneg_fma_store_use_fma_f16: 1975; SI: ; %bb.0: 1976; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1977; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 1978; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 1979; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 1980; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 1981; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 1982; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 1983; SI-NEXT: v_fma_f32 v1, v0, v1, v2 1984; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v1 1985; SI-NEXT: s_setpc_b64 s[30:31] 1986; 1987; VI-LABEL: v_fneg_fma_store_use_fma_f16: 1988; VI: ; %bb.0: 1989; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1990; VI-NEXT: v_fma_f16 v1, v0, v1, v2 1991; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v1 1992; VI-NEXT: s_setpc_b64 s[30:31] 1993; 1994; GFX11-LABEL: v_fneg_fma_store_use_fma_f16: 1995; GFX11: ; %bb.0: 1996; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1997; GFX11-NEXT: v_fma_f16 v1, v0, v1, v2 1998; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1999; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v1 2000; GFX11-NEXT: s_setpc_b64 s[30:31] 2001 %fma = call 
half @llvm.fma.f16(half %a, half %b, half %c) 2002 %fneg = fneg half %fma 2003 %insert.0 = insertvalue { half, half } poison, half %fneg, 0 2004 %insert.1 = insertvalue { half, half } %insert.0, half %fma, 1 2005 ret { half, half } %insert.1 2006} 2007 2008define { half, half } @v_fneg_fma_multi_use_fma_f16(half %a, half %b, half %c) #0 { 2009; SI-SAFE-LABEL: v_fneg_fma_multi_use_fma_f16: 2010; SI-SAFE: ; %bb.0: 2011; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2012; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 2013; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 2014; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 2015; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 2016; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 2017; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 2018; SI-SAFE-NEXT: v_fma_f32 v1, v0, v1, v2 2019; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v1 2020; SI-SAFE-NEXT: v_mul_f32_e32 v1, 4.0, v1 2021; SI-SAFE-NEXT: s_setpc_b64 s[30:31] 2022; 2023; SI-NSZ-LABEL: v_fneg_fma_multi_use_fma_f16: 2024; SI-NSZ: ; %bb.0: 2025; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2026; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2 2027; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 2028; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 2029; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 2030; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 2031; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 2032; SI-NSZ-NEXT: v_fma_f32 v0, v0, -v1, -v2 2033; SI-NSZ-NEXT: v_mul_f32_e32 v1, -4.0, v0 2034; SI-NSZ-NEXT: s_setpc_b64 s[30:31] 2035; 2036; VI-SAFE-LABEL: v_fneg_fma_multi_use_fma_f16: 2037; VI-SAFE: ; %bb.0: 2038; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2039; VI-SAFE-NEXT: v_fma_f16 v1, v0, v1, v2 2040; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v1 2041; VI-SAFE-NEXT: v_mul_f16_e32 v1, 4.0, v1 2042; VI-SAFE-NEXT: s_setpc_b64 s[30:31] 2043; 2044; VI-NSZ-LABEL: v_fneg_fma_multi_use_fma_f16: 2045; VI-NSZ: ; %bb.0: 2046; VI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2047; VI-NSZ-NEXT: v_fma_f16 v0, v0, -v1, -v2 2048; VI-NSZ-NEXT: 
v_mul_f16_e32 v1, -4.0, v0 2049; VI-NSZ-NEXT: s_setpc_b64 s[30:31] 2050; 2051; GFX11-SAFE-LABEL: v_fneg_fma_multi_use_fma_f16: 2052; GFX11-SAFE: ; %bb.0: 2053; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2054; GFX11-SAFE-NEXT: v_fmac_f16_e32 v2, v0, v1 2055; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) 2056; GFX11-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v2 2057; GFX11-SAFE-NEXT: v_mul_f16_e32 v1, 4.0, v2 2058; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] 2059; 2060; GFX11-NSZ-LABEL: v_fneg_fma_multi_use_fma_f16: 2061; GFX11-NSZ: ; %bb.0: 2062; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2063; GFX11-NSZ-NEXT: v_fma_f16 v0, v0, -v1, -v2 2064; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) 2065; GFX11-NSZ-NEXT: v_mul_f16_e32 v1, -4.0, v0 2066; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31] 2067 %fma = call half @llvm.fma.f16(half %a, half %b, half %c) 2068 %fneg = fneg half %fma 2069 %use1 = fmul half %fma, 4.0 2070 %insert.0 = insertvalue { half, half } poison, half %fneg, 0 2071 %insert.1 = insertvalue { half, half } %insert.0, half %use1, 1 2072 ret { half, half } %insert.1 2073} 2074 2075define half @v_fneg_fma_fneg_x_y_f16(half %a, half %b, half %c) #0 { 2076; SI-SAFE-LABEL: v_fneg_fma_fneg_x_y_f16: 2077; SI-SAFE: ; %bb.0: 2078; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2079; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 2080; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 2081; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 2082; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 2083; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 2084; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 2085; SI-SAFE-NEXT: v_fma_f32 v0, -v0, v1, v2 2086; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 2087; SI-SAFE-NEXT: s_setpc_b64 s[30:31] 2088; 2089; SI-NSZ-LABEL: v_fneg_fma_fneg_x_y_f16: 2090; SI-NSZ: ; %bb.0: 2091; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2092; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2 2093; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 2094; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 2095; 
SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 2096; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 2097; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 2098; SI-NSZ-NEXT: v_fma_f32 v0, v0, v1, -v2 2099; SI-NSZ-NEXT: s_setpc_b64 s[30:31] 2100; 2101; VI-SAFE-LABEL: v_fneg_fma_fneg_x_y_f16: 2102; VI-SAFE: ; %bb.0: 2103; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2104; VI-SAFE-NEXT: v_fma_f16 v0, -v0, v1, v2 2105; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v0 2106; VI-SAFE-NEXT: s_setpc_b64 s[30:31] 2107; 2108; VI-NSZ-LABEL: v_fneg_fma_fneg_x_y_f16: 2109; VI-NSZ: ; %bb.0: 2110; VI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2111; VI-NSZ-NEXT: v_fma_f16 v0, v0, v1, -v2 2112; VI-NSZ-NEXT: s_setpc_b64 s[30:31] 2113; 2114; GFX11-SAFE-LABEL: v_fneg_fma_fneg_x_y_f16: 2115; GFX11-SAFE: ; %bb.0: 2116; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2117; GFX11-SAFE-NEXT: v_fma_f16 v0, -v0, v1, v2 2118; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) 2119; GFX11-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v0 2120; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] 2121; 2122; GFX11-NSZ-LABEL: v_fneg_fma_fneg_x_y_f16: 2123; GFX11-NSZ: ; %bb.0: 2124; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2125; GFX11-NSZ-NEXT: v_fma_f16 v0, v0, v1, -v2 2126; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31] 2127 %fneg.a = fneg half %a 2128 %fma = call half @llvm.fma.f16(half %fneg.a, half %b, half %c) 2129 %fneg = fneg half %fma 2130 ret half %fneg 2131} 2132 2133define half @v_fneg_fma_x_fneg_y_f16(half %a, half %b, half %c) #0 { 2134; SI-SAFE-LABEL: v_fneg_fma_x_fneg_y_f16: 2135; SI-SAFE: ; %bb.0: 2136; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2137; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 2138; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 2139; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 2140; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 2141; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 2142; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 2143; SI-SAFE-NEXT: v_fma_f32 v0, v0, -v1, v2 2144; SI-SAFE-NEXT: v_xor_b32_e32 v0, 
0x80000000, v0 2145; SI-SAFE-NEXT: s_setpc_b64 s[30:31] 2146; 2147; SI-NSZ-LABEL: v_fneg_fma_x_fneg_y_f16: 2148; SI-NSZ: ; %bb.0: 2149; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2150; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2 2151; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 2152; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 2153; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 2154; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 2155; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 2156; SI-NSZ-NEXT: v_fma_f32 v0, v0, v1, -v2 2157; SI-NSZ-NEXT: s_setpc_b64 s[30:31] 2158; 2159; VI-SAFE-LABEL: v_fneg_fma_x_fneg_y_f16: 2160; VI-SAFE: ; %bb.0: 2161; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2162; VI-SAFE-NEXT: v_fma_f16 v0, v0, -v1, v2 2163; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v0 2164; VI-SAFE-NEXT: s_setpc_b64 s[30:31] 2165; 2166; VI-NSZ-LABEL: v_fneg_fma_x_fneg_y_f16: 2167; VI-NSZ: ; %bb.0: 2168; VI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2169; VI-NSZ-NEXT: v_fma_f16 v0, v0, v1, -v2 2170; VI-NSZ-NEXT: s_setpc_b64 s[30:31] 2171; 2172; GFX11-SAFE-LABEL: v_fneg_fma_x_fneg_y_f16: 2173; GFX11-SAFE: ; %bb.0: 2174; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2175; GFX11-SAFE-NEXT: v_fma_f16 v0, v0, -v1, v2 2176; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) 2177; GFX11-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v0 2178; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] 2179; 2180; GFX11-NSZ-LABEL: v_fneg_fma_x_fneg_y_f16: 2181; GFX11-NSZ: ; %bb.0: 2182; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2183; GFX11-NSZ-NEXT: v_fma_f16 v0, v0, v1, -v2 2184; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31] 2185 %fneg.b = fneg half %b 2186 %fma = call half @llvm.fma.f16(half %a, half %fneg.b, half %c) 2187 %fneg = fneg half %fma 2188 ret half %fneg 2189} 2190 2191define half @v_fneg_fma_fneg_fneg_y_f16(half %a, half %b, half %c) #0 { 2192; SI-SAFE-LABEL: v_fneg_fma_fneg_fneg_y_f16: 2193; SI-SAFE: ; %bb.0: 2194; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2195; SI-SAFE-NEXT: 
v_cvt_f16_f32_e32 v2, v2 2196; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 2197; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 2198; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 2199; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 2200; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 2201; SI-SAFE-NEXT: v_fma_f32 v0, v0, v1, v2 2202; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 2203; SI-SAFE-NEXT: s_setpc_b64 s[30:31] 2204; 2205; SI-NSZ-LABEL: v_fneg_fma_fneg_fneg_y_f16: 2206; SI-NSZ: ; %bb.0: 2207; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2208; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2 2209; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 2210; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 2211; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 2212; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 2213; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 2214; SI-NSZ-NEXT: v_fma_f32 v0, v0, -v1, -v2 2215; SI-NSZ-NEXT: s_setpc_b64 s[30:31] 2216; 2217; VI-SAFE-LABEL: v_fneg_fma_fneg_fneg_y_f16: 2218; VI-SAFE: ; %bb.0: 2219; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2220; VI-SAFE-NEXT: v_fma_f16 v0, v0, v1, v2 2221; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v0 2222; VI-SAFE-NEXT: s_setpc_b64 s[30:31] 2223; 2224; VI-NSZ-LABEL: v_fneg_fma_fneg_fneg_y_f16: 2225; VI-NSZ: ; %bb.0: 2226; VI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2227; VI-NSZ-NEXT: v_fma_f16 v0, v0, -v1, -v2 2228; VI-NSZ-NEXT: s_setpc_b64 s[30:31] 2229; 2230; GFX11-SAFE-LABEL: v_fneg_fma_fneg_fneg_y_f16: 2231; GFX11-SAFE: ; %bb.0: 2232; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2233; GFX11-SAFE-NEXT: v_fmac_f16_e32 v2, v0, v1 2234; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) 2235; GFX11-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v2 2236; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] 2237; 2238; GFX11-NSZ-LABEL: v_fneg_fma_fneg_fneg_y_f16: 2239; GFX11-NSZ: ; %bb.0: 2240; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2241; GFX11-NSZ-NEXT: v_fma_f16 v0, v0, -v1, -v2 2242; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31] 2243 %fneg.a = fneg half %a 2244 %fneg.b = 
fneg half %b 2245 %fma = call half @llvm.fma.f16(half %fneg.a, half %fneg.b, half %c) 2246 %fneg = fneg half %fma 2247 ret half %fneg 2248} 2249 2250define half @v_fneg_fma_fneg_x_fneg_f16(half %a, half %b, half %c) #0 { 2251; SI-SAFE-LABEL: v_fneg_fma_fneg_x_fneg_f16: 2252; SI-SAFE: ; %bb.0: 2253; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2254; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 2255; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 2256; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 2257; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 2258; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 2259; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 2260; SI-SAFE-NEXT: v_fma_f32 v0, -v0, v1, -v2 2261; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 2262; SI-SAFE-NEXT: s_setpc_b64 s[30:31] 2263; 2264; SI-NSZ-LABEL: v_fneg_fma_fneg_x_fneg_f16: 2265; SI-NSZ: ; %bb.0: 2266; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2267; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2 2268; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 2269; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 2270; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 2271; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 2272; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 2273; SI-NSZ-NEXT: v_fma_f32 v0, v0, v1, v2 2274; SI-NSZ-NEXT: s_setpc_b64 s[30:31] 2275; 2276; VI-SAFE-LABEL: v_fneg_fma_fneg_x_fneg_f16: 2277; VI-SAFE: ; %bb.0: 2278; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2279; VI-SAFE-NEXT: v_fma_f16 v0, -v0, v1, -v2 2280; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v0 2281; VI-SAFE-NEXT: s_setpc_b64 s[30:31] 2282; 2283; VI-NSZ-LABEL: v_fneg_fma_fneg_x_fneg_f16: 2284; VI-NSZ: ; %bb.0: 2285; VI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2286; VI-NSZ-NEXT: v_fma_f16 v0, v0, v1, v2 2287; VI-NSZ-NEXT: s_setpc_b64 s[30:31] 2288; 2289; GFX11-SAFE-LABEL: v_fneg_fma_fneg_x_fneg_f16: 2290; GFX11-SAFE: ; %bb.0: 2291; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2292; GFX11-SAFE-NEXT: v_fma_f16 v0, -v0, v1, -v2 2293; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) 
2294; GFX11-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v0 2295; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] 2296; 2297; GFX11-NSZ-LABEL: v_fneg_fma_fneg_x_fneg_f16: 2298; GFX11-NSZ: ; %bb.0: 2299; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2300; GFX11-NSZ-NEXT: v_fma_f16 v0, v0, v1, v2 2301; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31] 2302 %fneg.a = fneg half %a 2303 %fneg.c = fneg half %c 2304 %fma = call half @llvm.fma.f16(half %fneg.a, half %b, half %fneg.c) 2305 %fneg = fneg half %fma 2306 ret half %fneg 2307} 2308 2309define half @v_fneg_fma_x_y_fneg_f16(half %a, half %b, half %c) #0 { 2310; SI-SAFE-LABEL: v_fneg_fma_x_y_fneg_f16: 2311; SI-SAFE: ; %bb.0: 2312; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2313; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 2314; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 2315; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 2316; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 2317; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 2318; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 2319; SI-SAFE-NEXT: v_fma_f32 v0, v0, v1, -v2 2320; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 2321; SI-SAFE-NEXT: s_setpc_b64 s[30:31] 2322; 2323; SI-NSZ-LABEL: v_fneg_fma_x_y_fneg_f16: 2324; SI-NSZ: ; %bb.0: 2325; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2326; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2 2327; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 2328; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 2329; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 2330; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 2331; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 2332; SI-NSZ-NEXT: v_fma_f32 v0, v0, -v1, v2 2333; SI-NSZ-NEXT: s_setpc_b64 s[30:31] 2334; 2335; VI-SAFE-LABEL: v_fneg_fma_x_y_fneg_f16: 2336; VI-SAFE: ; %bb.0: 2337; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2338; VI-SAFE-NEXT: v_fma_f16 v0, v0, v1, -v2 2339; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v0 2340; VI-SAFE-NEXT: s_setpc_b64 s[30:31] 2341; 2342; VI-NSZ-LABEL: v_fneg_fma_x_y_fneg_f16: 2343; VI-NSZ: ; %bb.0: 2344; VI-NSZ-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) 2345; VI-NSZ-NEXT: v_fma_f16 v0, v0, -v1, v2 2346; VI-NSZ-NEXT: s_setpc_b64 s[30:31] 2347; 2348; GFX11-SAFE-LABEL: v_fneg_fma_x_y_fneg_f16: 2349; GFX11-SAFE: ; %bb.0: 2350; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2351; GFX11-SAFE-NEXT: v_fma_f16 v0, v0, v1, -v2 2352; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) 2353; GFX11-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v0 2354; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] 2355; 2356; GFX11-NSZ-LABEL: v_fneg_fma_x_y_fneg_f16: 2357; GFX11-NSZ: ; %bb.0: 2358; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2359; GFX11-NSZ-NEXT: v_fma_f16 v0, v0, -v1, v2 2360; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31] 2361 %fneg.c = fneg half %c 2362 %fma = call half @llvm.fma.f16(half %a, half %b, half %fneg.c) 2363 %fneg = fneg half %fma 2364 ret half %fneg 2365} 2366 2367define { half, half } @v_fneg_fma_store_use_fneg_x_y_f16(half %a, half %b, half %c) #0 { 2368; SI-SAFE-LABEL: v_fneg_fma_store_use_fneg_x_y_f16: 2369; SI-SAFE: ; %bb.0: 2370; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2371; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 2372; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 2373; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v3, -v0 2374; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 2375; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v4, v1 2376; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v3 2377; SI-SAFE-NEXT: v_xor_b32_e32 v1, 0x80000000, v0 2378; SI-SAFE-NEXT: v_fma_f32 v0, v3, v4, v2 2379; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 2380; SI-SAFE-NEXT: s_setpc_b64 s[30:31] 2381; 2382; SI-NSZ-LABEL: v_fneg_fma_store_use_fneg_x_y_f16: 2383; SI-NSZ: ; %bb.0: 2384; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2385; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2 2386; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 2387; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 2388; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 2389; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v3, v1 2390; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v4, v0 2391; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v1, -v0 2392; 
SI-NSZ-NEXT: v_fma_f32 v0, v4, v3, -v2 2393; SI-NSZ-NEXT: s_setpc_b64 s[30:31] 2394; 2395; VI-SAFE-LABEL: v_fneg_fma_store_use_fneg_x_y_f16: 2396; VI-SAFE: ; %bb.0: 2397; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2398; VI-SAFE-NEXT: v_xor_b32_e32 v3, 0x8000, v0 2399; VI-SAFE-NEXT: v_fma_f16 v0, -v0, v1, v2 2400; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v0 2401; VI-SAFE-NEXT: v_mov_b32_e32 v1, v3 2402; VI-SAFE-NEXT: s_setpc_b64 s[30:31] 2403; 2404; VI-NSZ-LABEL: v_fneg_fma_store_use_fneg_x_y_f16: 2405; VI-NSZ: ; %bb.0: 2406; VI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2407; VI-NSZ-NEXT: v_xor_b32_e32 v3, 0x8000, v0 2408; VI-NSZ-NEXT: v_fma_f16 v0, v0, v1, -v2 2409; VI-NSZ-NEXT: v_mov_b32_e32 v1, v3 2410; VI-NSZ-NEXT: s_setpc_b64 s[30:31] 2411; 2412; GFX11-SAFE-LABEL: v_fneg_fma_store_use_fneg_x_y_f16: 2413; GFX11-SAFE: ; %bb.0: 2414; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2415; GFX11-SAFE-NEXT: v_fma_f16 v1, -v0, v1, v2 2416; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 2417; GFX11-SAFE-NEXT: v_xor_b32_e32 v2, 0x8000, v1 2418; GFX11-SAFE-NEXT: v_xor_b32_e32 v1, 0x8000, v0 2419; GFX11-SAFE-NEXT: v_mov_b32_e32 v0, v2 2420; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] 2421; 2422; GFX11-NSZ-LABEL: v_fneg_fma_store_use_fneg_x_y_f16: 2423; GFX11-NSZ: ; %bb.0: 2424; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2425; GFX11-NSZ-NEXT: v_fma_f16 v2, v0, v1, -v2 2426; GFX11-NSZ-NEXT: v_xor_b32_e32 v1, 0x8000, v0 2427; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_2) 2428; GFX11-NSZ-NEXT: v_mov_b32_e32 v0, v2 2429; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31] 2430 %fneg.a = fneg half %a 2431 %fma = call half @llvm.fma.f16(half %fneg.a, half %b, half %c) 2432 %fneg = fneg half %fma 2433 %insert.0 = insertvalue { half, half } poison, half %fneg, 0 2434 %insert.1 = insertvalue { half, half } %insert.0, half %fneg.a, 1 2435 ret { half, half } %insert.1 2436} 2437 2438define { half, half } 
@v_fneg_fma_multi_use_fneg_x_y_f16(half %a, half %b, half %c, half %d) #0 { 2439; SI-SAFE-LABEL: v_fneg_fma_multi_use_fneg_x_y_f16: 2440; SI-SAFE: ; %bb.0: 2441; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2442; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 2443; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 2444; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0 2445; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v3 2446; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 2447; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 2448; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v4, v0 2449; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v3 2450; SI-SAFE-NEXT: v_fma_f32 v0, v4, v1, v2 2451; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 2452; SI-SAFE-NEXT: v_mul_f32_e32 v1, v4, v3 2453; SI-SAFE-NEXT: s_setpc_b64 s[30:31] 2454; 2455; SI-NSZ-LABEL: v_fneg_fma_multi_use_fneg_x_y_f16: 2456; SI-NSZ: ; %bb.0: 2457; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2458; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v3, v3 2459; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2 2460; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 2461; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 2462; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v3, v3 2463; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 2464; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 2465; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v4, v0 2466; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v5, -v0 2467; SI-NSZ-NEXT: v_fma_f32 v0, v4, v1, -v2 2468; SI-NSZ-NEXT: v_mul_f32_e32 v1, v5, v3 2469; SI-NSZ-NEXT: s_setpc_b64 s[30:31] 2470; 2471; VI-SAFE-LABEL: v_fneg_fma_multi_use_fneg_x_y_f16: 2472; VI-SAFE: ; %bb.0: 2473; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2474; VI-SAFE-NEXT: v_fma_f16 v1, -v0, v1, v2 2475; VI-SAFE-NEXT: v_xor_b32_e32 v2, 0x8000, v1 2476; VI-SAFE-NEXT: v_mul_f16_e64 v1, -v0, v3 2477; VI-SAFE-NEXT: v_mov_b32_e32 v0, v2 2478; VI-SAFE-NEXT: s_setpc_b64 s[30:31] 2479; 2480; VI-NSZ-LABEL: v_fneg_fma_multi_use_fneg_x_y_f16: 2481; VI-NSZ: ; %bb.0: 2482; VI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2483; VI-NSZ-NEXT: v_fma_f16 v2, v0, v1, -v2 2484; 
VI-NSZ-NEXT: v_mul_f16_e64 v1, -v0, v3 2485; VI-NSZ-NEXT: v_mov_b32_e32 v0, v2 2486; VI-NSZ-NEXT: s_setpc_b64 s[30:31] 2487; 2488; GFX11-SAFE-LABEL: v_fneg_fma_multi_use_fneg_x_y_f16: 2489; GFX11-SAFE: ; %bb.0: 2490; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2491; GFX11-SAFE-NEXT: v_fma_f16 v1, -v0, v1, v2 2492; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 2493; GFX11-SAFE-NEXT: v_xor_b32_e32 v2, 0x8000, v1 2494; GFX11-SAFE-NEXT: v_mul_f16_e64 v1, -v0, v3 2495; GFX11-SAFE-NEXT: v_mov_b32_e32 v0, v2 2496; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] 2497; 2498; GFX11-NSZ-LABEL: v_fneg_fma_multi_use_fneg_x_y_f16: 2499; GFX11-NSZ: ; %bb.0: 2500; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2501; GFX11-NSZ-NEXT: v_fma_f16 v2, v0, v1, -v2 2502; GFX11-NSZ-NEXT: v_mul_f16_e64 v1, -v0, v3 2503; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_2) 2504; GFX11-NSZ-NEXT: v_mov_b32_e32 v0, v2 2505; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31] 2506 %fneg.a = fneg half %a 2507 %fma = call half @llvm.fma.f16(half %fneg.a, half %b, half %c) 2508 %fneg = fneg half %fma 2509 %use1 = fmul half %fneg.a, %d 2510 %insert.0 = insertvalue { half, half } poison, half %fneg, 0 2511 %insert.1 = insertvalue { half, half } %insert.0, half %use1, 1 2512 ret { half, half } %insert.1 2513} 2514 2515; -------------------------------------------------------------------------------- 2516; fmad tests 2517; -------------------------------------------------------------------------------- 2518 2519define half @v_fneg_fmad_f16(half %a, half %b, half %c) #0 { 2520; SI-SAFE-LABEL: v_fneg_fmad_f16: 2521; SI-SAFE: ; %bb.0: 2522; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2523; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 2524; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 2525; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 2526; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 2527; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 2528; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 2529; 
SI-SAFE-NEXT: v_mac_f32_e32 v2, v0, v1 2530; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v2 2531; SI-SAFE-NEXT: s_setpc_b64 s[30:31] 2532; 2533; SI-NSZ-LABEL: v_fneg_fmad_f16: 2534; SI-NSZ: ; %bb.0: 2535; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2536; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2 2537; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 2538; SI-NSZ-NEXT: v_cvt_f16_f32_e64 v1, -v1 2539; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 2540; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 2541; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 2542; SI-NSZ-NEXT: v_mad_f32 v0, v0, v1, -v2 2543; SI-NSZ-NEXT: s_setpc_b64 s[30:31] 2544; 2545; VI-SAFE-LABEL: v_fneg_fmad_f16: 2546; VI-SAFE: ; %bb.0: 2547; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2548; VI-SAFE-NEXT: v_fma_f16 v0, v0, v1, v2 2549; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v0 2550; VI-SAFE-NEXT: s_setpc_b64 s[30:31] 2551; 2552; VI-NSZ-LABEL: v_fneg_fmad_f16: 2553; VI-NSZ: ; %bb.0: 2554; VI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2555; VI-NSZ-NEXT: v_fma_f16 v0, v0, -v1, -v2 2556; VI-NSZ-NEXT: s_setpc_b64 s[30:31] 2557; 2558; GFX11-SAFE-LABEL: v_fneg_fmad_f16: 2559; GFX11-SAFE: ; %bb.0: 2560; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2561; GFX11-SAFE-NEXT: v_fmac_f16_e32 v2, v0, v1 2562; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) 2563; GFX11-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v2 2564; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] 2565; 2566; GFX11-NSZ-LABEL: v_fneg_fmad_f16: 2567; GFX11-NSZ: ; %bb.0: 2568; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2569; GFX11-NSZ-NEXT: v_fma_f16 v0, v0, -v1, -v2 2570; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31] 2571 %fma = call half @llvm.fmuladd.f16(half %a, half %b, half %c) 2572 %fneg = fneg half %fma 2573 ret half %fneg 2574} 2575
; Vector variant of the fmad test: negates the result of @llvm.fmuladd.v4f16
; applied to three <4 x half> operands and returns the negated vector.
; NOTE(review): the function name says "v4f32" although every operand and the
; result are <4 x half>. The name is kept because the autogenerated label
; checks below are keyed on it; rename only together with a regeneration of
; the assertions via utils/update_llc_test_checks.py.
; Expected codegen per the checks below:
;  - the SAFE runs keep the fneg as trailing v_xor_b32 instructions on the sign
;    bits (0x80000000 per f32 lane on hawaii, 0x80008000 per packed f16 pair on
;    fiji and gfx1100);
;  - the NSZ runs (llc -enable-no-signed-zeros-fp-math) emit no xor and instead
;    negate the second multiplicand and the addend through source modifiers
;    (on hawaii the multiplicand is negated on the f16->f32 conversion feeding
;    v_mad_f32; gfx1100 uses neg_lo/neg_hi on v_pk_fma_f16).
2576define <4 x half> @v_fneg_fmad_v4f32(<4 x half> %a, <4 x half> %b, <4 x half> %c) #0 { 2577; SI-SAFE-LABEL: v_fneg_fmad_v4f32: 2578; SI-SAFE: ; %bb.0: 2579; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2580; 
SI-SAFE-NEXT: v_cvt_f16_f32_e32 v8, v8 2581; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v4, v4 2582; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v9, v9 2583; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 2584; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v5, v5 2585; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v10, v10 2586; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v6, v6 2587; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 2588; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v11, v11 2589; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v7, v7 2590; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v3 2591; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 2592; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v8, v8 2593; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v4, v4 2594; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v9, v9 2595; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v5, v5 2596; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v10, v10 2597; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v6, v6 2598; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v11, v11 2599; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v7, v7 2600; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v3 2601; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 2602; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 2603; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 2604; SI-SAFE-NEXT: v_mac_f32_e32 v11, v3, v7 2605; SI-SAFE-NEXT: v_mac_f32_e32 v10, v2, v6 2606; SI-SAFE-NEXT: v_mac_f32_e32 v9, v1, v5 2607; SI-SAFE-NEXT: v_mac_f32_e32 v8, v0, v4 2608; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v8 2609; SI-SAFE-NEXT: v_xor_b32_e32 v1, 0x80000000, v9 2610; SI-SAFE-NEXT: v_xor_b32_e32 v2, 0x80000000, v10 2611; SI-SAFE-NEXT: v_xor_b32_e32 v3, 0x80000000, v11 2612; SI-SAFE-NEXT: s_setpc_b64 s[30:31] 2613; 2614; SI-NSZ-LABEL: v_fneg_fmad_v4f32: 2615; SI-NSZ: ; %bb.0: 2616; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2617; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v11, v11 2618; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v3, v3 2619; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v10, v10 2620; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v7, v7 2621; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2 2622; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v9, v9 2623; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 2624; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v6, 
v6 2625; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v8, v8 2626; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 2627; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v4, v4 2628; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v5, v5 2629; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v11, v11 2630; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v3, v3 2631; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v10, v10 2632; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 2633; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v9, v9 2634; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 2635; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v8, v8 2636; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 2637; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v4, -v4 2638; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v5, -v5 2639; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v6, -v6 2640; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v7, -v7 2641; SI-NSZ-NEXT: v_mad_f32 v0, v0, v4, -v8 2642; SI-NSZ-NEXT: v_mad_f32 v1, v1, v5, -v9 2643; SI-NSZ-NEXT: v_mad_f32 v2, v2, v6, -v10 2644; SI-NSZ-NEXT: v_mad_f32 v3, v3, v7, -v11 2645; SI-NSZ-NEXT: s_setpc_b64 s[30:31] 2646; 2647; VI-SAFE-LABEL: v_fneg_fmad_v4f32: 2648; VI-SAFE: ; %bb.0: 2649; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2650; VI-SAFE-NEXT: v_lshrrev_b32_e32 v6, 16, v5 2651; VI-SAFE-NEXT: v_lshrrev_b32_e32 v7, 16, v3 2652; VI-SAFE-NEXT: v_lshrrev_b32_e32 v8, 16, v1 2653; VI-SAFE-NEXT: v_fma_f16 v6, v8, v7, v6 2654; VI-SAFE-NEXT: v_lshrrev_b32_e32 v7, 16, v4 2655; VI-SAFE-NEXT: v_lshrrev_b32_e32 v8, 16, v2 2656; VI-SAFE-NEXT: v_lshrrev_b32_e32 v9, 16, v0 2657; VI-SAFE-NEXT: v_fma_f16 v7, v9, v8, v7 2658; VI-SAFE-NEXT: v_fma_f16 v0, v0, v2, v4 2659; VI-SAFE-NEXT: v_lshlrev_b32_e32 v2, 16, v7 2660; VI-SAFE-NEXT: v_fma_f16 v1, v1, v3, v5 2661; VI-SAFE-NEXT: v_or_b32_e32 v0, v0, v2 2662; VI-SAFE-NEXT: v_lshlrev_b32_e32 v2, 16, v6 2663; VI-SAFE-NEXT: v_or_b32_e32 v1, v1, v2 2664; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 2665; VI-SAFE-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 2666; VI-SAFE-NEXT: s_setpc_b64 s[30:31] 2667; 2668; VI-NSZ-LABEL: v_fneg_fmad_v4f32: 2669; VI-NSZ: ; %bb.0: 2670; VI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) 2671; VI-NSZ-NEXT: v_lshrrev_b32_e32 v6, 16, v5 2672; VI-NSZ-NEXT: v_lshrrev_b32_e32 v7, 16, v3 2673; VI-NSZ-NEXT: v_lshrrev_b32_e32 v8, 16, v1 2674; VI-NSZ-NEXT: v_fma_f16 v6, v8, -v7, -v6 2675; VI-NSZ-NEXT: v_lshrrev_b32_e32 v7, 16, v4 2676; VI-NSZ-NEXT: v_lshrrev_b32_e32 v8, 16, v2 2677; VI-NSZ-NEXT: v_lshrrev_b32_e32 v9, 16, v0 2678; VI-NSZ-NEXT: v_fma_f16 v7, v9, -v8, -v7 2679; VI-NSZ-NEXT: v_fma_f16 v0, v0, -v2, -v4 2680; VI-NSZ-NEXT: v_lshlrev_b32_e32 v2, 16, v7 2681; VI-NSZ-NEXT: v_fma_f16 v1, v1, -v3, -v5 2682; VI-NSZ-NEXT: v_or_b32_e32 v0, v0, v2 2683; VI-NSZ-NEXT: v_lshlrev_b32_e32 v2, 16, v6 2684; VI-NSZ-NEXT: v_or_b32_e32 v1, v1, v2 2685; VI-NSZ-NEXT: s_setpc_b64 s[30:31] 2686; 2687; GFX11-SAFE-LABEL: v_fneg_fmad_v4f32: 2688; GFX11-SAFE: ; %bb.0: 2689; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2690; GFX11-SAFE-NEXT: v_pk_fma_f16 v0, v0, v2, v4 2691; GFX11-SAFE-NEXT: v_pk_fma_f16 v1, v1, v3, v5 2692; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 2693; GFX11-SAFE-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 2694; GFX11-SAFE-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 2695; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] 2696; 2697; GFX11-NSZ-LABEL: v_fneg_fmad_v4f32: 2698; GFX11-NSZ: ; %bb.0: 2699; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2700; GFX11-NSZ-NEXT: v_pk_fma_f16 v0, v0, v2, v4 neg_lo:[0,1,1] neg_hi:[0,1,1] 2701; GFX11-NSZ-NEXT: v_pk_fma_f16 v1, v1, v3, v5 neg_lo:[0,1,1] neg_hi:[0,1,1] 2702; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31] 2703 %fma = call <4 x half> @llvm.fmuladd.v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c) 2704 %fneg = fneg <4 x half> %fma 2705 ret <4 x half> %fneg 2706} 2707 2708define { half, half } @v_fneg_fmad_multi_use_fmad_f16(half %a, half %b, half %c) #0 { 2709; SI-SAFE-LABEL: v_fneg_fmad_multi_use_fmad_f16: 2710; SI-SAFE: ; %bb.0: 2711; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2712; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 2713; SI-SAFE-NEXT: 
v_cvt_f16_f32_e32 v1, v1 2714; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 2715; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 2716; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 2717; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 2718; SI-SAFE-NEXT: v_mac_f32_e32 v2, v0, v1 2719; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v2 2720; SI-SAFE-NEXT: v_mul_f32_e32 v1, 4.0, v2 2721; SI-SAFE-NEXT: s_setpc_b64 s[30:31] 2722; 2723; SI-NSZ-LABEL: v_fneg_fmad_multi_use_fmad_f16: 2724; SI-NSZ: ; %bb.0: 2725; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2726; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2 2727; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 2728; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 2729; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 2730; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 2731; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v1, -v1 2732; SI-NSZ-NEXT: v_mad_f32 v0, v0, v1, -v2 2733; SI-NSZ-NEXT: v_mul_f32_e32 v1, -4.0, v0 2734; SI-NSZ-NEXT: s_setpc_b64 s[30:31] 2735; 2736; VI-SAFE-LABEL: v_fneg_fmad_multi_use_fmad_f16: 2737; VI-SAFE: ; %bb.0: 2738; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2739; VI-SAFE-NEXT: v_fma_f16 v1, v0, v1, v2 2740; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v1 2741; VI-SAFE-NEXT: v_mul_f16_e32 v1, 4.0, v1 2742; VI-SAFE-NEXT: s_setpc_b64 s[30:31] 2743; 2744; VI-NSZ-LABEL: v_fneg_fmad_multi_use_fmad_f16: 2745; VI-NSZ: ; %bb.0: 2746; VI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2747; VI-NSZ-NEXT: v_fma_f16 v0, v0, -v1, -v2 2748; VI-NSZ-NEXT: v_mul_f16_e32 v1, -4.0, v0 2749; VI-NSZ-NEXT: s_setpc_b64 s[30:31] 2750; 2751; GFX11-SAFE-LABEL: v_fneg_fmad_multi_use_fmad_f16: 2752; GFX11-SAFE: ; %bb.0: 2753; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2754; GFX11-SAFE-NEXT: v_fmac_f16_e32 v2, v0, v1 2755; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) 2756; GFX11-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v2 2757; GFX11-SAFE-NEXT: v_mul_f16_e32 v1, 4.0, v2 2758; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] 2759; 2760; GFX11-NSZ-LABEL: v_fneg_fmad_multi_use_fmad_f16: 2761; 
GFX11-NSZ: ; %bb.0: 2762; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2763; GFX11-NSZ-NEXT: v_fma_f16 v0, v0, -v1, -v2 2764; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) 2765; GFX11-NSZ-NEXT: v_mul_f16_e32 v1, -4.0, v0 2766; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31] 2767 %fma = call half @llvm.fmuladd.f16(half %a, half %b, half %c) 2768 %fneg = fneg half %fma 2769 %use1 = fmul half %fma, 4.0 2770 %insert.0 = insertvalue { half, half } poison, half %fneg, 0 2771 %insert.1 = insertvalue { half, half } %insert.0, half %use1, 1 2772 ret { half, half } %insert.1 2773} 2774 2775; -------------------------------------------------------------------------------- 2776; fp_extend tests 2777; -------------------------------------------------------------------------------- 2778
; Basic fp_extend case: returns fneg(fpext half %a to double).
; Expected codegen per the checks below: hawaii folds the negation into the
; source modifier of v_cvt_f64_f32 (the "-v0" operand), whereas fiji and
; gfx1100 flip the f16 sign bit first (v_xor_b32 with 0x8000) and then convert
; in two steps, f16 -> f32 -> f64.
; The same checks are shared by the SAFE and NSZ run lines, so the expected
; output does not depend on -enable-no-signed-zeros-fp-math.
2779define double @v_fneg_fp_extend_f16_to_f64(half %a) #0 { 2780; SI-LABEL: v_fneg_fp_extend_f16_to_f64: 2781; SI: ; %bb.0: 2782; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2783; SI-NEXT: v_cvt_f64_f32_e64 v[0:1], -v0 2784; SI-NEXT: s_setpc_b64 s[30:31] 2785; 2786; VI-LABEL: v_fneg_fp_extend_f16_to_f64: 2787; VI: ; %bb.0: 2788; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2789; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v0 2790; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 2791; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 2792; VI-NEXT: s_setpc_b64 s[30:31] 2793; 2794; GFX11-LABEL: v_fneg_fp_extend_f16_to_f64: 2795; GFX11: ; %bb.0: 2796; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2797; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v0 2798; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2799; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 2800; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 2801; GFX11-NEXT: s_setpc_b64 s[30:31] 2802 %fpext = fpext half %a to double 2803 %fneg = fneg double %fpext 2804 ret double %fneg 2805} 2806 2807define double @v_fneg_fp_extend_fneg_f16_to_f64(half %a) #0 { 2808; SI-LABEL: v_fneg_fp_extend_fneg_f16_to_f64: 2809; SI: ; %bb.0: 2810; SI-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) 2811; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 2812; SI-NEXT: s_setpc_b64 s[30:31] 2813; 2814; VI-LABEL: v_fneg_fp_extend_fneg_f16_to_f64: 2815; VI: ; %bb.0: 2816; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2817; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 2818; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 2819; VI-NEXT: s_setpc_b64 s[30:31] 2820; 2821; GFX11-LABEL: v_fneg_fp_extend_fneg_f16_to_f64: 2822; GFX11: ; %bb.0: 2823; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2824; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 2825; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 2826; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 2827; GFX11-NEXT: s_setpc_b64 s[30:31] 2828 %fneg.a = fneg half %a 2829 %fpext = fpext half %fneg.a to double 2830 %fneg = fneg double %fpext 2831 ret double %fneg 2832} 2833 2834define { double, half } @v_fneg_fp_extend_store_use_fneg_f16_to_f64(half %a) #0 { 2835; SI-LABEL: v_fneg_fp_extend_store_use_fneg_f16_to_f64: 2836; SI: ; %bb.0: 2837; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2838; SI-NEXT: v_mov_b32_e32 v2, v0 2839; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 2840; SI-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 2841; SI-NEXT: s_setpc_b64 s[30:31] 2842; 2843; VI-LABEL: v_fneg_fp_extend_store_use_fneg_f16_to_f64: 2844; VI: ; %bb.0: 2845; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2846; VI-NEXT: v_mov_b32_e32 v2, v0 2847; VI-NEXT: v_cvt_f32_f16_e32 v0, v2 2848; VI-NEXT: v_xor_b32_e32 v2, 0x8000, v2 2849; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 2850; VI-NEXT: s_setpc_b64 s[30:31] 2851; 2852; GFX11-LABEL: v_fneg_fp_extend_store_use_fneg_f16_to_f64: 2853; GFX11: ; %bb.0: 2854; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2855; GFX11-NEXT: v_mov_b32_e32 v2, v0 2856; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 2857; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v2 2858; GFX11-NEXT: v_xor_b32_e32 v2, 0x8000, v2 2859; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 2860; GFX11-NEXT: s_setpc_b64 s[30:31] 2861 
%fneg.a = fneg half %a 2862 %fpext = fpext half %fneg.a to double 2863 %fneg = fneg double %fpext 2864 %insert.0 = insertvalue { double, half } poison, double %fneg, 0 2865 %insert.1 = insertvalue { double, half } %insert.0, half %fneg.a, 1 2866 ret { double, half } %insert.1 2867} 2868
; Multi-use fp_extend case: %fpext = fpext half %a to double is returned twice,
; once negated (struct element 0) and once unchanged (struct element 1).
; Expected codegen per the checks below, identical in shape on all three
; targets: a single conversion into v[2:3], then the negated copy is formed by
; v_xor_b32 of 0x80000000 on the high dword only (v3 -> v1) plus a v_mov of the
; low dword (v2 -> v0). Presumably the negation is not folded into the
; conversion because the un-negated result is also live -- confirm against the
; fneg combine if this expectation changes.
; NOTE(review): despite "fp_extend_fneg" in the name, the IR below does not
; negate %a before the fpext; only the extended value is negated.
2869define { double, double } @v_fneg_multi_use_fp_extend_fneg_f16_to_f64(half %a) #0 { 2870; SI-LABEL: v_fneg_multi_use_fp_extend_fneg_f16_to_f64: 2871; SI: ; %bb.0: 2872; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2873; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 2874; SI-NEXT: v_xor_b32_e32 v1, 0x80000000, v3 2875; SI-NEXT: v_mov_b32_e32 v0, v2 2876; SI-NEXT: s_setpc_b64 s[30:31] 2877; 2878; VI-LABEL: v_fneg_multi_use_fp_extend_fneg_f16_to_f64: 2879; VI: ; %bb.0: 2880; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2881; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 2882; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 2883; VI-NEXT: v_xor_b32_e32 v1, 0x80000000, v3 2884; VI-NEXT: v_mov_b32_e32 v0, v2 2885; VI-NEXT: s_setpc_b64 s[30:31] 2886; 2887; GFX11-LABEL: v_fneg_multi_use_fp_extend_fneg_f16_to_f64: 2888; GFX11: ; %bb.0: 2889; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2890; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 2891; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2892; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 2893; GFX11-NEXT: v_xor_b32_e32 v1, 0x80000000, v3 2894; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 2895; GFX11-NEXT: v_mov_b32_e32 v0, v2 2896; GFX11-NEXT: s_setpc_b64 s[30:31] 2897 %fpext = fpext half %a to double 2898 %fneg = fneg double %fpext 2899 %insert.0 = insertvalue { double, double } poison, double %fneg, 0 2900 %insert.1 = insertvalue { double, double } %insert.0, double %fpext, 1 2901 ret { double, double } %insert.1 2902} 2903 2904define { double, double } @v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f64(half %a) #0 { 2905; SI-LABEL: v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f64: 2906; SI: ; %bb.0: 2907; SI-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2908; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 2909; SI-NEXT: v_xor_b32_e32 v4, 0x80000000, v1 2910; SI-NEXT: v_mul_f64 v[2:3], v[0:1], 4.0 2911; SI-NEXT: v_mov_b32_e32 v1, v4 2912; SI-NEXT: s_setpc_b64 s[30:31] 2913; 2914; VI-LABEL: v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f64: 2915; VI: ; %bb.0: 2916; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2917; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 2918; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 2919; VI-NEXT: v_mul_f64 v[2:3], v[0:1], 4.0 2920; VI-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 2921; VI-NEXT: s_setpc_b64 s[30:31] 2922; 2923; GFX11-LABEL: v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f64: 2924; GFX11: ; %bb.0: 2925; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2926; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 2927; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2928; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 2929; GFX11-NEXT: v_mul_f64 v[2:3], v[0:1], 4.0 2930; GFX11-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 2931; GFX11-NEXT: s_setpc_b64 s[30:31] 2932 %fpext = fpext half %a to double 2933 %fneg = fneg double %fpext 2934 %mul = fmul double %fpext, 4.0 2935 %insert.0 = insertvalue { double, double } poison, double %fneg, 0 2936 %insert.1 = insertvalue { double, double } %insert.0, double %mul, 1 2937 ret { double, double } %insert.1 2938} 2939 2940define { float, float } @v_fneg_multi_use_fp_extend_fneg_f16_to_f32(half %a) #0 { 2941; SI-LABEL: v_fneg_multi_use_fp_extend_fneg_f16_to_f32: 2942; SI: ; %bb.0: 2943; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2944; SI-NEXT: v_mov_b32_e32 v1, v0 2945; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v1 2946; SI-NEXT: s_setpc_b64 s[30:31] 2947; 2948; VI-LABEL: v_fneg_multi_use_fp_extend_fneg_f16_to_f32: 2949; VI: ; %bb.0: 2950; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2951; VI-NEXT: v_cvt_f32_f16_e32 v1, v0 2952; VI-NEXT: v_xor_b32_e32 v0, 0x80000000, v1 2953; VI-NEXT: s_setpc_b64 s[30:31] 2954; 
2955; GFX11-LABEL: v_fneg_multi_use_fp_extend_fneg_f16_to_f32: 2956; GFX11: ; %bb.0: 2957; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2958; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v0 2959; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 2960; GFX11-NEXT: v_xor_b32_e32 v0, 0x80000000, v1 2961; GFX11-NEXT: s_setpc_b64 s[30:31] 2962 %fpext = fpext half %a to float 2963 %fneg = fneg float %fpext 2964 %insert.0 = insertvalue { float, float } poison, float %fneg, 0 2965 %insert.1 = insertvalue { float, float } %insert.0, float %fpext, 1 2966 ret { float, float } %insert.1 2967} 2968 2969; -------------------------------------------------------------------------------- 2970; fp_round tests 2971; -------------------------------------------------------------------------------- 2972 2973define half @v_fneg_fp_round_f64_to_f16(double %a) #0 { 2974; SI-LABEL: v_fneg_fp_round_f64_to_f16: 2975; SI: ; %bb.0: 2976; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2977; SI-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 2978; SI-NEXT: v_and_b32_e32 v2, 0x1ff, v1 2979; SI-NEXT: v_or_b32_e32 v0, v2, v0 2980; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 2981; SI-NEXT: v_lshrrev_b32_e32 v2, 8, v1 2982; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 2983; SI-NEXT: v_and_b32_e32 v2, 0xffe, v2 2984; SI-NEXT: v_bfe_u32 v3, v1, 20, 11 2985; SI-NEXT: s_movk_i32 s4, 0x3f1 2986; SI-NEXT: v_or_b32_e32 v0, v2, v0 2987; SI-NEXT: v_sub_i32_e32 v4, vcc, s4, v3 2988; SI-NEXT: v_or_b32_e32 v2, 0x1000, v0 2989; SI-NEXT: v_med3_i32 v4, v4, 0, 13 2990; SI-NEXT: v_lshrrev_b32_e32 v5, v4, v2 2991; SI-NEXT: v_lshlrev_b32_e32 v4, v4, v5 2992; SI-NEXT: v_cmp_ne_u32_e32 vcc, v4, v2 2993; SI-NEXT: s_movk_i32 s4, 0xfc10 2994; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc 2995; SI-NEXT: v_add_i32_e32 v3, vcc, s4, v3 2996; SI-NEXT: v_lshlrev_b32_e32 v4, 12, v3 2997; SI-NEXT: v_or_b32_e32 v2, v5, v2 2998; SI-NEXT: v_or_b32_e32 v4, v0, v4 2999; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 3000; SI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 3001; SI-NEXT: 
v_and_b32_e32 v4, 7, v2 3002; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4 3003; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], 3, v4 3004; SI-NEXT: v_lshrrev_b32_e32 v2, 2, v2 3005; SI-NEXT: s_or_b64 vcc, s[4:5], vcc 3006; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc 3007; SI-NEXT: v_mov_b32_e32 v4, 0x7c00 3008; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3 3009; SI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 3010; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 3011; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 3012; SI-NEXT: v_lshlrev_b32_e32 v0, 9, v0 3013; SI-NEXT: s_movk_i32 s4, 0x40f 3014; SI-NEXT: v_or_b32_e32 v0, 0x7c00, v0 3015; SI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3 3016; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 3017; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc 3018; SI-NEXT: v_and_b32_e32 v1, 0x8000, v1 3019; SI-NEXT: v_or_b32_e32 v0, v1, v0 3020; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 3021; SI-NEXT: s_setpc_b64 s[30:31] 3022; 3023; VI-LABEL: v_fneg_fp_round_f64_to_f16: 3024; VI: ; %bb.0: 3025; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3026; VI-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 3027; VI-NEXT: v_and_b32_e32 v2, 0x1ff, v1 3028; VI-NEXT: v_or_b32_e32 v0, v2, v0 3029; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 3030; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v1 3031; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 3032; VI-NEXT: v_and_b32_e32 v2, 0xffe, v2 3033; VI-NEXT: v_bfe_u32 v3, v1, 20, 11 3034; VI-NEXT: s_movk_i32 s4, 0x3f1 3035; VI-NEXT: v_or_b32_e32 v0, v2, v0 3036; VI-NEXT: v_sub_u32_e32 v4, vcc, s4, v3 3037; VI-NEXT: v_or_b32_e32 v2, 0x1000, v0 3038; VI-NEXT: v_med3_i32 v4, v4, 0, 13 3039; VI-NEXT: v_lshrrev_b32_e32 v5, v4, v2 3040; VI-NEXT: v_lshlrev_b32_e32 v4, v4, v5 3041; VI-NEXT: v_cmp_ne_u32_e32 vcc, v4, v2 3042; VI-NEXT: s_movk_i32 s4, 0xfc10 3043; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc 3044; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v3 3045; VI-NEXT: v_lshlrev_b32_e32 v4, 12, v3 3046; VI-NEXT: v_or_b32_e32 v2, v5, v2 3047; VI-NEXT: v_or_b32_e32 v4, v0, v4 3048; VI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 3049; 
VI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 3050; VI-NEXT: v_and_b32_e32 v4, 7, v2 3051; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4 3052; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 3, v4 3053; VI-NEXT: v_lshrrev_b32_e32 v2, 2, v2 3054; VI-NEXT: s_or_b64 vcc, s[4:5], vcc 3055; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc 3056; VI-NEXT: v_mov_b32_e32 v4, 0x7c00 3057; VI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3 3058; VI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 3059; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 3060; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 3061; VI-NEXT: v_lshlrev_b32_e32 v0, 9, v0 3062; VI-NEXT: s_movk_i32 s4, 0x40f 3063; VI-NEXT: v_or_b32_e32 v0, 0x7c00, v0 3064; VI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3 3065; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc 3066; VI-NEXT: v_mov_b32_e32 v2, 0x8000 3067; VI-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3068; VI-NEXT: v_or_b32_e32 v0, v1, v0 3069; VI-NEXT: s_setpc_b64 s[30:31] 3070; 3071; GFX11-LABEL: v_fneg_fp_round_f64_to_f16: 3072; GFX11: ; %bb.0: 3073; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3074; GFX11-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 3075; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) 3076; GFX11-NEXT: v_and_or_b32 v0, 0x1ff, v1, v0 3077; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v1 3078; GFX11-NEXT: v_bfe_u32 v3, v1, 20, 11 3079; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 3080; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 3081; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) 3082; GFX11-NEXT: v_sub_nc_u32_e32 v4, 0x3f1, v3 3083; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo 3084; GFX11-NEXT: v_and_or_b32 v0, 0xffe, v2, v0 3085; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 3086; GFX11-NEXT: v_med3_i32 v2, v4, 0, 13 3087; GFX11-NEXT: v_or_b32_e32 v4, 0x1000, v0 3088; GFX11-NEXT: v_cmp_ne_u32_e64 s1, 0, v0 3089; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_1) 3090; GFX11-NEXT: v_lshrrev_b32_e32 v5, v2, v4 3091; GFX11-NEXT: v_lshlrev_b32_e32 v2, v2, v5 3092; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) 3093; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v4 3094; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo 3095; GFX11-NEXT: v_or_b32_e32 v2, v5, v2 3096; GFX11-NEXT: v_add_nc_u32_e32 v3, 0xfffffc10, v3 3097; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 3098; GFX11-NEXT: v_lshl_or_b32 v4, v3, 12, v0 3099; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v3 3100; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1 3101; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo 3102; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 3103; GFX11-NEXT: v_lshl_or_b32 v0, v0, 9, 0x7c00 3104; GFX11-NEXT: v_and_b32_e32 v4, 7, v2 3105; GFX11-NEXT: v_lshrrev_b32_e32 v2, 2, v2 3106; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) 3107; GFX11-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v4 3108; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v4 3109; GFX11-NEXT: s_or_b32 vcc_lo, s0, vcc_lo 3110; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) 3111; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo 3112; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v3 3113; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v2, vcc_lo 3114; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v3 3115; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 3116; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo 3117; GFX11-NEXT: v_and_or_b32 v0, 0x8000, v1, v0 3118; GFX11-NEXT: s_setpc_b64 s[30:31] 3119 %fpround = fptrunc double %a to half 3120 %fneg = fneg half %fpround 3121 ret half %fneg 3122} 3123 3124define half @v_fneg_fp_round_fneg_f64_to_f16(double %a) #0 { 3125; SI-LABEL: v_fneg_fp_round_fneg_f64_to_f16: 3126; SI: ; %bb.0: 3127; SI-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) 3128; SI-NEXT: v_and_b32_e32 v2, 0x1ff, v1 3129; SI-NEXT: v_or_b32_e32 v0, v2, v0 3130; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 3131; SI-NEXT: v_lshrrev_b32_e32 v2, 8, v1 3132; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 3133; SI-NEXT: v_and_b32_e32 v2, 0xffe, v2 3134; SI-NEXT: v_bfe_u32 v3, v1, 20, 11 3135; SI-NEXT: v_or_b32_e32 v0, v2, v0 3136; SI-NEXT: v_sub_i32_e32 v4, vcc, 0x3f1, v3 3137; SI-NEXT: v_or_b32_e32 v2, 0x1000, v0 3138; SI-NEXT: v_med3_i32 v4, v4, 0, 13 3139; SI-NEXT: v_lshrrev_b32_e32 v5, v4, v2 3140; SI-NEXT: v_lshlrev_b32_e32 v4, v4, v5 3141; SI-NEXT: v_cmp_ne_u32_e32 vcc, v4, v2 3142; SI-NEXT: s_movk_i32 s4, 0xfc10 3143; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc 3144; SI-NEXT: v_add_i32_e32 v3, vcc, s4, v3 3145; SI-NEXT: v_lshlrev_b32_e32 v4, 12, v3 3146; SI-NEXT: v_or_b32_e32 v2, v5, v2 3147; SI-NEXT: v_or_b32_e32 v4, v0, v4 3148; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 3149; SI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 3150; SI-NEXT: v_and_b32_e32 v4, 7, v2 3151; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4 3152; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], 3, v4 3153; SI-NEXT: v_lshrrev_b32_e32 v2, 2, v2 3154; SI-NEXT: s_or_b64 vcc, s[4:5], vcc 3155; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc 3156; SI-NEXT: v_mov_b32_e32 v4, 0x7c00 3157; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3 3158; SI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 3159; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 3160; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 3161; SI-NEXT: v_lshlrev_b32_e32 v0, 9, v0 3162; SI-NEXT: s_movk_i32 s4, 0x40f 3163; SI-NEXT: v_or_b32_e32 v0, 0x7c00, v0 3164; SI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3 3165; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 3166; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc 3167; SI-NEXT: v_and_b32_e32 v1, 0x8000, v1 3168; SI-NEXT: v_or_b32_e32 v0, v1, v0 3169; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 3170; SI-NEXT: s_setpc_b64 s[30:31] 3171; 3172; VI-LABEL: v_fneg_fp_round_fneg_f64_to_f16: 3173; VI: ; %bb.0: 3174; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
3175; VI-NEXT: v_and_b32_e32 v2, 0x1ff, v1 3176; VI-NEXT: v_or_b32_e32 v0, v2, v0 3177; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 3178; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v1 3179; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 3180; VI-NEXT: v_and_b32_e32 v2, 0xffe, v2 3181; VI-NEXT: v_bfe_u32 v3, v1, 20, 11 3182; VI-NEXT: v_or_b32_e32 v0, v2, v0 3183; VI-NEXT: v_sub_u32_e32 v4, vcc, 0x3f1, v3 3184; VI-NEXT: v_or_b32_e32 v2, 0x1000, v0 3185; VI-NEXT: v_med3_i32 v4, v4, 0, 13 3186; VI-NEXT: v_lshrrev_b32_e32 v5, v4, v2 3187; VI-NEXT: v_lshlrev_b32_e32 v4, v4, v5 3188; VI-NEXT: v_cmp_ne_u32_e32 vcc, v4, v2 3189; VI-NEXT: s_movk_i32 s4, 0xfc10 3190; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc 3191; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v3 3192; VI-NEXT: v_lshlrev_b32_e32 v4, 12, v3 3193; VI-NEXT: v_or_b32_e32 v2, v5, v2 3194; VI-NEXT: v_or_b32_e32 v4, v0, v4 3195; VI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 3196; VI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 3197; VI-NEXT: v_and_b32_e32 v4, 7, v2 3198; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4 3199; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 3, v4 3200; VI-NEXT: v_lshrrev_b32_e32 v2, 2, v2 3201; VI-NEXT: s_or_b64 vcc, s[4:5], vcc 3202; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc 3203; VI-NEXT: v_mov_b32_e32 v4, 0x7c00 3204; VI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3 3205; VI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 3206; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 3207; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 3208; VI-NEXT: v_lshlrev_b32_e32 v0, 9, v0 3209; VI-NEXT: s_movk_i32 s4, 0x40f 3210; VI-NEXT: v_or_b32_e32 v0, 0x7c00, v0 3211; VI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3 3212; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc 3213; VI-NEXT: v_mov_b32_e32 v2, 0x8000 3214; VI-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3215; VI-NEXT: v_or_b32_e32 v0, v1, v0 3216; VI-NEXT: s_setpc_b64 s[30:31] 3217; 3218; GFX11-LABEL: v_fneg_fp_round_fneg_f64_to_f16: 3219; GFX11: ; %bb.0: 3220; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) 3221; GFX11-NEXT: v_and_or_b32 v0, 0x1ff, v1, v0 3222; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v1 3223; GFX11-NEXT: v_bfe_u32 v3, v1, 20, 11 3224; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 3225; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) 3226; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 3227; GFX11-NEXT: v_sub_nc_u32_e32 v4, 0x3f1, v3 3228; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo 3229; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) 3230; GFX11-NEXT: v_and_or_b32 v0, 0xffe, v2, v0 3231; GFX11-NEXT: v_med3_i32 v2, v4, 0, 13 3232; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) 3233; GFX11-NEXT: v_or_b32_e32 v4, 0x1000, v0 3234; GFX11-NEXT: v_cmp_ne_u32_e64 s1, 0, v0 3235; GFX11-NEXT: v_lshrrev_b32_e32 v5, v2, v4 3236; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3237; GFX11-NEXT: v_lshlrev_b32_e32 v2, v2, v5 3238; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v4 3239; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo 3240; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) 3241; GFX11-NEXT: v_or_b32_e32 v2, v5, v2 3242; GFX11-NEXT: v_add_nc_u32_e32 v3, 0xfffffc10, v3 3243; GFX11-NEXT: v_lshl_or_b32 v4, v3, 12, v0 3244; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v3 3245; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1 3246; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 3247; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo 3248; GFX11-NEXT: v_lshl_or_b32 v0, v0, 9, 0x7c00 3249; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) 3250; GFX11-NEXT: v_and_b32_e32 v4, 7, v2 3251; GFX11-NEXT: v_lshrrev_b32_e32 v2, 2, v2 3252; GFX11-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v4 3253; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v4 3254; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) 3255; GFX11-NEXT: s_or_b32 
vcc_lo, s0, vcc_lo
; GFX11-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo
; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v3
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7c00, v2, vcc_lo
; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v3
; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_and_or_b32 v0, 0x8000, v1, v0
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %fneg.a = fneg double %a
  %fpround = fptrunc double %fneg.a to half
  %fneg = fneg half %fpround
  ret half %fneg
}

; fneg(fptrunc(fneg %a)) where the inner fneg is also returned. The checks
; expect the half result to take its sign bit straight from the high dword of
; %a (no negate after the expanded truncation), while the returned double is
; formed by a single v_xor of the high dword with 0x80000000.
define { half, double } @v_fneg_fp_round_store_use_fneg_f64_to_f16(double %a) #0 {
; SI-LABEL: v_fneg_fp_round_store_use_fneg_f64_to_f16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v3, v0
; SI-NEXT:    v_and_b32_e32 v0, 0x1ff, v1
; SI-NEXT:    v_or_b32_e32 v0, v0, v3
; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT:    v_lshrrev_b32_e32 v2, 8, v1
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT:    v_and_b32_e32 v2, 0xffe, v2
; SI-NEXT:    v_bfe_u32 v4, v1, 20, 11
; SI-NEXT:    s_movk_i32 s4, 0x3f1
; SI-NEXT:    v_or_b32_e32 v0, v2, v0
; SI-NEXT:    v_sub_i32_e32 v5, vcc, s4, v4
; SI-NEXT:    v_or_b32_e32 v2, 0x1000, v0
; SI-NEXT:    v_med3_i32 v5, v5, 0, 13
; SI-NEXT:    v_lshrrev_b32_e32 v6, v5, v2
; SI-NEXT:    v_lshlrev_b32_e32 v5, v5, v6
; SI-NEXT:    v_cmp_ne_u32_e32 vcc, v5, v2
; SI-NEXT:    s_movk_i32 s4, 0xfc10
; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; SI-NEXT:    v_add_i32_e32 v4, vcc, s4, v4
; SI-NEXT:    v_lshlrev_b32_e32 v5, 12, v4
; SI-NEXT:    v_or_b32_e32 v2, v6, v2
; SI-NEXT:    v_or_b32_e32 v5, v0, v5
; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v4
; SI-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
; SI-NEXT:    v_and_b32_e32 v5, 7, v2
; SI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v5
; SI-NEXT:    v_cmp_eq_u32_e64 s[4:5], 3, v5
; SI-NEXT:    v_lshrrev_b32_e32 v2, 2, v2
; SI-NEXT:    s_or_b64 vcc, s[4:5], vcc
; SI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
; SI-NEXT:    v_mov_b32_e32 v5, 0x7c00
; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v4
; SI-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT:    v_lshlrev_b32_e32 v0, 9, v0
; SI-NEXT:    s_movk_i32 s4, 0x40f
; SI-NEXT:    v_or_b32_e32 v0, 0x7c00, v0
; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v4
; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
; SI-NEXT:    v_and_b32_e32 v2, 0x8000, v2
; SI-NEXT:    v_or_b32_e32 v0, v2, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_xor_b32_e32 v2, 0x80000000, v1
; SI-NEXT:    v_mov_b32_e32 v1, v3
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_fp_round_store_use_fneg_f64_to_f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v3, v0
; VI-NEXT:    v_and_b32_e32 v0, 0x1ff, v1
; VI-NEXT:    v_or_b32_e32 v0, v0, v3
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
; VI-NEXT:    v_and_b32_e32 v4, 0xffe, v4
; VI-NEXT:    v_bfe_u32 v5, v1, 20, 11
; VI-NEXT:    s_movk_i32 s4, 0x3f1
; VI-NEXT:    v_or_b32_e32 v0, v4, v0
; VI-NEXT:    v_sub_u32_e32 v6, vcc, s4, v5
; VI-NEXT:    v_or_b32_e32 v4, 0x1000, v0
; VI-NEXT:    v_med3_i32 v6, v6, 0, 13
; VI-NEXT:    v_lshrrev_b32_e32 v7, v6, v4
; VI-NEXT:    v_lshlrev_b32_e32 v6, v6, v7
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, v6, v4
; VI-NEXT:    s_movk_i32 s4, 0xfc10
; VI-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
; VI-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
; VI-NEXT:    v_lshlrev_b32_e32 v6, 12, v5
; VI-NEXT:    v_or_b32_e32 v4, v7, v4
; VI-NEXT:    v_or_b32_e32 v6, v0, v6
; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v5
; VI-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
; VI-NEXT:    v_and_b32_e32 v6, 7, v4
; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v6
; VI-NEXT:    v_cmp_eq_u32_e64 s[4:5], 3, v6
; VI-NEXT:    v_lshrrev_b32_e32 v4, 2, v4
; VI-NEXT:    s_or_b64 vcc, s[4:5], vcc
; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
; VI-NEXT:    v_mov_b32_e32 v6, 0x7c00
; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v5
; VI-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
; VI-NEXT:    v_lshlrev_b32_e32 v0, 9, v0
; VI-NEXT:    s_movk_i32 s4, 0x40f
; VI-NEXT:    v_or_b32_e32 v0, 0x7c00, v0
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v5
; VI-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
; VI-NEXT:    v_mov_b32_e32 v4, 0x8000
; VI-NEXT:    v_xor_b32_e32 v2, 0x80000000, v1
; VI-NEXT:    v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_or_b32_e32 v0, v1, v0
; VI-NEXT:    v_mov_b32_e32 v1, v3
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fneg_fp_round_store_use_fneg_f64_to_f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_and_or_b32 v2, 0x1ff, v1, v0
; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
; GFX11-NEXT:    v_bfe_u32 v4, v1, 20, 11
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-NEXT:    v_sub_nc_u32_e32 v5, 0x3f1, v4
; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT:    v_and_or_b32 v2, 0xffe, v3, v2
; GFX11-NEXT:    v_med3_i32 v3, v5, 0, 13
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_or_b32_e32 v5, 0x1000, v2
; GFX11-NEXT:    v_cmp_ne_u32_e64 s1, 0, v2
; GFX11-NEXT:    v_lshrrev_b32_e32 v6, v3, v5
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v3, v3, v6
; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v3, v5
; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc_lo
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_or_b32_e32 v3, v6, v3
; GFX11-NEXT:    v_add_nc_u32_e32 v4, 0xfffffc10, v4
; GFX11-NEXT:    v_lshl_or_b32 v5, v4, 12, v2
; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v4
; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc_lo
; GFX11-NEXT:    v_lshl_or_b32 v2, v2, 9, 0x7c00
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_and_b32_e32 v5, 7, v3
; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 2, v3
; GFX11-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 5, v5
; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 3, v5
; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-NEXT:    s_or_b32 vcc_lo, s0, vcc_lo
; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v4
; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x7c00, v3, vcc_lo
; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v4
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc_lo
; GFX11-NEXT:    v_and_or_b32 v3, 0x8000, v5, v2
; GFX11-NEXT:    v_xor_b32_e32 v2, 0x80000000, v1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, v3
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %fneg.a = fneg double %a
  %fpround = fptrunc double %fneg.a to half
  %fneg = fneg half %fpround
  %insert.0 = insertvalue { half, double } poison, half %fneg, 0
  %insert.1 = insertvalue { half, double } %insert.0, double %fneg.a, 1
  ret { half, double } %insert.1
}

; fneg(fptrunc(fneg %a)) where the inner fneg additionally feeds an fmul. The
; checks expect the half result to be built from the un-negated input, and the
; extra use to appear as a negated source operand of v_mul_f64 (-v[0:1]).
define { half, double } @v_fneg_fp_round_multi_use_fneg_f64_to_f16(double %a, double %c) #0 {
; SI-LABEL: v_fneg_fp_round_multi_use_fneg_f64_to_f16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_and_b32_e32 v4, 0x1ff, v1
; SI-NEXT:    v_or_b32_e32 v4, v4, v0
; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
; SI-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
; SI-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
; SI-NEXT:    v_and_b32_e32 v5, 0xffe, v5
; SI-NEXT:    v_bfe_u32 v6, v1, 20, 11
; SI-NEXT:    s_movk_i32 s4, 0x3f1
; SI-NEXT:    v_or_b32_e32 v4, v5, v4
; SI-NEXT:    v_sub_i32_e32 v7, vcc, s4, v6
; SI-NEXT:    v_or_b32_e32 v5, 0x1000, v4
; SI-NEXT:    v_med3_i32 v7, v7, 0, 13
; SI-NEXT:    v_lshrrev_b32_e32 v8, v7, v5
; SI-NEXT:    v_lshlrev_b32_e32 v7, v7, v8
; SI-NEXT:    v_cmp_ne_u32_e32 vcc, v7, v5
; SI-NEXT:    s_movk_i32 s4, 0xfc10
; SI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
; SI-NEXT:    v_add_i32_e32 v6, vcc, s4, v6
; SI-NEXT:    v_lshlrev_b32_e32 v7, 12, v6
; SI-NEXT:    v_or_b32_e32 v5, v8, v5
; SI-NEXT:    v_or_b32_e32 v7, v4, v7
; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v6
; SI-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
; SI-NEXT:    v_and_b32_e32 v7, 7, v5
; SI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v7
; SI-NEXT:    v_cmp_eq_u32_e64 s[4:5], 3, v7
; SI-NEXT:    v_lshrrev_b32_e32 v5, 2, v5
; SI-NEXT:    s_or_b64 vcc, s[4:5], vcc
; SI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
; SI-NEXT:    v_mov_b32_e32 v7, 0x7c00
; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v6
; SI-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
; SI-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
; SI-NEXT:    v_lshlrev_b32_e32 v4, 9, v4
; SI-NEXT:    s_movk_i32 s4, 0x40f
; SI-NEXT:    v_or_b32_e32 v4, 0x7c00, v4
; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v6
; SI-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
; SI-NEXT:    v_and_b32_e32 v5, 0x8000, v5
; SI-NEXT:    v_or_b32_e32 v4, v5, v4
; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
; SI-NEXT:    v_mul_f64 v[1:2], -v[0:1], v[2:3]
; SI-NEXT:    v_mov_b32_e32 v0, v4
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_fp_round_multi_use_fneg_f64_to_f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_and_b32_e32 v4, 0x1ff, v1
; VI-NEXT:    v_or_b32_e32 v4, v4, v0
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
; VI-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
; VI-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
; VI-NEXT:    v_and_b32_e32 v5, 0xffe, v5
; VI-NEXT:    v_bfe_u32 v6, v1, 20, 11
; VI-NEXT:    s_movk_i32 s4, 0x3f1
; VI-NEXT:    v_or_b32_e32 v4, v5, v4
; VI-NEXT:    v_sub_u32_e32 v7, vcc, s4, v6
; VI-NEXT:    v_or_b32_e32 v5, 0x1000, v4
; VI-NEXT:    v_med3_i32 v7, v7, 0, 13
; VI-NEXT:    v_lshrrev_b32_e32 v8, v7, v5
; VI-NEXT:    v_lshlrev_b32_e32 v7, v7, v8
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, v7, v5
; VI-NEXT:    s_movk_i32 s4, 0xfc10
; VI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
; VI-NEXT:    v_add_u32_e32 v6, vcc, s4, v6
; VI-NEXT:    v_lshlrev_b32_e32 v7, 12, v6
; VI-NEXT:    v_or_b32_e32 v5, v8, v5
; VI-NEXT:    v_or_b32_e32 v7, v4, v7
; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v6
; VI-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
; VI-NEXT:    v_and_b32_e32 v7, 7, v5
; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v7
; VI-NEXT:    v_cmp_eq_u32_e64 s[4:5], 3, v7
; VI-NEXT:    v_lshrrev_b32_e32 v5, 2, v5
; VI-NEXT:    s_or_b64 vcc, s[4:5], vcc
; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT:    v_mul_f64 v[2:3], -v[0:1], v[2:3]
; VI-NEXT:    v_mov_b32_e32 v7, 0x7c00
; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v6
; VI-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
; VI-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
; VI-NEXT:    v_lshlrev_b32_e32 v4, 9, v4
; VI-NEXT:    s_movk_i32 s4, 0x40f
; VI-NEXT:    v_or_b32_e32 v4, 0x7c00, v4
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v6
; VI-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
; VI-NEXT:    v_mov_b32_e32 v4, 0x8000
; VI-NEXT:    v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_or_b32_e32 v0, v1, v0
; VI-NEXT:    v_mov_b32_e32 v1, v2
; VI-NEXT:    v_mov_b32_e32 v2, v3
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fneg_fp_round_multi_use_fneg_f64_to_f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_and_or_b32 v4, 0x1ff, v1, v0
; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
; GFX11-NEXT:    v_bfe_u32 v6, v1, 20, 11
; GFX11-NEXT:    v_mul_f64 v[2:3], -v[0:1], v[2:3]
; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v4
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_sub_nc_u32_e32 v7, 0x3f1, v6
; GFX11-NEXT:    v_add_nc_u32_e32 v6, 0xfffffc10, v6
; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
; GFX11-NEXT:    v_and_or_b32 v4, 0xffe, v5, v4
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_med3_i32 v5, v7, 0, 13
; GFX11-NEXT:    v_or_b32_e32 v7, 0x1000, v4
; GFX11-NEXT:    v_cmp_ne_u32_e64 s1, 0, v4
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_lshrrev_b32_e32 v8, v5, v7
; GFX11-NEXT:    v_lshlrev_b32_e32 v5, v5, v8
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v5, v7
; GFX11-NEXT:    v_lshl_or_b32 v7, v6, 12, v4
; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s1
; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v6
; GFX11-NEXT:    v_lshl_or_b32 v4, v4, 9, 0x7c00
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_or_b32_e32 v5, v8, v5
; GFX11-NEXT:    v_cndmask_b32_e32 v0, v7, v5, vcc_lo
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_and_b32_e32 v5, 7, v0
; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 5, v5
; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 3, v5
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT:    s_or_b32 vcc_lo, s0, vcc_lo
; GFX11-NEXT:    v_add_co_ci_u32_e32 v0, vcc_lo, 0, v0, vcc_lo
; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v6
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7c00, v0, vcc_lo
; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v6
; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_and_or_b32 v0, 0x8000, v1, v0
; GFX11-NEXT:    v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v3
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %fneg.a = fneg double %a
  %fpround = fptrunc double %fneg.a to half
  %fneg = fneg half %fpround
  %use1 = fmul double %fneg.a, %c
  %insert.0 = insertvalue { half, double } poison, half %fneg, 0
  %insert.1 = insertvalue { half, double } %insert.0, double %use1, 1
  ret { half, double } %insert.1
}

; fneg(fptrunc %a) where the fptrunc result is also returned un-negated. The
; checks expect the truncation to be expanded once; the negated copy is then
; derived from it (v_xor with 0x8000 on VI/GFX11, a negated source operand on
; the f16->f32 conversion on SI).
define { half, half } @v_fneg_multi_use_fp_round_fneg_f64_to_f16(double %a) #0 {
; SI-LABEL: v_fneg_multi_use_fp_round_fneg_f64_to_f16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_and_b32_e32 v2, 0x1ff, v1
; SI-NEXT:    v_or_b32_e32 v0, v2, v0
; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT:    v_lshrrev_b32_e32 v2, 8, v1
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT:    v_and_b32_e32 v2, 0xffe, v2
; SI-NEXT:    v_bfe_u32 v3, v1, 20, 11
; SI-NEXT:    v_or_b32_e32 v0, v2, v0
; SI-NEXT:    v_sub_i32_e32 v4, vcc, 0x3f1, v3
; SI-NEXT:    v_or_b32_e32 v2, 0x1000, v0
; SI-NEXT:    v_med3_i32 v4, v4, 0, 13
; SI-NEXT:    v_lshrrev_b32_e32 v5, v4, v2
; SI-NEXT:    v_lshlrev_b32_e32 v4, v4, v5
; SI-NEXT:    v_cmp_ne_u32_e32 vcc, v4, v2
; SI-NEXT:    s_movk_i32 s4, 0xfc10
; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; SI-NEXT:    v_add_i32_e32 v3, vcc, s4, v3
; SI-NEXT:    v_lshlrev_b32_e32 v4, 12, v3
; SI-NEXT:    v_or_b32_e32 v2, v5, v2
; SI-NEXT:    v_or_b32_e32 v4, v0, v4
; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v3
; SI-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
; SI-NEXT:    v_and_b32_e32 v4, 7, v2
; SI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v4
; SI-NEXT:    v_cmp_eq_u32_e64 s[4:5], 3, v4
; SI-NEXT:    v_lshrrev_b32_e32 v2, 2, v2
; SI-NEXT:    s_or_b64 vcc, s[4:5], vcc
; SI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
; SI-NEXT:    v_mov_b32_e32 v4, 0x7c00
; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v3
; SI-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT:    v_lshlrev_b32_e32 v0, 9, v0
; SI-NEXT:    s_movk_i32 s4, 0x40f
; SI-NEXT:    v_or_b32_e32 v0, 0x7c00, v0
; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v3
; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
; SI-NEXT:    v_and_b32_e32 v1, 0x8000, v1
; SI-NEXT:    v_or_b32_e32 v1, v1, v0
; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v1
; SI-NEXT:    v_cvt_f32_f16_e64 v0, -v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_multi_use_fp_round_fneg_f64_to_f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_and_b32_e32 v2, 0x1ff, v1
; VI-NEXT:    v_or_b32_e32 v0, v2, v0
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT:    v_lshrrev_b32_e32 v2, 8, v1
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
; VI-NEXT:    v_and_b32_e32 v2, 0xffe, v2
; VI-NEXT:    v_bfe_u32 v3, v1, 20, 11
; VI-NEXT:    v_or_b32_e32 v0, v2, v0
; VI-NEXT:    v_sub_u32_e32 v4, vcc, 0x3f1, v3
; VI-NEXT:    v_or_b32_e32 v2, 0x1000, v0
; VI-NEXT:    v_med3_i32 v4, v4, 0, 13
; VI-NEXT:    v_lshrrev_b32_e32 v5, v4, v2
; VI-NEXT:    v_lshlrev_b32_e32 v4, v4, v5
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, v4, v2
; VI-NEXT:    s_movk_i32 s4, 0xfc10
; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; VI-NEXT:    v_add_u32_e32 v3, vcc, s4, v3
; VI-NEXT:    v_lshlrev_b32_e32 v4, 12, v3
; VI-NEXT:    v_or_b32_e32 v2, v5, v2
; VI-NEXT:    v_or_b32_e32 v4, v0, v4
; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v3
; VI-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
; VI-NEXT:    v_and_b32_e32 v4, 7, v2
; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v4
; VI-NEXT:    v_cmp_eq_u32_e64 s[4:5], 3, v4
; VI-NEXT:    v_lshrrev_b32_e32 v2, 2, v2
; VI-NEXT:    s_or_b64 vcc, s[4:5], vcc
; VI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT:    v_mov_b32_e32 v4, 0x7c00
; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v3
; VI-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
; VI-NEXT:    v_lshlrev_b32_e32 v0, 9, v0
; VI-NEXT:    s_movk_i32 s4, 0x40f
; VI-NEXT:    v_or_b32_e32 v0, 0x7c00, v0
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v3
; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
; VI-NEXT:    v_mov_b32_e32 v2, 0x8000
; VI-NEXT:    v_and_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_or_b32_e32 v1, v1, v0
; VI-NEXT:    v_xor_b32_e32 v0, 0x8000, v1
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fneg_multi_use_fp_round_fneg_f64_to_f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_and_or_b32 v0, 0x1ff, v1, v0
; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 8, v1
; GFX11-NEXT:    v_bfe_u32 v3, v1, 20, 11
; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX11-NEXT:    v_sub_nc_u32_e32 v4, 0x3f1, v3
; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT:    v_and_or_b32 v0, 0xffe, v2, v0
; GFX11-NEXT:    v_med3_i32 v2, v4, 0, 13
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_or_b32_e32 v4, 0x1000, v0
; GFX11-NEXT:    v_cmp_ne_u32_e64 s1, 0, v0
; GFX11-NEXT:    v_lshrrev_b32_e32 v5, v2, v4
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v2, v2, v5
; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v2, v4
; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_or_b32_e32 v2, v5, v2
; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0xfffffc10, v3
; GFX11-NEXT:    v_lshl_or_b32 v4, v3, 12, v0
; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v3
; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc_lo
; GFX11-NEXT:    v_lshl_or_b32 v0, v0, 9, 0x7c00
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_and_b32_e32 v4, 7, v2
; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 2, v2
; GFX11-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 5, v4
; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 3, v4
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT:    s_or_b32 vcc_lo, s0, vcc_lo
; GFX11-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo
; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v3
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7c00, v2, vcc_lo
; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v3
; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_and_or_b32 v1, 0x8000, v1, v0
; GFX11-NEXT:    v_xor_b32_e32 v0, 0x8000, v1
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %fpround = fptrunc double %a to half
  %fneg = fneg half %fpround
  %insert.0 = insertvalue { half, half } poison, half %fneg, 0
  %insert.1 = insertvalue { half, half } %insert.0, half %fpround, 1
  ret { half, half } %insert.1
}

; --------------------------------------------------------------------------------
; ftrunc tests
; --------------------------------------------------------------------------------

; fneg(trunc %a): the checks expect no separate negate instruction. The
; negation appears as a source modifier (-v0) on v_trunc_f16 for VI/GFX11 and
; on the f32->f16 conversion that precedes v_trunc_f32 for SI.
define half @v_fneg_trunc_f16(half %a) #0 {
; SI-LABEL: v_fneg_trunc_f16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_cvt_f16_f32_e64 v0, -v0
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_trunc_f32_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_trunc_f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_trunc_f16_e64 v0, -v0
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fneg_trunc_f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_trunc_f16_e64 v0, -v0
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %trunc = call half @llvm.trunc.f16(half %a)
  %fneg = fneg half %trunc
  ret half %fneg
}

; --------------------------------------------------------------------------------
; fround tests
; --------------------------------------------------------------------------------

; fneg(round %a): llvm.round is expanded inline. The *-SAFE runs keep an
; explicit sign flip (v_xor) after the final add; the *-NSZ runs instead end
; in a v_sub with a negated first operand (-v1).
define half @v_fneg_round_f16(half %a) #0 {
; SI-SAFE-LABEL: v_fneg_round_f16:
; SI-SAFE:       ; %bb.0:
; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-SAFE-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-SAFE-NEXT:    v_trunc_f32_e32 v1, v0
; SI-SAFE-NEXT:    v_sub_f32_e32 v2, v0, v1
; SI-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
; SI-SAFE-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
; SI-SAFE-NEXT:    s_brev_b32 s4, -2
; SI-SAFE-NEXT:    v_bfi_b32 v0, s4, v2, v0
; SI-SAFE-NEXT:    v_add_f32_e32 v0, v1, v0
; SI-SAFE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
; SI-SAFE-NEXT:    s_setpc_b64 s[30:31]
;
; SI-NSZ-LABEL: v_fneg_round_f16:
; SI-NSZ:       ; %bb.0:
; SI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NSZ-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NSZ-NEXT:    v_trunc_f32_e32 v1, v0
; SI-NSZ-NEXT:    v_sub_f32_e32 v2, v0, v1
; SI-NSZ-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
; SI-NSZ-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
; SI-NSZ-NEXT:    s_brev_b32 s4, -2
; SI-NSZ-NEXT:    v_bfi_b32 v0, s4, v2, v0
; SI-NSZ-NEXT:    v_sub_f32_e64 v0, -v1, v0
; SI-NSZ-NEXT:    s_setpc_b64 s[30:31]
;
; VI-SAFE-LABEL: v_fneg_round_f16:
; VI-SAFE:       ; %bb.0:
; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-SAFE-NEXT:    v_trunc_f16_e32 v1, v0
; VI-SAFE-NEXT:    v_sub_f16_e32 v2, v0, v1
; VI-SAFE-NEXT:    v_mov_b32_e32 v3, 0x3c00
; VI-SAFE-NEXT:    v_cmp_ge_f16_e64 vcc, |v2|, 0.5
; VI-SAFE-NEXT:    v_cndmask_b32_e32 v2, 0, v3, vcc
; VI-SAFE-NEXT:    s_movk_i32 s4, 0x7fff
; VI-SAFE-NEXT:    v_bfi_b32 v0, s4, v2, v0
; VI-SAFE-NEXT:    v_add_f16_e32 v0, v1, v0
; VI-SAFE-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
; VI-SAFE-NEXT:    s_setpc_b64 s[30:31]
;
; VI-NSZ-LABEL: v_fneg_round_f16:
; VI-NSZ:       ; %bb.0:
; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NSZ-NEXT:    v_trunc_f16_e32 v1, v0
; VI-NSZ-NEXT:    v_sub_f16_e32 v2, v0, v1
; VI-NSZ-NEXT:    v_mov_b32_e32 v3, 0x3c00
; VI-NSZ-NEXT:    v_cmp_ge_f16_e64 vcc, |v2|, 0.5
; VI-NSZ-NEXT:    v_cndmask_b32_e32 v2, 0, v3, vcc
; VI-NSZ-NEXT:    s_movk_i32 s4, 0x7fff
; VI-NSZ-NEXT:    v_bfi_b32 v0, s4, v2, v0
; VI-NSZ-NEXT:    v_sub_f16_e64 v0, -v1, v0
; VI-NSZ-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-SAFE-LABEL: v_fneg_round_f16:
; GFX11-SAFE:       ; %bb.0:
; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SAFE-NEXT:    v_trunc_f16_e32 v1, v0
; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SAFE-NEXT:    v_sub_f16_e32 v2, v0, v1
; GFX11-SAFE-NEXT:    v_cmp_ge_f16_e64 s0, |v2|, 0.5
; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SAFE-NEXT:    v_cndmask_b32_e64 v2, 0, 0x3c00, s0
; GFX11-SAFE-NEXT:    v_bfi_b32 v0, 0x7fff, v2, v0
; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SAFE-NEXT:    v_add_f16_e32 v0, v1, v0
; GFX11-SAFE-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-NSZ-LABEL: v_fneg_round_f16:
; GFX11-NSZ:       ; %bb.0:
; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NSZ-NEXT:    v_trunc_f16_e32 v1, v0
; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NSZ-NEXT:    v_sub_f16_e32 v2, v0, v1
; GFX11-NSZ-NEXT:    v_cmp_ge_f16_e64 s0, |v2|, 0.5
; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NSZ-NEXT:    v_cndmask_b32_e64 v2, 0, 0x3c00, s0
; GFX11-NSZ-NEXT:    v_bfi_b32 v0, 0x7fff, v2, v0
; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NSZ-NEXT:    v_sub_f16_e64 v0, -v1, v0
; GFX11-NSZ-NEXT:    s_setpc_b64 s[30:31]
  %round = call half @llvm.round.f16(half %a)
  %fneg = fneg half %round
  ret half %fneg
}

; --------------------------------------------------------------------------------
; rint tests
; --------------------------------------------------------------------------------

; fneg(rint %a): the checks expect the negation as a source modifier (-v0) on
; v_rndne_f16 for VI/GFX11 and on the f32->f16 conversion for SI.
define half @v_fneg_rint_f16(half %a) #0 {
; SI-LABEL: v_fneg_rint_f16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_cvt_f16_f32_e64 v0, -v0
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_rndne_f32_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_rint_f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_rndne_f16_e64 v0, -v0
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fneg_rint_f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_rndne_f16_e64 v0, -v0
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %rint = call half @llvm.rint.f16(half %a)
  %fneg = fneg half %rint
  ret half %fneg
}

; --------------------------------------------------------------------------------
; nearbyint tests
; --------------------------------------------------------------------------------

; fneg(nearbyint %a): the expected code is the same as for the rint test, with
; the negation as a source modifier (-v0).
define half @v_fneg_nearbyint_f16(half %a) #0 {
; SI-LABEL: v_fneg_nearbyint_f16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_cvt_f16_f32_e64 v0, -v0
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_rndne_f32_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_nearbyint_f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_rndne_f16_e64 v0, -v0
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fneg_nearbyint_f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_rndne_f16_e64 v0, -v0
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %nearbyint = call half @llvm.nearbyint.f16(half %a)
  %fneg = fneg half %nearbyint
  ret half %fneg
}

; --------------------------------------------------------------------------------
; sin tests
; --------------------------------------------------------------------------------

; fneg(sin %a): no separate negate instruction is expected. On VI/GFX11 the
; pre-scaling multiply uses the immediate 0xb118, which has the f16 sign bit
; set; on SI the negation is a source modifier on the f32->f16 conversion.
define half @v_fneg_sin_f16(half %a) #0 {
; SI-LABEL: v_fneg_sin_f16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_cvt_f16_f32_e64 v0, -v0
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_mul_f32_e32 v0, 0x3e22f983, v0
; SI-NEXT:    v_fract_f32_e32 v0, v0
; SI-NEXT:    v_sin_f32_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_sin_f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_mul_f16_e32 v0, 0xb118, v0
; VI-NEXT:    v_fract_f16_e32 v0, v0
; VI-NEXT:    v_sin_f16_e32 v0, v0
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fneg_sin_f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_mul_f16_e32 v0, 0xb118, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_sin_f16_e32 v0, v0
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %sin = call half @llvm.sin.f16(half %a)
  %fneg = fneg half %sin
  ret half %fneg
}

; --------------------------------------------------------------------------------
; fcanonicalize tests
; --------------------------------------------------------------------------------

; fneg(canonicalize %a): on VI/GFX11 the checks expect a single v_max_f16 with
; both operands negated (-v0, -v0); on SI the negation is a source modifier on
; the f32->f16 conversion.
define half @v_fneg_canonicalize_f16(half %a) #0 {
; SI-LABEL: v_fneg_canonicalize_f16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_cvt_f16_f32_e64 v0, -v0
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_canonicalize_f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_max_f16_e64 v0, -v0, -v0
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fneg_canonicalize_f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_max_f16_e64 v0, -v0, -v0
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %canonicalize = call half @llvm.canonicalize.f16(half %a)
  %fneg = fneg half %canonicalize
  ret half %fneg
}

; --------------------------------------------------------------------------------
; CopyToReg tests
; --------------------------------------------------------------------------------

define void @v_fneg_copytoreg_f16(ptr addrspace(1) %out, half %a, half %b, half %c, i32 %d) #0 {
; SI-LABEL: v_fneg_copytoreg_f16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-NEXT:    v_and_b32_e32 v6, 0x3ff, v31
; SI-NEXT:    v_lshlrev_b32_e32 v6, 1, v6
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
; SI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT:    v_mul_f32_e32 v2, v2, v3
; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
; SI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT:    s_cbranch_execz .LBB81_2
; SI-NEXT:  ; %bb.1: ; %if
; SI-NEXT:    v_cvt_f16_f32_e64 v3, -v2
; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
; SI-NEXT:    v_mul_f32_e32 v3, v3, v4
; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
; SI-NEXT:    flat_store_short v[0:1], v3
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:  .LBB81_2: ; %endif
; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-NEXT:    flat_store_short v[0:1], v2
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_copytoreg_f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_and_b32_e32 v6,
0x3ff, v31 4024; VI-NEXT: v_lshlrev_b32_e32 v6, 1, v6 4025; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6 4026; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 4027; VI-NEXT: v_mul_f16_e32 v2, v2, v3 4028; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 4029; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc 4030; VI-NEXT: s_cbranch_execz .LBB81_2 4031; VI-NEXT: ; %bb.1: ; %if 4032; VI-NEXT: v_mul_f16_e64 v3, -v2, v4 4033; VI-NEXT: flat_store_short v[0:1], v3 4034; VI-NEXT: s_waitcnt vmcnt(0) 4035; VI-NEXT: .LBB81_2: ; %endif 4036; VI-NEXT: s_or_b64 exec, exec, s[4:5] 4037; VI-NEXT: flat_store_short v[0:1], v2 4038; VI-NEXT: s_waitcnt vmcnt(0) 4039; VI-NEXT: s_setpc_b64 s[30:31] 4040; 4041; GFX11-LABEL: v_fneg_copytoreg_f16: 4042; GFX11: ; %bb.0: 4043; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4044; GFX11-NEXT: v_and_b32_e32 v6, 0x3ff, v31 4045; GFX11-NEXT: v_mul_f16_e32 v2, v2, v3 4046; GFX11-NEXT: s_mov_b32 s0, exec_lo 4047; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 4048; GFX11-NEXT: v_lshlrev_b32_e32 v6, 1, v6 4049; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v6 4050; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 4051; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v5 4052; GFX11-NEXT: s_cbranch_execz .LBB81_2 4053; GFX11-NEXT: ; %bb.1: ; %if 4054; GFX11-NEXT: v_mul_f16_e64 v3, -v2, v4 4055; GFX11-NEXT: global_store_b16 v[0:1], v3, off dlc 4056; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 4057; GFX11-NEXT: .LBB81_2: ; %endif 4058; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 4059; GFX11-NEXT: global_store_b16 v[0:1], v2, off dlc 4060; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 4061; GFX11-NEXT: s_setpc_b64 s[30:31] 4062 %tid = call i32 @llvm.amdgcn.workitem.id.x() 4063 %tid.ext = sext i32 %tid to i64 4064 %out.gep = getelementptr inbounds half, ptr addrspace(1) %out, i64 %tid.ext 4065 %mul = fmul half %a, %b 4066 %fneg = fneg half %mul 4067 %cmp0 = icmp eq i32 %d, 0 4068 br i1 %cmp0, label %if, label %endif 4069 4070if: 4071 %mul1 = fmul half %fneg, %c 4072 
store volatile half %mul1, ptr addrspace(1) %out.gep 4073 br label %endif 4074 4075endif: 4076 store volatile half %mul, ptr addrspace(1) %out.gep 4077 ret void 4078} 4079 4080; -------------------------------------------------------------------------------- 4081; inlineasm tests 4082; -------------------------------------------------------------------------------- 4083 4084; Can't fold into use, so should fold into source 4085define half @v_fneg_inlineasm_f16(half %a, half %b, half %c, i32 %d) #0 { 4086; SI-LABEL: v_fneg_inlineasm_f16: 4087; SI: ; %bb.0: 4088; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4089; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 4090; SI-NEXT: v_cvt_f16_f32_e64 v1, -v1 4091; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 4092; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 4093; SI-NEXT: v_mul_f32_e32 v0, v0, v1 4094; SI-NEXT: v_cvt_f16_f32_e32 v1, v0 4095; SI-NEXT: ;;#ASMSTART 4096; SI-NEXT: ; use v1 4097; SI-NEXT: ;;#ASMEND 4098; SI-NEXT: s_setpc_b64 s[30:31] 4099; 4100; VI-LABEL: v_fneg_inlineasm_f16: 4101; VI: ; %bb.0: 4102; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4103; VI-NEXT: v_mul_f16_e64 v0, v0, -v1 4104; VI-NEXT: ;;#ASMSTART 4105; VI-NEXT: ; use v0 4106; VI-NEXT: ;;#ASMEND 4107; VI-NEXT: s_setpc_b64 s[30:31] 4108; 4109; GFX11-LABEL: v_fneg_inlineasm_f16: 4110; GFX11: ; %bb.0: 4111; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4112; GFX11-NEXT: v_mul_f16_e64 v0, v0, -v1 4113; GFX11-NEXT: ;;#ASMSTART 4114; GFX11-NEXT: ; use v0 4115; GFX11-NEXT: ;;#ASMEND 4116; GFX11-NEXT: s_setpc_b64 s[30:31] 4117 %mul = fmul half %a, %b 4118 %fneg = fneg half %mul 4119 call void asm sideeffect "; use $0", "v"(half %fneg) #0 4120 ret half %fneg 4121} 4122 4123; -------------------------------------------------------------------------------- 4124; inlineasm tests 4125; -------------------------------------------------------------------------------- 4126 4127; Can't fold into use, so should fold into source 4128define half @v_fneg_inlineasm_multi_use_src_f16(ptr 
addrspace(1) %out, half %a, half %b, half %c, i32 %d) #0 { 4129; SI-LABEL: v_fneg_inlineasm_multi_use_src_f16: 4130; SI: ; %bb.0: 4131; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4132; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 4133; SI-NEXT: v_cvt_f16_f32_e64 v1, -v3 4134; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 4135; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1 4136; SI-NEXT: v_mul_f32_e32 v0, v0, v1 4137; SI-NEXT: v_cvt_f16_f32_e64 v1, -v0 4138; SI-NEXT: ;;#ASMSTART 4139; SI-NEXT: ; use v1 4140; SI-NEXT: ;;#ASMEND 4141; SI-NEXT: s_setpc_b64 s[30:31] 4142; 4143; VI-LABEL: v_fneg_inlineasm_multi_use_src_f16: 4144; VI: ; %bb.0: 4145; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4146; VI-NEXT: v_mul_f16_e32 v0, v2, v3 4147; VI-NEXT: v_xor_b32_e32 v1, 0x8000, v0 4148; VI-NEXT: ;;#ASMSTART 4149; VI-NEXT: ; use v1 4150; VI-NEXT: ;;#ASMEND 4151; VI-NEXT: s_setpc_b64 s[30:31] 4152; 4153; GFX11-LABEL: v_fneg_inlineasm_multi_use_src_f16: 4154; GFX11: ; %bb.0: 4155; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4156; GFX11-NEXT: v_mul_f16_e32 v0, v2, v3 4157; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 4158; GFX11-NEXT: v_xor_b32_e32 v1, 0x8000, v0 4159; GFX11-NEXT: ;;#ASMSTART 4160; GFX11-NEXT: ; use v1 4161; GFX11-NEXT: ;;#ASMEND 4162; GFX11-NEXT: s_setpc_b64 s[30:31] 4163 %tid = call i32 @llvm.amdgcn.workitem.id.x() 4164 %tid.ext = sext i32 %tid to i64 4165 %out.gep = getelementptr inbounds half, ptr addrspace(1) %out, i64 %tid.ext 4166 %mul = fmul half %a, %b 4167 %fneg = fneg half %mul 4168 call void asm sideeffect "; use $0", "v"(half %fneg) #0 4169 ret half %mul 4170} 4171 4172; -------------------------------------------------------------------------------- 4173; code size regression tests 4174; -------------------------------------------------------------------------------- 4175 4176; There are multiple users of the fneg that must use a VOP3 4177; instruction, so there is no penalty 4178define { half, half } @multiuse_fneg_2_vop3_users_f16(half %a, half %b, half %c) #0 { 
4179; SI-LABEL: multiuse_fneg_2_vop3_users_f16: 4180; SI: ; %bb.0: 4181; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4182; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 4183; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 4184; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 4185; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 4186; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 4187; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 4188; SI-NEXT: v_fma_f32 v0, -v3, v1, v2 4189; SI-NEXT: v_fma_f32 v1, -v3, v2, 2.0 4190; SI-NEXT: s_setpc_b64 s[30:31] 4191; 4192; VI-LABEL: multiuse_fneg_2_vop3_users_f16: 4193; VI: ; %bb.0: 4194; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4195; VI-NEXT: v_fma_f16 v3, -v0, v1, v2 4196; VI-NEXT: v_fma_f16 v1, -v0, v2, 2.0 4197; VI-NEXT: v_mov_b32_e32 v0, v3 4198; VI-NEXT: s_setpc_b64 s[30:31] 4199; 4200; GFX11-LABEL: multiuse_fneg_2_vop3_users_f16: 4201; GFX11: ; %bb.0: 4202; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4203; GFX11-NEXT: v_fma_f16 v3, -v0, v1, v2 4204; GFX11-NEXT: v_fma_f16 v1, -v0, v2, 2.0 4205; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 4206; GFX11-NEXT: v_mov_b32_e32 v0, v3 4207; GFX11-NEXT: s_setpc_b64 s[30:31] 4208 %fneg.a = fneg half %a 4209 %fma0 = call half @llvm.fma.f16(half %fneg.a, half %b, half %c) 4210 %fma1 = call half @llvm.fma.f16(half %fneg.a, half %c, half 2.0) 4211 %insert.0 = insertvalue { half, half } poison, half %fma0, 0 4212 %insert.1 = insertvalue { half, half } %insert.0, half %fma1, 1 4213 ret { half, half } %insert.1 4214} 4215 4216; There are multiple users, but both require using a larger encoding 4217; for the modifier. 
; Both users of %fneg.a are fmul. The VI and GFX11 checks use v_mul_f16_e64
; with a negated first source for each multiply; the SI checks negate once, in
; the f32->f16 convert of %a, and then use plain e32 multiplies.
define { half, half } @multiuse_fneg_2_vop2_users_f16(half %a, half %b, half %c) #0 {
; SI-LABEL: multiuse_fneg_2_vop2_users_f16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_cvt_f16_f32_e64 v0, -v0
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_mul_f32_e32 v0, v3, v1
; SI-NEXT:    v_mul_f32_e32 v1, v3, v2
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: multiuse_fneg_2_vop2_users_f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_mul_f16_e64 v3, -v0, v1
; VI-NEXT:    v_mul_f16_e64 v1, -v0, v2
; VI-NEXT:    v_mov_b32_e32 v0, v3
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: multiuse_fneg_2_vop2_users_f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_mul_f16_e64 v3, -v0, v1
; GFX11-NEXT:    v_mul_f16_e64 v1, -v0, v2
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT:    v_mov_b32_e32 v0, v3
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %fneg.a = fneg half %a
  %mul0 = fmul half %fneg.a, %b
  %mul1 = fmul half %fneg.a, %c
  %insert.0 = insertvalue { half, half } poison, half %mul0, 0
  %insert.1 = insertvalue { half, half } %insert.0, half %mul1, 1
  ret { half, half } %insert.1
}

; One user is VOP3 so has no cost to folding the modifier, the other does.
; %fneg.a feeds one llvm.fma.f16 call and one fmul. The VI and GFX11 checks
; negate the first source of both instructions (the multiply uses the e64
; encoding); the SI checks negate once in the f32->f16 convert of %a.
; %tid, %tid.ext and %out.gep are not used by anything in this function.
define { half, half } @multiuse_fneg_vop2_vop3_users_f16(ptr addrspace(1) %out, half %a, half %b, half %c) #0 {
; SI-LABEL: multiuse_fneg_vop2_vop3_users_f16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v3
; SI-NEXT:    v_cvt_f16_f32_e64 v1, -v2
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v4
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_fma_f32 v0, v1, v0, 2.0
; SI-NEXT:    v_mul_f32_e32 v1, v1, v2
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: multiuse_fneg_vop2_vop3_users_f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_fma_f16 v0, -v2, v3, 2.0
; VI-NEXT:    v_mul_f16_e64 v1, -v2, v4
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: multiuse_fneg_vop2_vop3_users_f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_fma_f16 v0, -v2, v3, 2.0
; GFX11-NEXT:    v_mul_f16_e64 v1, -v2, v4
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %out.gep = getelementptr inbounds half, ptr addrspace(1) %out, i64 %tid.ext

  %fneg.a = fneg half %a
  %fma0 = call half @llvm.fma.f16(half %fneg.a, half %b, half 2.0)
  %mul1 = fmul half %fneg.a, %c

  %insert.0 = insertvalue { half, half } poison, half %fma0, 0
  %insert.1 = insertvalue { half, half } %insert.0, half %mul1, 1
  ret { half, half } %insert.1
}

; The use of the fneg requires a code size increase, but folding into
; the source does not
; The -SAFE and -NSZ check variants differ here. In the -NSZ checks the
; negation is absorbed by the fma (one multiplicand and the 2.0 addend are
; negated) and both multiplies use the e32 encoding. In the -SAFE checks the
; fma is left unchanged and each multiply negates its first source using the
; e64 encoding.
; %tid, %tid.ext and %out.gep are not used by anything in this function.
define { half, half } @free_fold_src_code_size_cost_use_f16(ptr addrspace(1) %out, half %a, half %b, half %c, half %d) #0 {
; SI-SAFE-LABEL: free_fold_src_code_size_cost_use_f16:
; SI-SAFE:       ; %bb.0:
; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v0, v3
; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v1, v2
; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v2, v4
; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v3, v5
; SI-SAFE-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-SAFE-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-SAFE-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-SAFE-NEXT:    v_cvt_f32_f16_e32 v3, v3
; SI-SAFE-NEXT:    v_fma_f32 v1, v1, v0, 2.0
; SI-SAFE-NEXT:    v_mul_f32_e64 v0, -v1, v2
; SI-SAFE-NEXT:    v_mul_f32_e64 v1, -v1, v3
; SI-SAFE-NEXT:    s_setpc_b64 s[30:31]
;
; SI-NSZ-LABEL: free_fold_src_code_size_cost_use_f16:
; SI-NSZ:       ; %bb.0:
; SI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v0, v3
; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v1, v2
; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v2, v4
; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v3, v5
; SI-NSZ-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NSZ-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NSZ-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NSZ-NEXT:    v_cvt_f32_f16_e32 v3, v3
; SI-NSZ-NEXT:    v_fma_f32 v1, v1, -v0, -2.0
; SI-NSZ-NEXT:    v_mul_f32_e32 v0, v1, v2
; SI-NSZ-NEXT:    v_mul_f32_e32 v1, v1, v3
; SI-NSZ-NEXT:    s_setpc_b64 s[30:31]
;
; VI-SAFE-LABEL: free_fold_src_code_size_cost_use_f16:
; VI-SAFE:       ; %bb.0:
; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-SAFE-NEXT:    v_fma_f16 v1, v2, v3, 2.0
; VI-SAFE-NEXT:    v_mul_f16_e64 v0, -v1, v4
; VI-SAFE-NEXT:    v_mul_f16_e64 v1, -v1, v5
; VI-SAFE-NEXT:    s_setpc_b64 s[30:31]
;
; VI-NSZ-LABEL: free_fold_src_code_size_cost_use_f16:
; VI-NSZ:       ; %bb.0:
; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NSZ-NEXT:    v_fma_f16 v1, v2, -v3, -2.0
; VI-NSZ-NEXT:    v_mul_f16_e32 v0, v1, v4
; VI-NSZ-NEXT:    v_mul_f16_e32 v1, v1, v5
; VI-NSZ-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-SAFE-LABEL: free_fold_src_code_size_cost_use_f16:
; GFX11-SAFE:       ; %bb.0:
; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SAFE-NEXT:    v_fma_f16 v1, v2, v3, 2.0
; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-SAFE-NEXT:    v_mul_f16_e64 v0, -v1, v4
; GFX11-SAFE-NEXT:    v_mul_f16_e64 v1, -v1, v5
; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-NSZ-LABEL: free_fold_src_code_size_cost_use_f16:
; GFX11-NSZ:       ; %bb.0:
; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NSZ-NEXT:    v_fma_f16 v1, v2, -v3, -2.0
; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NSZ-NEXT:    v_mul_f16_e32 v0, v1, v4
; GFX11-NSZ-NEXT:    v_mul_f16_e32 v1, v1, v5
; GFX11-NSZ-NEXT:    s_setpc_b64 s[30:31]
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %out.gep = getelementptr inbounds half, ptr addrspace(1) %out, i64 %tid.ext

  %fma0 = call half @llvm.fma.f16(half %a, half %b, half 2.0)
  %fneg.fma0 = fneg half %fma0
  %mul1 = fmul half %fneg.fma0, %c
  %mul2 = fmul half %fneg.fma0, %d

  %insert.0 = insertvalue { half, half } poison, half %mul1, 0
  %insert.1 = insertvalue { half, half } %insert.0, half %mul2, 1
  ret { half, half } %insert.1
}

; %trunc.a has one fneg use, but it requires a code size increase and
; the fneg can instead be folded for free into the fma.
; llvm.trunc.f16 feeds a single fneg whose only user is the fma. The checks
; keep the trunc un-negated and negate the first source of the fma instead.
; %tid, %tid.ext and %out.gep are not used by anything in this function.
define half @one_use_cost_to_fold_into_src_f16(ptr addrspace(1) %out, half %a, half %b, half %c, half %d) #0 {
; SI-LABEL: one_use_cost_to_fold_into_src_f16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v2
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v4
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v3
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_trunc_f32_e32 v1, v1
; SI-NEXT:    v_fma_f32 v0, -v1, v2, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: one_use_cost_to_fold_into_src_f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_trunc_f16_e32 v0, v2
; VI-NEXT:    v_fma_f16 v0, -v0, v3, v4
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: one_use_cost_to_fold_into_src_f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_trunc_f16_e32 v0, v2
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_fma_f16 v0, -v0, v3, v4
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %out.gep = getelementptr inbounds half, ptr addrspace(1) %out, i64 %tid.ext

  %trunc.a = call half @llvm.trunc.f16(half %a)
  %trunc.fneg.a = fneg half %trunc.a
  %fma0 = call half @llvm.fma.f16(half %trunc.fneg.a, half %b, half %c)
  ret half %fma0
}

; Here %trunc.a has two users: the fneg (consumed by the fma) and an
; un-negated fmul. The checks negate only the first source of the fma; the
; multiply reads the trunc result directly.
; %tid, %tid.ext and %out.gep are not used by anything in this function.
define { half, half } @multi_use_cost_to_fold_into_src(ptr addrspace(1) %out, half %a, half %b, half %c, half %d) #0 {
; SI-LABEL: multi_use_cost_to_fold_into_src:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v2
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v4
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v3
; SI-NEXT:    v_cvt_f16_f32_e32 v3, v5
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
; SI-NEXT:    v_trunc_f32_e32 v1, v1
; SI-NEXT:    v_fma_f32 v0, -v1, v2, v0
; SI-NEXT:    v_mul_f32_e32 v1, v1, v3
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: multi_use_cost_to_fold_into_src:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_trunc_f16_e32 v1, v2
; VI-NEXT:    v_fma_f16 v0, -v1, v3, v4
; VI-NEXT:    v_mul_f16_e32 v1, v1, v5
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: multi_use_cost_to_fold_into_src:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_trunc_f16_e32 v1, v2
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_fma_f16 v0, -v1, v3, v4
; GFX11-NEXT:    v_mul_f16_e32 v1, v1, v5
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %out.gep = getelementptr inbounds half, ptr addrspace(1) %out, i64 %tid.ext
  %trunc.a = call half @llvm.trunc.f16(half %a)
  %trunc.fneg.a = fneg half %trunc.a
  %fma0 = call half @llvm.fma.f16(half %trunc.fneg.a, half %b, half %c)
  %mul1 = fmul half %trunc.a, %d
  %insert.0 = insertvalue { half, half } poison, half %fma0, 0
  %insert.1 = insertvalue { half, half } %insert.0, half %mul1, 1
  ret { half, half } %insert.1
}

; The AMDGPU combine to pull fneg into the FMA operands was being
; undone by the generic combine to pull the fneg out of the fma if
; !isFNegFree. We were reporting false for v2f32 even though it will
; be split into f32 where it will be free.
; This f16 variant builds the same fma -> fadd -> fneg -> fmul chain (all
; but the fneg marked fast) with <2 x half> operands. The GFX11 checks use
; packed instructions with neg_lo/neg_hi modifiers.
define <2 x half> @fneg_fma_fneg_dagcombine_loop(<2 x half> %arg, <2 x half> %arg1, <2 x half> %arg2) #0 {
; SI-LABEL: fneg_fma_fneg_dagcombine_loop:
; SI:       ; %bb.0: ; %bb
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
; SI-NEXT:    v_or_b32_e32 v6, v4, v6
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_xor_b32_e32 v6, 0x80008000, v6
; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
; SI-NEXT:    s_brev_b32 s4, 1
; SI-NEXT:    v_fma_f32 v3, v3, v7, s4
; SI-NEXT:    v_fma_f32 v2, v2, v6, s4
; SI-NEXT:    v_sub_f32_e32 v1, v3, v1
; SI-NEXT:    v_sub_f32_e32 v0, v2, v0
; SI-NEXT:    v_mul_f32_e32 v0, v0, v4
; SI-NEXT:    v_mul_f32_e32 v1, v1, v5
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: fneg_fma_fneg_dagcombine_loop:
; VI:       ; %bb.0: ; %bb
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s4, 0x8000
; VI-NEXT:    v_fma_f16 v3, v1, -v2, s4
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
; VI-NEXT:    v_fma_f16 v1, v1, -v4, s4
; VI-NEXT:    v_sub_f16_e32 v3, v3, v0
; VI-NEXT:    v_sub_f16_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT:    v_mul_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_mul_f16_e32 v1, v3, v2
; VI-NEXT:    v_or_b32_e32 v0, v1, v0
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fneg_fma_fneg_dagcombine_loop:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_pk_fma_f16 v1, v1, v2, 0x8000 op_sel_hi:[1,1,0] neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_pk_add_f16 v0, v1, v0 neg_lo:[0,1] neg_hi:[0,1]
; GFX11-NEXT:    v_pk_mul_f16 v0, v0, v2
; GFX11-NEXT:    s_setpc_b64 s[30:31]
bb:
  %i3 = call fast <2 x half> @llvm.fma.v2f16(<2 x half> %arg1, <2 x half> %arg2, <2 x half> zeroinitializer)
  %i4 = fadd fast <2 x half> %i3, %arg
  %i5 = fneg <2 x half> %i4
  %i6 = fmul fast <2 x half> %i5, %arg2
  ret <2 x half> %i6
}

; This expects denormal flushing, so can't turn this fmul into fneg
; TODO: Keeping this as fmul saves encoding size
; NOTE(review): in this f16 variant every check variant shows the multiply by
; -1.0 folded into a source negation, which does not match the first comment
; line above. Attribute group #0 only sets "denormal-fp-math-f32", so
; presumably f16 denormals are not flushed here -- confirm and reword.
; The value named %add is the result of an fmul; only that second multiply
; carries the nnan flag.
define half @nnan_fmul_neg1_to_fneg(half %x, half %y) #0 {
; SI-LABEL: nnan_fmul_neg1_to_fneg:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_cvt_f16_f32_e64 v0, -v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_mul_f32_e32 v0, v0, v1
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: nnan_fmul_neg1_to_fneg:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_mul_f16_e64 v0, -v0, v1
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: nnan_fmul_neg1_to_fneg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_mul_f16_e64 v0, -v0, v1
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %mul = fmul half %x, -1.0
  %add = fmul nnan half %mul, %y
  ret half %add
}

; It's legal to turn this fmul into an fneg since denormals are
; preserved and we know an snan can't happen from the flag.
; This function has no attribute group, and the nnan flag is on the multiply
; by -1.0 itself. The checks contain no separate multiply by -1.0; the sign
; shows up as a source negation feeding the remaining multiply.
define half @denormal_fmul_neg1_to_fneg(half %x, half %y) {
; SI-LABEL: denormal_fmul_neg1_to_fneg:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_cvt_f16_f32_e64 v0, -v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_mul_f32_e32 v0, v0, v1
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: denormal_fmul_neg1_to_fneg:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_mul_f16_e64 v0, -v0, v1
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: denormal_fmul_neg1_to_fneg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_mul_f16_e64 v0, -v0, v1
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %mul = fmul nnan half %x, -1.0
  %add = fmul half %mul, %y
  ret half %add
}

; know the source can't be an snan
; The source of the multiply by -1.0 is %canonical, itself the result of
; fmul %x, %x. The checks fold the negation into one operand of that first
; multiply (-v0) and emit no multiply by -1.0.
define half @denorm_snan_fmul_neg1_to_fneg(half %x, half %y) {
; SI-LABEL: denorm_snan_fmul_neg1_to_fneg:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v0
; SI-NEXT:    v_cvt_f32_f16_e64 v0, -v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_mul_f32_e32 v0, v2, v0
; SI-NEXT:    v_mul_f32_e32 v0, v0, v1
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: denorm_snan_fmul_neg1_to_fneg:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_mul_f16_e64 v0, v0, -v0
; VI-NEXT:    v_mul_f16_e32 v0, v0, v1
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: denorm_snan_fmul_neg1_to_fneg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_mul_f16_e64 v0, v0, -v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v1
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %canonical = fmul half %x, %x
  %mul = fmul half %canonical, -1.0
  %add = fmul half %mul, %y
  ret half %add
}

; The source of the multiply by -1.0 is the result of llvm.canonicalize.f16.
; The VI and GFX11 checks emit v_max_f16 with both sources negated followed
; by the remaining multiply; the SI checks negate in the f32->f16 convert.
define half @flush_snan_fmul_neg1_to_fneg(half %x, half %y) #0 {
; SI-LABEL: flush_snan_fmul_neg1_to_fneg:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_cvt_f16_f32_e64 v0, -v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_mul_f32_e32 v0, v0, v1
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: flush_snan_fmul_neg1_to_fneg:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_max_f16_e64 v0, -v0, -v0
; VI-NEXT:    v_mul_f16_e32 v0, v0, v1
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flush_snan_fmul_neg1_to_fneg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_max_f16_e64 v0, -v0, -v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v1
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %quiet = call half @llvm.canonicalize.f16(half %x)
  %mul = fmul half %quiet, -1.0
  %add = fmul half %mul, %y
  ret half %add
}

; Both arms of the select are fneg results and the select feeds an fadd. The
; checks select between the un-negated inputs and then emit a single subtract
; instead of negating each arm.
define half @fadd_select_fneg_fneg_f16(i32 %arg0, half %x, half %y, half %z) {
; SI-LABEL: fadd_select_fneg_fneg_f16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; SI-NEXT:    v_sub_f32_e32 v0, v3, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: fadd_select_fneg_fneg_f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT:    v_sub_f16_e32 v0, v3, v0
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fadd_select_fneg_fneg_f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_sub_f16_e32 v0, v3, v0
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %cmp = icmp eq i32 %arg0, 0
  %neg.x = fneg half %x
  %neg.y = fneg half %y
  %select = select i1 %cmp, half %neg.x, half %neg.y
  %add = fadd half %select, %z
  ret half %add
}

; FIXME: Terrible code for SI
; <2 x half> version of the select-of-fnegs pattern. The GFX11 checks use one
; v_pk_add_f16 with neg_lo/neg_hi set on the selected operand; the VI checks
; use two subtracts (one SDWA) plus an or; the SI checks pack, select, unpack
; and subtract in f32.
define <2 x half> @fadd_select_fneg_fneg_v2f16(i32 %arg0, <2 x half> %x, <2 x half> %y, <2 x half> %z) {
; SI-LABEL: fadd_select_fneg_fneg_v2f16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT:    v_or_b32_e32 v1, v1, v2
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v4
; SI-NEXT:    v_cvt_f16_f32_e32 v4, v6
; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT:    v_or_b32_e32 v2, v3, v2
; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v4
; SI-NEXT:    v_cvt_f32_f16_e32 v4, v5
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_sub_f32_e32 v0, v4, v0
; SI-NEXT:    v_sub_f32_e32 v1, v3, v1
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: fadd_select_fneg_fneg_v2f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT:    v_sub_f16_sdwa v1, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_sub_f16_e32 v0, v3, v0
; VI-NEXT:    v_or_b32_e32 v0, v0, v1
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fadd_select_fneg_fneg_v2f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1]
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %cmp = icmp eq i32 %arg0, 0
  %neg.x = fneg <2 x half> %x
  %neg.y = fneg <2 x half> %y
  %select = select i1 %cmp, <2 x half> %neg.x, <2 x half> %neg.y
  %add = fadd <2 x half> %select, %z
  ret <2 x half> %add
}

; Intrinsic declarations used by the tests in this file.
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare half @llvm.sin.f16(half) #1
declare half @llvm.trunc.f16(half) #1
declare half @llvm.round.f16(half) #1
declare half @llvm.rint.f16(half) #1
declare half @llvm.nearbyint.f16(half) #1
declare half @llvm.roundeven.f16(half) #1
declare half @llvm.canonicalize.f16(half) #1
declare half @llvm.minnum.f16(half, half) #1
declare half @llvm.maxnum.f16(half, half) #1
declare half @llvm.fma.f16(half, half, half) #1
declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>)
declare half @llvm.fmuladd.f16(half, half, half) #1
declare <4 x half> @llvm.fmuladd.v4f16(<4 x half>, <4 x half>, <4 x half>) #1

; Attribute groups. #0 and #4 set only the f32 denormal mode
; ("denormal-fp-math-f32"); no group here sets an f16 denormal mode.
attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind "unsafe-fp-math"="true" }
attributes #3 = { nounwind "no-signed-zeros-fp-math"="true" }
attributes #4 = { nounwind "amdgpu-ieee"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }