; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX9-DENORM %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX10-DENORM %s

; Exercises fsub combines in which one operand is an fneg/fpext (in either
; order) of a half-precision fmul, compiled with GlobalISel and preserve-sign
; denormal handling for gfx900 and gfx1010.
;
; The scalar tests expect the whole expression to collapse into a single
; v_mad_mix_f32 (gfx900) or v_fma_mix_f32 (gfx1010). The <4 x half> tests
; record the code currently emitted for the vector form of each pattern.

; fold (fsub (fpext (fneg (fmul x, y))), z) -> (fneg (fma (fpext x), (fpext y), z))
define amdgpu_vs float @test_f16_to_f32_sub_ext_neg_mul(half %x, half %y, float %z) {
; GFX9-DENORM-LABEL: test_f16_to_f32_sub_ext_neg_mul:
; GFX9-DENORM:       ; %bb.0: ; %entry
; GFX9-DENORM-NEXT:    v_mad_mix_f32 v0, v0, -v1, -v2 op_sel_hi:[1,1,0]
; GFX9-DENORM-NEXT:    ; return to shader part epilog
;
; GFX10-DENORM-LABEL: test_f16_to_f32_sub_ext_neg_mul:
; GFX10-DENORM:       ; %bb.0: ; %entry
; GFX10-DENORM-NEXT:    v_fma_mix_f32 v0, v0, -v1, -v2 op_sel_hi:[1,1,0]
; GFX10-DENORM-NEXT:    ; return to shader part epilog
entry:
  %a = fmul fast half %x, %y
  %b = fneg half %a
  %c = fpext half %b to float
  %d = fsub fast float %c, %z
  ret float %d
}

; fold (fsub (fneg (fpext (fmul x, y))), z) -> (fneg (fma (fpext x), (fpext y), z))
define amdgpu_vs float @test_f16_to_f32_sub_neg_ext_mul(half %x, half %y, float %z) {
; GFX9-DENORM-LABEL: test_f16_to_f32_sub_neg_ext_mul:
; GFX9-DENORM:       ; %bb.0: ; %entry
; GFX9-DENORM-NEXT:    v_mad_mix_f32 v0, v0, -v1, -v2 op_sel_hi:[1,1,0]
; GFX9-DENORM-NEXT:    ; return to shader part epilog
;
; GFX10-DENORM-LABEL: test_f16_to_f32_sub_neg_ext_mul:
; GFX10-DENORM:       ; %bb.0: ; %entry
; GFX10-DENORM-NEXT:    v_fma_mix_f32 v0, v0, -v1, -v2 op_sel_hi:[1,1,0]
; GFX10-DENORM-NEXT:    ; return to shader part epilog
entry:
  %a = fmul fast half %x, %y
  %b = fpext half %a to float
  %c = fneg float %b
  %d = fsub fast float %c, %z
  ret float %d
}


; fold (fsub x, (fpext (fneg (fmul y, z)))) -> (fma (fpext y), (fpext z), x)
define amdgpu_vs float @test_f16_to_f32_sub_ext_neg_mul2(float %x, half %y, half %z) {
; GFX9-DENORM-LABEL: test_f16_to_f32_sub_ext_neg_mul2:
; GFX9-DENORM:       ; %bb.0: ; %entry
; GFX9-DENORM-NEXT:    v_mad_mix_f32 v0, -v1, -v2, v0 op_sel_hi:[1,1,0]
; GFX9-DENORM-NEXT:    ; return to shader part epilog
;
; GFX10-DENORM-LABEL: test_f16_to_f32_sub_ext_neg_mul2:
; GFX10-DENORM:       ; %bb.0: ; %entry
; GFX10-DENORM-NEXT:    v_fma_mix_f32 v0, -v1, -v2, v0 op_sel_hi:[1,1,0]
; GFX10-DENORM-NEXT:    ; return to shader part epilog
entry:
  %a = fmul fast half %y, %z
  %b = fneg half %a
  %c = fpext half %b to float
  %d = fsub fast float %x, %c
  ret float %d
}

; fold (fsub x, (fneg (fpext (fmul y, z)))) -> (fma (fpext y), (fpext z), x)
define amdgpu_vs float @test_f16_to_f32_sub_neg_ext_mul2(float %x, half %y, half %z) {
; GFX9-DENORM-LABEL: test_f16_to_f32_sub_neg_ext_mul2:
; GFX9-DENORM:       ; %bb.0: ; %entry
; GFX9-DENORM-NEXT:    v_mad_mix_f32 v0, -v1, -v2, v0 op_sel_hi:[1,1,0]
; GFX9-DENORM-NEXT:    ; return to shader part epilog
;
; GFX10-DENORM-LABEL: test_f16_to_f32_sub_neg_ext_mul2:
; GFX10-DENORM:       ; %bb.0: ; %entry
; GFX10-DENORM-NEXT:    v_fma_mix_f32 v0, -v1, -v2, v0 op_sel_hi:[1,1,0]
; GFX10-DENORM-NEXT:    ; return to shader part epilog
entry:
  %a = fmul fast half %y, %z
  %b = fpext half %a to float
  %c = fneg float %b
  %d = fsub fast float %x, %c
  ret float %d
}

; fold (fsub (fpext (fneg (fmul x, y))), z) -> (fneg (fma (fpext x), (fpext y), z))
define amdgpu_vs <4 x float> @test_v4f16_to_v4f32_sub_ext_neg_mul(<4 x half> %x, <4 x half> %y, <4 x float> %z) {
; GFX9-DENORM-LABEL: test_v4f16_to_v4f32_sub_ext_neg_mul:
; GFX9-DENORM:       ; %bb.0: ; %entry
; GFX9-DENORM-NEXT:    v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
; GFX9-DENORM-NEXT:    v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1]
; GFX9-DENORM-NEXT:    v_cvt_f32_f16_e32 v2, v0
; GFX9-DENORM-NEXT:    v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-DENORM-NEXT:    v_cvt_f32_f16_e32 v8, v1
; GFX9-DENORM-NEXT:    v_cvt_f32_f16_sdwa v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-DENORM-NEXT:    v_sub_f32_e32 v0, v2, v4
; GFX9-DENORM-NEXT:    v_sub_f32_e32 v1, v3, v5
; GFX9-DENORM-NEXT:    v_sub_f32_e32 v2, v8, v6
; GFX9-DENORM-NEXT:    v_sub_f32_e32 v3, v9, v7
; GFX9-DENORM-NEXT:    ; return to shader part epilog
;
; GFX10-DENORM-LABEL: test_v4f16_to_v4f32_sub_ext_neg_mul:
; GFX10-DENORM:       ; %bb.0: ; %entry
; GFX10-DENORM-NEXT:    v_xor_b32_e32 v8, 0x80008000, v2
; GFX10-DENORM-NEXT:    v_xor_b32_e32 v9, 0x80008000, v3
; GFX10-DENORM-NEXT:    v_fma_mix_f32 v5, v0, -v2, -v5 op_sel:[1,1,0] op_sel_hi:[1,1,0]
; GFX10-DENORM-NEXT:    v_fma_mix_f32 v3, v1, -v3, -v7 op_sel:[1,1,0] op_sel_hi:[1,1,0]
; GFX10-DENORM-NEXT:    v_fma_mix_f32 v0, v0, v8, -v4 op_sel_hi:[1,1,0]
; GFX10-DENORM-NEXT:    v_fma_mix_f32 v2, v1, v9, -v6 op_sel_hi:[1,1,0]
; GFX10-DENORM-NEXT:    v_mov_b32_e32 v1, v5
; GFX10-DENORM-NEXT:    ; return to shader part epilog
entry:
  %a = fmul fast <4 x half> %x, %y
  %b = fneg <4 x half> %a
  %c = fpext <4 x half> %b to <4 x float>
  %d = fsub fast <4 x float> %c, %z
  ret <4 x float> %d
}

; fold (fsub (fneg (fpext (fmul x, y))), z) -> (fneg (fma (fpext x), (fpext y), z))
define amdgpu_vs <4 x float> @test_v4f16_to_v4f32_sub_neg_ext_mul(<4 x half> %x, <4 x half> %y, <4 x float> %z) {
; GFX9-DENORM-LABEL: test_v4f16_to_v4f32_sub_neg_ext_mul:
; GFX9-DENORM:       ; %bb.0: ; %entry
; GFX9-DENORM-NEXT:    v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
; GFX9-DENORM-NEXT:    v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1]
; GFX9-DENORM-NEXT:    v_cvt_f32_f16_e32 v2, v0
; GFX9-DENORM-NEXT:    v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-DENORM-NEXT:    v_cvt_f32_f16_e32 v8, v1
; GFX9-DENORM-NEXT:    v_cvt_f32_f16_sdwa v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-DENORM-NEXT:    v_sub_f32_e32 v0, v2, v4
; GFX9-DENORM-NEXT:    v_sub_f32_e32 v1, v3, v5
; GFX9-DENORM-NEXT:    v_sub_f32_e32 v2, v8, v6
; GFX9-DENORM-NEXT:    v_sub_f32_e32 v3, v9, v7
; GFX9-DENORM-NEXT:    ; return to shader part epilog
;
; GFX10-DENORM-LABEL: test_v4f16_to_v4f32_sub_neg_ext_mul:
; GFX10-DENORM:       ; %bb.0: ; %entry
; GFX10-DENORM-NEXT:    v_xor_b32_e32 v8, 0x80008000, v2
; GFX10-DENORM-NEXT:    v_xor_b32_e32 v9, 0x80008000, v3
; GFX10-DENORM-NEXT:    v_fma_mix_f32 v5, v0, -v2, -v5 op_sel:[1,1,0] op_sel_hi:[1,1,0]
; GFX10-DENORM-NEXT:    v_fma_mix_f32 v3, v1, -v3, -v7 op_sel:[1,1,0] op_sel_hi:[1,1,0]
; GFX10-DENORM-NEXT:    v_fma_mix_f32 v0, v0, v8, -v4 op_sel_hi:[1,1,0]
; GFX10-DENORM-NEXT:    v_fma_mix_f32 v2, v1, v9, -v6 op_sel_hi:[1,1,0]
; GFX10-DENORM-NEXT:    v_mov_b32_e32 v1, v5
; GFX10-DENORM-NEXT:    ; return to shader part epilog
entry:
  %a = fmul fast <4 x half> %x, %y
  %b = fpext <4 x half> %a to <4 x float>
  %c = fneg <4 x float> %b
  %d = fsub fast <4 x float> %c, %z
  ret <4 x float> %d
}


; fold (fsub x, (fpext (fneg (fmul y, z)))) -> (fma (fpext y), (fpext z), x)
define amdgpu_vs <4 x float> @test_v4f16_to_v4f32_sub_ext_neg_mul2(<4 x float> %x, <4 x half> %y, <4 x half> %z) {
; GFX9-DENORM-LABEL: test_v4f16_to_v4f32_sub_ext_neg_mul2:
; GFX9-DENORM:       ; %bb.0: ; %entry
; GFX9-DENORM-NEXT:    v_pk_mul_f16 v4, v4, v6 neg_lo:[0,1] neg_hi:[0,1]
; GFX9-DENORM-NEXT:    v_pk_mul_f16 v5, v5, v7 neg_lo:[0,1] neg_hi:[0,1]
; GFX9-DENORM-NEXT:    v_cvt_f32_f16_e32 v6, v4
; GFX9-DENORM-NEXT:    v_cvt_f32_f16_sdwa v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-DENORM-NEXT:    v_cvt_f32_f16_e32 v7, v5
; GFX9-DENORM-NEXT:    v_cvt_f32_f16_sdwa v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-DENORM-NEXT:    v_sub_f32_e32 v0, v0, v6
; GFX9-DENORM-NEXT:    v_sub_f32_e32 v1, v1, v4
; GFX9-DENORM-NEXT:    v_sub_f32_e32 v2, v2, v7
; GFX9-DENORM-NEXT:    v_sub_f32_e32 v3, v3, v5
; GFX9-DENORM-NEXT:    ; return to shader part epilog
;
; GFX10-DENORM-LABEL: test_v4f16_to_v4f32_sub_ext_neg_mul2:
; GFX10-DENORM:       ; %bb.0: ; %entry
; GFX10-DENORM-NEXT:    v_xor_b32_e32 v8, 0x80008000, v6
; GFX10-DENORM-NEXT:    v_xor_b32_e32 v9, 0x80008000, v7
; GFX10-DENORM-NEXT:    v_fma_mix_f32 v1, -v4, -v6, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
; GFX10-DENORM-NEXT:    v_fma_mix_f32 v3, -v5, -v7, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0]
; GFX10-DENORM-NEXT:    v_fma_mix_f32 v0, -v4, v8, v0 op_sel_hi:[1,1,0]
; GFX10-DENORM-NEXT:    v_fma_mix_f32 v2, -v5, v9, v2 op_sel_hi:[1,1,0]
; GFX10-DENORM-NEXT:    ; return to shader part epilog
entry:
  %a = fmul fast <4 x half> %y, %z
  %b = fneg <4 x half> %a
  %c = fpext <4 x half> %b to <4 x float>
  %d = fsub fast <4 x float> %x, %c
  ret <4 x float> %d
}

; fold (fsub x, (fneg (fpext (fmul y, z)))) -> (fma (fpext y), (fpext z), x)
define amdgpu_vs <4 x float> @test_v4f16_to_v4f32_sub_neg_ext_mul2(<4 x float> %x, <4 x half> %y, <4 x half> %z) {
; GFX9-DENORM-LABEL: test_v4f16_to_v4f32_sub_neg_ext_mul2:
; GFX9-DENORM:       ; %bb.0: ; %entry
; GFX9-DENORM-NEXT:    v_pk_mul_f16 v4, v4, v6 neg_lo:[0,1] neg_hi:[0,1]
; GFX9-DENORM-NEXT:    v_pk_mul_f16 v5, v5, v7 neg_lo:[0,1] neg_hi:[0,1]
; GFX9-DENORM-NEXT:    v_cvt_f32_f16_e32 v6, v4
; GFX9-DENORM-NEXT:    v_cvt_f32_f16_sdwa v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-DENORM-NEXT:    v_cvt_f32_f16_e32 v7, v5
; GFX9-DENORM-NEXT:    v_cvt_f32_f16_sdwa v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-DENORM-NEXT:    v_sub_f32_e32 v0, v0, v6
; GFX9-DENORM-NEXT:    v_sub_f32_e32 v1, v1, v4
; GFX9-DENORM-NEXT:    v_sub_f32_e32 v2, v2, v7
; GFX9-DENORM-NEXT:    v_sub_f32_e32 v3, v3, v5
; GFX9-DENORM-NEXT:    ; return to shader part epilog
;
; GFX10-DENORM-LABEL: test_v4f16_to_v4f32_sub_neg_ext_mul2:
; GFX10-DENORM:       ; %bb.0: ; %entry
; GFX10-DENORM-NEXT:    v_xor_b32_e32 v8, 0x80008000, v6
; GFX10-DENORM-NEXT:    v_xor_b32_e32 v9, 0x80008000, v7
; GFX10-DENORM-NEXT:    v_fma_mix_f32 v1, -v4, -v6, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
; GFX10-DENORM-NEXT:    v_fma_mix_f32 v3, -v5, -v7, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0]
; GFX10-DENORM-NEXT:    v_fma_mix_f32 v0, -v4, v8, v0 op_sel_hi:[1,1,0]
; GFX10-DENORM-NEXT:    v_fma_mix_f32 v2, -v5, v9, v2 op_sel_hi:[1,1,0]
; GFX10-DENORM-NEXT:    ; return to shader part epilog
entry:
  %a = fmul fast <4 x half> %y, %z
  %b = fpext <4 x half> %a to <4 x float>
  %c = fneg <4 x float> %b
  %d = fsub fast <4 x float> %x, %c
  ret <4 x float> %d
}