; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+fma | FileCheck %s --check-prefix=X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+fma | FileCheck %s --check-prefix=X64

define float @f1(float %a, float %b, float %c) {
; X86-LABEL: f1:
; X86:       # %bb.0:
; X86-NEXT:    pushl %eax
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vfmadd213ss {{.*#+}} xmm1 = (xmm0 * xmm1) + mem
; X86-NEXT:    vmovss %xmm1, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    .cfi_def_cfa_offset 4
; X86-NEXT:    retl
;
; X64-LABEL: f1:
; X64:       # %bb.0:
; X64-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; X64-NEXT:    retq
  %mul = fmul fast float %b, %a
  %add = fadd fast float %mul, %c
  ret float %add
}

define float @f2(float %a, float %b, float %c) {
; X86-LABEL: f2:
; X86:       # %bb.0:
; X86-NEXT:    pushl %eax
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vmulss {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    #ARITH_FENCE
; X86-NEXT:    vaddss {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    .cfi_def_cfa_offset 4
; X86-NEXT:    retl
;
; X64-LABEL: f2:
; X64:       # %bb.0:
; X64-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; X64-NEXT:    #ARITH_FENCE
; X64-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; X64-NEXT:    retq
  %mul = fmul fast float %b, %a
  %tmp = call float @llvm.arithmetic.fence.f32(float %mul)
  %add = fadd fast float %tmp, %c
  ret float %add
}

define double @f3(double %a) {
; X86-LABEL: f3:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %ebp, -8
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    .cfi_def_cfa_register %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT:    vmulsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-NEXT:    vmovsd %xmm0, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    .cfi_def_cfa %esp, 4
; X86-NEXT:    retl
;
; X64-LABEL: f3:
; X64:       # %bb.0:
; X64-NEXT:    vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = fadd fast double %a, %a
  %2 = fadd fast double %a, %a
  %3 = fadd fast double %1, %2
  ret double %3
}

define double @f4(double %a) {
; X86-LABEL: f4:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %ebp, -8
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    .cfi_def_cfa_register %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT:    vaddsd %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovapd %xmm0, %xmm1
; X86-NEXT:    #ARITH_FENCE
; X86-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
; X86-NEXT:    vmovsd %xmm0, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    .cfi_def_cfa %esp, 4
; X86-NEXT:    retl
;
; X64-LABEL: f4:
; X64:       # %bb.0:
; X64-NEXT:    vaddsd %xmm0, %xmm0, %xmm0
; X64-NEXT:    vmovapd %xmm0, %xmm1
; X64-NEXT:    #ARITH_FENCE
; X64-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
; X64-NEXT:    retq
  %1 = fadd fast double %a, %a
  %t = call double @llvm.arithmetic.fence.f64(double %1)
  %2 = fadd fast double %a, %a
  %3 = fadd fast double %t, %2
  ret double %3
}
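
; Note: f5 and f6 below repeat the f3/f4 pattern with <2 x float>: with fast
; math and no fence, the two vector adds fold into a single vmulps by a
; constant-pool operand, while @llvm.arithmetic.fence.v2f32 keeps the two
; vector adds separate, marked by #ARITH_FENCE.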

define <2 x float> @f5(<2 x float> %a) {
; X86-LABEL: f5:
; X86:       # %bb.0:
; X86-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: f5:
; X64:       # %bb.0:
; X64-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = fadd fast <2 x float> %a, %a
  %2 = fadd fast <2 x float> %a, %a
  %3 = fadd fast <2 x float> %1, %2
  ret <2 x float> %3
}

define <2 x float> @f6(<2 x float> %a) {
; X86-LABEL: f6:
; X86:       # %bb.0:
; X86-NEXT:    vaddps %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovaps %xmm0, %xmm1
; X86-NEXT:    #ARITH_FENCE
; X86-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: f6:
; X64:       # %bb.0:
; X64-NEXT:    vaddps %xmm0, %xmm0, %xmm0
; X64-NEXT:    vmovaps %xmm0, %xmm1
; X64-NEXT:    #ARITH_FENCE
; X64-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; X64-NEXT:    retq
  %1 = fadd fast <2 x float> %a, %a
  %t = call <2 x float> @llvm.arithmetic.fence.v2f32(<2 x float> %1)
  %2 = fadd fast <2 x float> %a, %a
  %3 = fadd fast <2 x float> %t, %2
  ret <2 x float> %3
}

; The @foo IR test below can be generated from the following C test:
;
; typedef __float128 TYPE;
; TYPE foo(TYPE *qr) {
;   TYPE re = __arithmetic_fence(*qr);
;   return re;
; }
;
; with the following build command:
; clang -cc1 -triple i386-pc-linux-gnu -mreassociate t.c -emit-llvm -O2

define dso_local fp128 @foo(ptr nocapture readonly %qr) local_unnamed_addr {
; X86-LABEL: foo:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %edi
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    pushl %esi
; X86-NEXT:    .cfi_def_cfa_offset 12
; X86-NEXT:    .cfi_offset %esi, -12
; X86-NEXT:    .cfi_offset %edi, -8
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl 12(%ecx), %edx
; X86-NEXT:    movl 8(%ecx), %esi
; X86-NEXT:    movl (%ecx), %edi
; X86-NEXT:    movl 4(%ecx), %ecx
; X86-NEXT:    #ARITH_FENCE
; X86-NEXT:    #ARITH_FENCE
; X86-NEXT:    #ARITH_FENCE
; X86-NEXT:    #ARITH_FENCE
; X86-NEXT:    movl %edx, 12(%eax)
; X86-NEXT:    movl %esi, 8(%eax)
; X86-NEXT:    movl %ecx, 4(%eax)
; X86-NEXT:    movl %edi, (%eax)
; X86-NEXT:    popl %esi
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    popl %edi
; X86-NEXT:    .cfi_def_cfa_offset 4
; X86-NEXT:    retl $4
;
; X64-LABEL: foo:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vmovaps (%rdi), %xmm0
; X64-NEXT:    #ARITH_FENCE
; X64-NEXT:    retq
entry:
  %0 = load fp128, ptr %qr, align 16
  %1 = tail call reassoc fp128 @llvm.arithmetic.fence.f128(fp128 %0)
  ret fp128 %1
}

declare fp128 @llvm.arithmetic.fence.f128(fp128)
declare float @llvm.arithmetic.fence.f32(float)
declare double @llvm.arithmetic.fence.f64(double)
declare <2 x float> @llvm.arithmetic.fence.v2f32(<2 x float>)