; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64

define double @f1(double %a) {
; X86-LABEL: f1:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %ebp, -8
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    .cfi_def_cfa_register %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT:    mulsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-NEXT:    movsd %xmm0, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    .cfi_def_cfa %esp, 4
; X86-NEXT:    retl
;
; X64-LABEL: f1:
; X64:       # %bb.0:
; X64-NEXT:    mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-NEXT:    retq
  %1 = fadd fast double %a, %a
  %2 = fadd fast double %a, %a
  %3 = fadd fast double %1, %2
  ret double %3
}

define double @f2(double %a) {
; X86-LABEL: f2:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %ebp, -8
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    .cfi_def_cfa_register %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT:    addsd %xmm0, %xmm0
; X86-NEXT:    movapd %xmm0, %xmm1
; X86-NEXT:    #ARITH_FENCE
; X86-NEXT:    addsd %xmm0, %xmm1
; X86-NEXT:    movsd %xmm1, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    .cfi_def_cfa %esp, 4
; X86-NEXT:    retl
;
; X64-LABEL: f2:
; X64:       # %bb.0:
; X64-NEXT:    addsd %xmm0, %xmm0
; X64-NEXT:    movapd %xmm0, %xmm1
; X64-NEXT:    #ARITH_FENCE
; X64-NEXT:    addsd %xmm1, %xmm0
; X64-NEXT:    retq
  %1 = fadd fast double %a, %a
  %t = call double @llvm.arithmetic.fence.f64(double %1)
  %2 = fadd fast double %a, %a
  %3 = fadd fast double %t, %2
  ret double %3
}

define <2 x float> @f3(<2 x float> %a) {
; X86-LABEL: f3:
; X86:       # %bb.0:
; X86-NEXT:    mulps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: f3:
; X64:       # %bb.0:
; X64-NEXT:    mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-NEXT:    retq
  %1 = fadd fast <2 x float> %a, %a
  %2 = fadd fast <2 x float> %a, %a
  %3 = fadd fast <2 x float> %1, %2
  ret <2 x float> %3
}

define <2 x float> @f4(<2 x float> %a) {
; X86-LABEL: f4:
; X86:       # %bb.0:
; X86-NEXT:    addps %xmm0, %xmm0
; X86-NEXT:    movaps %xmm0, %xmm1
; X86-NEXT:    #ARITH_FENCE
; X86-NEXT:    addps %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: f4:
; X64:       # %bb.0:
; X64-NEXT:    addps %xmm0, %xmm0
; X64-NEXT:    movaps %xmm0, %xmm1
; X64-NEXT:    #ARITH_FENCE
; X64-NEXT:    addps %xmm1, %xmm0
; X64-NEXT:    retq
  %1 = fadd fast <2 x float> %a, %a
  %t = call <2 x float> @llvm.arithmetic.fence.v2f32(<2 x float> %1)
  %2 = fadd fast <2 x float> %a, %a
  %3 = fadd fast <2 x float> %t, %2
  ret <2 x float> %3
}

define <8 x float> @f5(<8 x float> %a) {
; X86-LABEL: f5:
; X86:       # %bb.0:
; X86-NEXT:    movaps {{.*#+}} xmm2 = [4.0E+0,4.0E+0,4.0E+0,4.0E+0]
; X86-NEXT:    mulps %xmm2, %xmm0
; X86-NEXT:    mulps %xmm2, %xmm1
; X86-NEXT:    retl
;
; X64-LABEL: f5:
; X64:       # %bb.0:
; X64-NEXT:    movaps {{.*#+}} xmm2 = [4.0E+0,4.0E+0,4.0E+0,4.0E+0]
; X64-NEXT:    mulps %xmm2, %xmm0
; X64-NEXT:    mulps %xmm2, %xmm1
; X64-NEXT:    retq
  %1 = fadd fast <8 x float> %a, %a
  %2 = fadd fast <8 x float> %a, %a
  %3 = fadd fast <8 x float> %1, %2
  ret <8 x float> %3
}

define <8 x float> @f6(<8 x float> %a) {
; X86-LABEL: f6:
; X86:       # %bb.0:
; X86-NEXT:    addps %xmm0, %xmm0
; X86-NEXT:    addps %xmm1, %xmm1
; X86-NEXT:    movaps %xmm1, %xmm2
; X86-NEXT:    #ARITH_FENCE
; X86-NEXT:    movaps %xmm0, %xmm3
; X86-NEXT:    #ARITH_FENCE
; X86-NEXT:    addps %xmm3, %xmm0
; X86-NEXT:    addps %xmm2, %xmm1
; X86-NEXT:    retl
;
; X64-LABEL: f6:
; X64:       # %bb.0:
; X64-NEXT:    addps %xmm0, %xmm0
; X64-NEXT:    addps %xmm1, %xmm1
; X64-NEXT:    movaps %xmm1, %xmm2
; X64-NEXT:    #ARITH_FENCE
; X64-NEXT:    movaps %xmm0, %xmm3
; X64-NEXT:    #ARITH_FENCE
; X64-NEXT:    addps %xmm3, %xmm0
; X64-NEXT:    addps %xmm2, %xmm1
; X64-NEXT:    retq
  %1 = fadd fast <8 x float> %a, %a
  %t = call <8 x float> @llvm.arithmetic.fence.v8f32(<8 x float> %1)
  %2 = fadd fast <8 x float> %a, %a
  %3 = fadd fast <8 x float> %t, %2
  ret <8 x float> %3
}

define half @f7(half %a) nounwind {
; X86-LABEL: f7:
; X86:       # %bb.0:
; X86-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
; X86-NEXT:    #ARITH_FENCE
; X86-NEXT:    retl
;
; X64-LABEL: f7:
; X64:       # %bb.0:
; X64-NEXT:    #ARITH_FENCE
; X64-NEXT:    retq
  %b = call half @llvm.arithmetic.fence.f16(half %a)
  ret half %b
}

define bfloat @f8(bfloat %a) nounwind {
; X86-LABEL: f8:
; X86:       # %bb.0:
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    #ARITH_FENCE
; X86-NEXT:    pinsrw $0, %eax, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: f8:
; X64:       # %bb.0:
; X64-NEXT:    pextrw $0, %xmm0, %eax
; X64-NEXT:    #ARITH_FENCE
; X64-NEXT:    pinsrw $0, %eax, %xmm0
; X64-NEXT:    retq
  %b = call bfloat @llvm.arithmetic.fence.bf16(bfloat %a)
  ret bfloat %b
}

define <2 x half> @f9(<2 x half> %a) nounwind {
; X86-LABEL: f9:
; X86:       # %bb.0:
; X86-NEXT:    movdqa %xmm0, %xmm1
; X86-NEXT:    psrld $16, %xmm1
; X86-NEXT:    #ARITH_FENCE
; X86-NEXT:    #ARITH_FENCE
; X86-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-NEXT:    retl
;
; X64-LABEL: f9:
; X64:       # %bb.0:
; X64-NEXT:    movdqa %xmm0, %xmm1
; X64-NEXT:    psrld $16, %xmm1
; X64-NEXT:    #ARITH_FENCE
; X64-NEXT:    #ARITH_FENCE
; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-NEXT:    retq
  %b = call <2 x half> @llvm.arithmetic.fence.v2f16(<2 x half> %a)
  ret <2 x half> %b
}

define <3 x bfloat> @f10(<3 x bfloat> %a) nounwind {
; X86-LABEL: f10:
; X86:       # %bb.0:
; X86-NEXT:    pextrw $0, %xmm0, %eax
; X86-NEXT:    movdqa %xmm0, %xmm1
; X86-NEXT:    psrld $16, %xmm1
; X86-NEXT:    pextrw $0, %xmm1, %ecx
; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; X86-NEXT:    pextrw $0, %xmm0, %edx
; X86-NEXT:    #ARITH_FENCE
; X86-NEXT:    #ARITH_FENCE
; X86-NEXT:    #ARITH_FENCE
; X86-NEXT:    pinsrw $0, %eax, %xmm0
; X86-NEXT:    pinsrw $0, %ecx, %xmm1
; X86-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-NEXT:    pinsrw $0, %edx, %xmm1
; X86-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-NEXT:    retl
;
; X64-LABEL: f10:
; X64:       # %bb.0:
; X64-NEXT:    pextrw $0, %xmm0, %eax
; X64-NEXT:    movdqa %xmm0, %xmm1
; X64-NEXT:    psrld $16, %xmm1
; X64-NEXT:    pextrw $0, %xmm1, %ecx
; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; X64-NEXT:    pextrw $0, %xmm0, %edx
; X64-NEXT:    #ARITH_FENCE
; X64-NEXT:    #ARITH_FENCE
; X64-NEXT:    #ARITH_FENCE
; X64-NEXT:    pinsrw $0, %eax, %xmm0
; X64-NEXT:    pinsrw $0, %ecx, %xmm1
; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-NEXT:    pinsrw $0, %edx, %xmm1
; X64-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-NEXT:    retq
  %b = call <3 x bfloat> @llvm.arithmetic.fence.v3bf16(<3 x bfloat> %a)
  ret <3 x bfloat> %b
}

define <4 x bfloat> @f11(<4 x bfloat> %a) nounwind {
; X86-LABEL: f11:
; X86:       # %bb.0:
; X86-NEXT:    pushl %esi
; X86-NEXT:    movdqa %xmm0, %xmm1
; X86-NEXT:    psrlq $48, %xmm1
; X86-NEXT:    pextrw $0, %xmm1, %eax
; X86-NEXT:    movdqa %xmm0, %xmm1
; X86-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; X86-NEXT:    pextrw $0, %xmm1, %edx
; X86-NEXT:    pextrw $0, %xmm0, %ecx
; X86-NEXT:    psrld $16, %xmm0
; X86-NEXT:    pextrw $0, %xmm0, %esi
; X86-NEXT:    #ARITH_FENCE
; X86-NEXT:    #ARITH_FENCE
; X86-NEXT:    #ARITH_FENCE
; X86-NEXT:    #ARITH_FENCE
; X86-NEXT:    pinsrw $0, %eax, %xmm0
; X86-NEXT:    pinsrw $0, %edx, %xmm1
; X86-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X86-NEXT:    pinsrw $0, %ecx, %xmm0
; X86-NEXT:    pinsrw $0, %esi, %xmm2
; X86-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X86-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-NEXT:    popl %esi
; X86-NEXT:    retl
;
; X64-LABEL: f11:
; X64:       # %bb.0:
; X64-NEXT:    movdqa %xmm0, %xmm1
; X64-NEXT:    psrlq $48, %xmm1
; X64-NEXT:    pextrw $0, %xmm1, %eax
; X64-NEXT:    movdqa %xmm0, %xmm1
; X64-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; X64-NEXT:    pextrw $0, %xmm1, %ecx
; X64-NEXT:    pextrw $0, %xmm0, %edx
; X64-NEXT:    psrld $16, %xmm0
; X64-NEXT:    pextrw $0, %xmm0, %esi
; X64-NEXT:    #ARITH_FENCE
; X64-NEXT:    #ARITH_FENCE
; X64-NEXT:    #ARITH_FENCE
; X64-NEXT:    #ARITH_FENCE
; X64-NEXT:    pinsrw $0, %eax, %xmm0
; X64-NEXT:    pinsrw $0, %ecx, %xmm1
; X64-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X64-NEXT:    pinsrw $0, %edx, %xmm0
; X64-NEXT:    pinsrw $0, %esi, %xmm2
; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X64-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-NEXT:    retq
  %b = call <4 x bfloat> @llvm.arithmetic.fence.v4bf16(<4 x bfloat> %a)
  ret <4 x bfloat> %b
}

declare half @llvm.arithmetic.fence.f16(half)
declare bfloat @llvm.arithmetic.fence.bf16(bfloat)
declare <2 x half> @llvm.arithmetic.fence.v2f16(<2 x half>)
declare <3 x bfloat> @llvm.arithmetic.fence.v3bf16(<3 x bfloat>)
declare <4 x bfloat> @llvm.arithmetic.fence.v4bf16(<4 x bfloat>)
declare float @llvm.arithmetic.fence.f32(float)
declare double @llvm.arithmetic.fence.f64(double)
declare <2 x float> @llvm.arithmetic.fence.v2f32(<2 x float>)
declare <8 x float> @llvm.arithmetic.fence.v8f32(<8 x float>)
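; The "#ARITH_FENCE" lines in the checks above are assembler comments emitted
; for the ARITH_FENCE pseudo instruction, which expands to no machine code and
; acts purely as an optimization barrier. The paired functions illustrate the
; effect: without a fence (f1, f3, f5) the three fast-math fadds reassociate
; into a single multiply by 4.0, while with a fence (f2, f4, f6) the doubled
; value a+a may still be CSE'd on each side, but the fenced result is added
; across the fence rather than folded into one multiply.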