; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX

; Partial load dot product patterns based off PR51075

;
; dot3(ptr x, ptr y) - ((x[0]*y[0])+(x[1]*y[1])+(x[2]*y[2]))
;
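; A minimal C sketch of the pattern under test (hypothetical source, not
; quoted from PR51075); each dot3 test below is an IR variant of this
; expression, differing only in how the three lanes are loaded:
;
;   float dot3(const float *x, const float *y) {
;     return x[0]*y[0] + x[1]*y[1] + x[2]*y[2];
;   }
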
define float @dot3_float4(ptr dereferenceable(16) %a0, ptr dereferenceable(16) %a1) {
; SSE2-LABEL: dot3_float4:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movups (%rdi), %xmm0
; SSE2-NEXT:    movups (%rsi), %xmm1
; SSE2-NEXT:    mulps %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: dot3_float4:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movups (%rdi), %xmm0
; SSSE3-NEXT:    movups (%rsi), %xmm1
; SSSE3-NEXT:    mulps %xmm0, %xmm1
; SSSE3-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSSE3-NEXT:    addss %xmm1, %xmm0
; SSSE3-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSSE3-NEXT:    addss %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: dot3_float4:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movups (%rdi), %xmm0
; SSE41-NEXT:    movups (%rsi), %xmm1
; SSE41-NEXT:    mulps %xmm0, %xmm1
; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: dot3_float4:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups (%rdi), %xmm0
; AVX-NEXT:    vmulps (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %x0123 = load <4 x float>, ptr %a0, align 4
  %y0123 = load <4 x float>, ptr %a1, align 4
  %mul0123 = fmul <4 x float> %x0123, %y0123
  %mul0 = extractelement <4 x float> %mul0123, i32 0
  %mul1 = extractelement <4 x float> %mul0123, i32 1
  %mul2 = extractelement <4 x float> %mul0123, i32 2
  %dot01 = fadd float %mul0, %mul1
  %dot012 = fadd float %dot01, %mul2
  ret float %dot012
}

define float @dot3_float4_as_float3(ptr dereferenceable(16) %a0, ptr dereferenceable(16) %a1) {
; SSE2-LABEL: dot3_float4_as_float3:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movups (%rdi), %xmm0
; SSE2-NEXT:    movups (%rsi), %xmm1
; SSE2-NEXT:    mulps %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: dot3_float4_as_float3:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movups (%rdi), %xmm0
; SSSE3-NEXT:    movups (%rsi), %xmm1
; SSSE3-NEXT:    mulps %xmm0, %xmm1
; SSSE3-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSSE3-NEXT:    addss %xmm1, %xmm0
; SSSE3-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSSE3-NEXT:    addss %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: dot3_float4_as_float3:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movups (%rdi), %xmm0
; SSE41-NEXT:    movups (%rsi), %xmm1
; SSE41-NEXT:    mulps %xmm0, %xmm1
; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: dot3_float4_as_float3:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups (%rdi), %xmm0
; AVX-NEXT:    vmulps (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %x0123 = load <4 x float>, ptr %a0, align 4
  %y0123 = load <4 x float>, ptr %a1, align 4
  %x012 = shufflevector <4 x float> %x0123, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  %y012 = shufflevector <4 x float> %y0123, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  %mul012 = fmul <3 x float> %x012, %y012
  %mul0 = extractelement <3 x float> %mul012, i32 0
  %mul1 = extractelement <3 x float> %mul012, i32 1
  %mul2 = extractelement <3 x float> %mul012, i32 2
  %dot01 = fadd float %mul0, %mul1
  %dot012 = fadd float %dot01, %mul2
  ret float %dot012
}
define float @dot3_float3(ptr dereferenceable(16) %a0, ptr dereferenceable(16) %a1) {
; SSE2-LABEL: dot3_float3:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSE2-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
; SSE2-NEXT:    mulps %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: dot3_float3:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSSE3-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSSE3-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; SSSE3-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
; SSSE3-NEXT:    mulps %xmm0, %xmm1
; SSSE3-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSSE3-NEXT:    addss %xmm1, %xmm0
; SSSE3-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSSE3-NEXT:    addss %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: dot3_float3:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; SSE41-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
; SSE41-NEXT:    mulps %xmm0, %xmm1
; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: dot3_float3:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %x012 = load <3 x float>, ptr %a0, align 4
  %y012 = load <3 x float>, ptr %a1, align 4
  %mul012 = fmul <3 x float> %x012, %y012
  %mul0 = extractelement <3 x float> %mul012, i32 0
  %mul1 = extractelement <3 x float> %mul012, i32 1
  %mul2 = extractelement <3 x float> %mul012, i32 2
  %dot01 = fadd float %mul0, %mul1
  %dot012 = fadd float %dot01, %mul2
  ret float %dot012
}

define float @dot3_float2_float(ptr dereferenceable(16) %a0, ptr dereferenceable(16) %a1) {
; SSE2-LABEL: dot3_float2_float:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT:    mulps %xmm0, %xmm1
; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT:    mulss 8(%rsi), %xmm2
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    addss %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: dot3_float2_float:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSSE3-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; SSSE3-NEXT:    mulps %xmm0, %xmm1
; SSSE3-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSSE3-NEXT:    mulss 8(%rsi), %xmm2
; SSSE3-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSSE3-NEXT:    addss %xmm1, %xmm0
; SSSE3-NEXT:    addss %xmm2, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: dot3_float2_float:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE41-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; SSE41-NEXT:    mulps %xmm0, %xmm1
; SSE41-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE41-NEXT:    mulss 8(%rsi), %xmm2
; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    addss %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: dot3_float2_float:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT:    vmulss 8(%rsi), %xmm1, %xmm1
; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %x01 = load <2 x float>, ptr %a0, align 4
  %y01 = load <2 x float>, ptr %a1, align 4
  %ptrx2 = getelementptr inbounds float, ptr %a0, i64 2
  %ptry2 = getelementptr inbounds float, ptr %a1, i64 2
  %x2 = load float, ptr %ptrx2, align 4
  %y2 = load float, ptr %ptry2, align 4
  %mul01 = fmul <2 x float> %x01, %y01
  %mul2 = fmul float %x2, %y2
  %mul0 = extractelement <2 x float> %mul01, i32 0
  %mul1 = extractelement <2 x float> %mul01, i32 1
  %dot01 = fadd float %mul0, %mul1
  %dot012 = fadd float %dot01, %mul2
  ret float %dot012
}
define float @dot3_float_float2(ptr dereferenceable(16) %a0, ptr dereferenceable(16) %a1) {
; SSE2-LABEL: dot3_float_float2:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    mulps %xmm2, %xmm0
; SSE2-NEXT:    mulss (%rsi), %xmm1
; SSE2-NEXT:    addss %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: dot3_float_float2:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; SSSE3-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
; SSSE3-NEXT:    mulps %xmm1, %xmm2
; SSSE3-NEXT:    mulss (%rsi), %xmm0
; SSSE3-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSSE3-NEXT:    addss %xmm2, %xmm0
; SSSE3-NEXT:    addss %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: dot3_float_float2:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE41-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; SSE41-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
; SSE41-NEXT:    mulps %xmm1, %xmm2
; SSE41-NEXT:    mulss (%rsi), %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE41-NEXT:    addss %xmm2, %xmm0
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: dot3_float_float2:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vmulss (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %x0 = load float, ptr %a0, align 4
  %y0 = load float, ptr %a1, align 4
  %ptrx12 = getelementptr inbounds float, ptr %a0, i64 1
  %ptry12 = getelementptr inbounds float, ptr %a1, i64 1
  %x12 = load <2 x float>, ptr %ptrx12, align 4
  %y12 = load <2 x float>, ptr %ptry12, align 4
  %mul0 = fmul float %x0, %y0
  %mul12 = fmul <2 x float> %x12, %y12
  %mul1 = extractelement <2 x float> %mul12, i32 0
  %mul2 = extractelement <2 x float> %mul12, i32 1
  %dot01 = fadd float %mul0, %mul1
  %dot012 = fadd float %dot01, %mul2
  ret float %dot012
}

;
; dot2(ptr x, ptr y) - ((x[0]*y[0])+(x[1]*y[1]))
;
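; The matching C sketch for the two-element case (again hypothetical
; source, shown only to make the tested expression explicit):
;
;   float dot2(const float *x, const float *y) {
;     return x[0]*y[0] + x[1]*y[1];
;   }
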
define float @dot2_float4(ptr dereferenceable(16) %a0, ptr dereferenceable(16) %a1) {
; SSE2-LABEL: dot2_float4:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movups (%rdi), %xmm0
; SSE2-NEXT:    movups (%rsi), %xmm1
; SSE2-NEXT:    mulps %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: dot2_float4:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movups (%rdi), %xmm0
; SSSE3-NEXT:    movups (%rsi), %xmm1
; SSSE3-NEXT:    mulps %xmm0, %xmm1
; SSSE3-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSSE3-NEXT:    addss %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: dot2_float4:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movups (%rdi), %xmm0
; SSE41-NEXT:    movups (%rsi), %xmm1
; SSE41-NEXT:    mulps %xmm0, %xmm1
; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: dot2_float4:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups (%rdi), %xmm0
; AVX-NEXT:    vmulps (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %x0123 = load <4 x float>, ptr %a0, align 4
  %y0123 = load <4 x float>, ptr %a1, align 4
  %mul0123 = fmul <4 x float> %x0123, %y0123
  %mul0 = extractelement <4 x float> %mul0123, i32 0
  %mul1 = extractelement <4 x float> %mul0123, i32 1
  %dot01 = fadd float %mul0, %mul1
  ret float %dot01
}

define float @dot2_float2(ptr dereferenceable(16) %a0, ptr dereferenceable(16) %a1) {
; SSE2-LABEL: dot2_float2:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT:    mulps %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: dot2_float2:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSSE3-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; SSSE3-NEXT:    mulps %xmm0, %xmm1
; SSSE3-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSSE3-NEXT:    addss %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: dot2_float2:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE41-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; SSE41-NEXT:    mulps %xmm0, %xmm1
; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: dot2_float2:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %x01 = load <2 x float>, ptr %a0, align 4
  %y01 = load <2 x float>, ptr %a1, align 4
  %mul01 = fmul <2 x float> %x01, %y01
  %mul0 = extractelement <2 x float> %mul01, i32 0
  %mul1 = extractelement <2 x float> %mul01, i32 1
  %dot01 = fadd float %mul0, %mul1
  ret float %dot01
}