; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3
; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE41
; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE42
; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx | FileCheck %s --check-prefixes=AVX1
; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512
; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512
; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512

; Prints the cost-model estimate for llvm.vector.reduce.fadd over f64 and f32
; vectors of increasing width, once per feature set selected by the commands
; above. The first two functions issue the calls without fast-math flags; the
; last two repeat the same calls with the 'fast' flag.
; The check lines are generated output: regenerate them with the script named
; in the first line instead of editing the numbers by hand.

; f64 reductions (1 to 16 elements), calls without fast-math flags.
define void @reduce_f64(double %arg) {
; SSE2-LABEL: 'reduce_f64'
; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1 = call double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef)
; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef)
; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4 = call double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef)
; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8 = call double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef)
; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16 = call double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef)
; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SSSE3-LABEL: 'reduce_f64'
; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1 = call double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef)
; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef)
; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4 = call double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef)
; SSSE3-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8 = call double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef)
; SSSE3-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16 = call double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef)
; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SSE41-LABEL: 'reduce_f64'
; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1 = call double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef)
; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef)
; SSE41-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4 = call double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef)
; SSE41-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8 = call double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef)
; SSE41-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16 = call double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef)
; SSE41-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SSE42-LABEL: 'reduce_f64'
; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef)
; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef)
; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4 = call double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef)
; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8 = call double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef)
; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef)
; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; AVX1-LABEL: 'reduce_f64'
; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef)
; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef)
; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef)
; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef)
; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16 = call double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef)
; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; AVX2-LABEL: 'reduce_f64'
; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef)
; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef)
; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef)
; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef)
; AVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16 = call double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef)
; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; AVX512-LABEL: 'reduce_f64'
; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef)
; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef)
; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef)
; AVX512-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8 = call double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef)
; AVX512-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16 = call double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef)
; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
;
  %V1 = call double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef)
  %V2 = call double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef)
  %V4 = call double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef)
  %V8 = call double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef)
  %V16 = call double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef)
  ret void
}

; f32 reductions (1 to 32 elements), calls without fast-math flags.
define void @reduce_f32(float %arg) {
; SSE2-LABEL: 'reduce_f32'
; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1 = call float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef)
; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef)
; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4 = call float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef)
; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8 = call float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef)
; SSE2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16 = call float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef)
; SSE2-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V32 = call float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef)
; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SSSE3-LABEL: 'reduce_f32'
; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1 = call float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef)
; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef)
; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4 = call float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef)
; SSSE3-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8 = call float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef)
; SSSE3-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16 = call float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef)
; SSSE3-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V32 = call float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef)
; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SSE41-LABEL: 'reduce_f32'
; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1 = call float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef)
; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef)
; SSE41-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4 = call float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef)
; SSE41-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8 = call float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef)
; SSE41-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16 = call float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef)
; SSE41-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V32 = call float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef)
; SSE41-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SSE42-LABEL: 'reduce_f32'
; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef)
; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef)
; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef)
; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef)
; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16 = call float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef)
; SSE42-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V32 = call float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef)
; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; AVX1-LABEL: 'reduce_f32'
; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef)
; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef)
; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef)
; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8 = call float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef)
; AVX1-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16 = call float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef)
; AVX1-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V32 = call float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef)
; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; AVX2-LABEL: 'reduce_f32'
; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef)
; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef)
; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef)
; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8 = call float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef)
; AVX2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16 = call float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef)
; AVX2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V32 = call float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef)
; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; AVX512-LABEL: 'reduce_f32'
; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef)
; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef)
; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef)
; AVX512-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8 = call float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef)
; AVX512-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V16 = call float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef)
; AVX512-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V32 = call float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef)
; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
;
  %V1 = call float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef)
  %V2 = call float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef)
  %V4 = call float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef)
  %V8 = call float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef)
  %V16 = call float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef)
  %V32 = call float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef)
  ret void
}

; Fast Reductions

; f64 reductions (1 to 16 elements), every call carries the 'fast' flag.
define void @reduce_f64_fast(double %arg) {
; SSE2-LABEL: 'reduce_f64_fast'
; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef)
; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef)
; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef)
; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8 = call fast double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef)
; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16 = call fast double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef)
; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SSSE3-LABEL: 'reduce_f64_fast'
; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef)
; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef)
; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef)
; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8 = call fast double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef)
; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16 = call fast double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef)
; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SSE41-LABEL: 'reduce_f64_fast'
; SSE41-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef)
; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef)
; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef)
; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8 = call fast double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef)
; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16 = call fast double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef)
; SSE41-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SSE42-LABEL: 'reduce_f64_fast'
; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef)
; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef)
; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call fast double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef)
; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = call fast double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef)
; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call fast double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef)
; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; AVX1-LABEL: 'reduce_f64_fast'
; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef)
; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef)
; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call fast double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef)
; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = call fast double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef)
; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call fast double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef)
; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; AVX2-LABEL: 'reduce_f64_fast'
; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef)
; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef)
; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call fast double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef)
; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call fast double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef)
; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call fast double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef)
; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; AVX512-LABEL: 'reduce_f64_fast'
; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef)
; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef)
; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call fast double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef)
; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call fast double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef)
; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16 = call fast double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef)
; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
;
  %V1 = call fast double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef)
  %V2 = call fast double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef)
  %V4 = call fast double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef)
  %V8 = call fast double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef)
  %V16 = call fast double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef)
  ret void
}

; f32 reductions (1 to 32 elements), every call carries the 'fast' flag.
define void @reduce_f32_fast(float %arg) {
; SSE2-LABEL: 'reduce_f32_fast'
; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef)
; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef)
; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef)
; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call fast float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef)
; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = call fast float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef)
; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V32 = call fast float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef)
; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SSSE3-LABEL: 'reduce_f32_fast'
; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef)
; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef)
; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef)
; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call fast float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef)
; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = call fast float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef)
; SSSE3-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V32 = call fast float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef)
; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SSE41-LABEL: 'reduce_f32_fast'
; SSE41-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef)
; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef)
; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef)
; SSE41-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call fast float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef)
; SSE41-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = call fast float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef)
; SSE41-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V32 = call fast float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef)
; SSE41-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SSE42-LABEL: 'reduce_f32_fast'
; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef)
; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef)
; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef)
; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = call fast float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef)
; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16 = call fast float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef)
; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call fast float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef)
; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; AVX1-LABEL: 'reduce_f32_fast'
; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef)
; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef)
; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call fast float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef)
; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call fast float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef)
; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call fast float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef)
; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call fast float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef)
; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; AVX2-LABEL: 'reduce_f32_fast'
; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef)
; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef)
; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call fast float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef)
; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call fast float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef)
; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16 = call fast float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef)
; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32 = call fast float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef)
; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; AVX512-LABEL: 'reduce_f32_fast'
; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef)
; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef)
; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call fast float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef)
; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call fast float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef)
; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call fast float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef)
; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32 = call fast float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef)
; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
;
  %V1 = call fast float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef)
  %V2 = call fast float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef)
  %V4 = call fast float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef)
  %V8 = call fast float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef)
  %V16 = call fast float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef)
  %V32 = call fast float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef)
  ret void
}

; Intrinsic declarations, one per element type and vector width used above.
declare double @llvm.vector.reduce.fadd.v1f64(double, <1 x double>)
declare double @llvm.vector.reduce.fadd.v2f64(double, <2 x double>)
declare double @llvm.vector.reduce.fadd.v4f64(double, <4 x double>)
declare double @llvm.vector.reduce.fadd.v8f64(double, <8 x double>)
declare double @llvm.vector.reduce.fadd.v16f64(double, <16 x double>)

declare float @llvm.vector.reduce.fadd.v1f32(float, <1 x float>)
declare float @llvm.vector.reduce.fadd.v2f32(float, <2 x float>)
declare float @llvm.vector.reduce.fadd.v4f32(float, <4 x float>)
declare float @llvm.vector.reduce.fadd.v8f32(float, <8 x float>)
declare float @llvm.vector.reduce.fadd.v16f32(float, <16 x float>)
declare float @llvm.vector.reduce.fadd.v32f32(float, <32 x float>)