; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE
; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE
; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE
; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512
; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512
; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512
;
; Cost-model expectations for the llvm.vector.reduce.fmax intrinsic on the
; x86_64-apple-darwin triple. Every invocation above runs the cost-model
; printer with a single -mattr feature set; the four sse* runs share one check
; prefix, the three avx512* runs share another, and the avx / avx2 runs have
; both a common prefix and an individual one.
;
; In every function below the %arg parameter is unused, all reduction operands
; are undef, and the reduction results are never consumed, so the only thing
; under test is the cost printed for each call.

; Reductions of <1 x double> up to <16 x double> with no fast-math flags.
; The avx and avx2 runs are checked under their individual prefixes here
; because their expected costs differ from the <2 x double> case onwards.
define i32 @reduce_f64(i32 %arg) {
; SSE-LABEL: 'reduce_f64'
; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef)
; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
; SSE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
; SSE-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8 = call double @llvm.vector.reduce.fmax.v8f64(<8 x double> undef)
; SSE-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16 = call double @llvm.vector.reduce.fmax.v16f64(<16 x double> undef)
; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX1-LABEL: 'reduce_f64'
; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef)
; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8 = call double @llvm.vector.reduce.fmax.v8f64(<8 x double> undef)
; AVX1-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V16 = call double @llvm.vector.reduce.fmax.v16f64(<16 x double> undef)
; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX2-LABEL: 'reduce_f64'
; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef)
; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call double @llvm.vector.reduce.fmax.v8f64(<8 x double> undef)
; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V16 = call double @llvm.vector.reduce.fmax.v16f64(<16 x double> undef)
; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512-LABEL: 'reduce_f64'
; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef)
; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call double @llvm.vector.reduce.fmax.v8f64(<8 x double> undef)
; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = call double @llvm.vector.reduce.fmax.v16f64(<16 x double> undef)
; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
  %V1 = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef)
  %V2 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
  %V4 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
  %V8 = call double @llvm.vector.reduce.fmax.v8f64(<8 x double> undef)
  %V16 = call double @llvm.vector.reduce.fmax.v16f64(<16 x double> undef)
  ret i32 undef
}

; Reductions of <1 x float> up to <32 x float> with no fast-math flags.
; As in the double case, the avx and avx2 runs are checked under their
; individual prefixes because their expected costs differ.
define i32 @reduce_f32(i32 %arg) {
; SSE-LABEL: 'reduce_f32'
; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef)
; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef)
; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
; SSE-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V16 = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> undef)
; SSE-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V32 = call float @llvm.vector.reduce.fmax.v32f32(<32 x float> undef)
; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX1-LABEL: 'reduce_f32'
; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef)
; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef)
; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> undef)
; AVX1-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call float @llvm.vector.reduce.fmax.v32f32(<32 x float> undef)
; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX2-LABEL: 'reduce_f32'
; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef)
; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef)
; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16 = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> undef)
; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V32 = call float @llvm.vector.reduce.fmax.v32f32(<32 x float> undef)
; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512-LABEL: 'reduce_f32'
; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef)
; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef)
; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> undef)
; AVX512-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V32 = call float @llvm.vector.reduce.fmax.v32f32(<32 x float> undef)
; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
  %V1 = call float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef)
  %V2 = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef)
  %V4 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
  %V8 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
  %V16 = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> undef)
  %V32 = call float @llvm.vector.reduce.fmax.v32f32(<32 x float> undef)
  ret i32 undef
}

; Fast Reductions
;
; The two functions below repeat the reductions above with the `fast`
; fast-math flag on every call. Here the avx and avx2 runs expect identical
; costs, so they are checked once under their common prefix.

; Fast-flagged reductions of <1 x double> up to <16 x double>.
define i32 @reduce_f64_fast(i32 %arg) {
; SSE-LABEL: 'reduce_f64_fast'
; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef)
; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call fast double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = call fast double @llvm.vector.reduce.fmax.v8f64(<8 x double> undef)
; SSE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call fast double @llvm.vector.reduce.fmax.v16f64(<16 x double> undef)
; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX-LABEL: 'reduce_f64_fast'
; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef)
; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = call fast double @llvm.vector.reduce.fmax.v8f64(<8 x double> undef)
; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16 = call fast double @llvm.vector.reduce.fmax.v16f64(<16 x double> undef)
; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512-LABEL: 'reduce_f64_fast'
; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef)
; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call fast double @llvm.vector.reduce.fmax.v8f64(<8 x double> undef)
; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16 = call fast double @llvm.vector.reduce.fmax.v16f64(<16 x double> undef)
; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
  %V1 = call fast double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef)
  %V2 = call fast double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
  %V4 = call fast double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
  %V8 = call fast double @llvm.vector.reduce.fmax.v8f64(<8 x double> undef)
  %V16 = call fast double @llvm.vector.reduce.fmax.v16f64(<16 x double> undef)
  ret i32 undef
}

; Fast-flagged reductions of <1 x float> up to <32 x float>.
define i32 @reduce_f32_fast(i32 %arg) {
; SSE-LABEL: 'reduce_f32_fast'
; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef)
; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef)
; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = call fast float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16 = call fast float @llvm.vector.reduce.fmax.v16f32(<16 x float> undef)
; SSE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call fast float @llvm.vector.reduce.fmax.v32f32(<32 x float> undef)
; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX-LABEL: 'reduce_f32_fast'
; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef)
; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef)
; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call fast float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16 = call fast float @llvm.vector.reduce.fmax.v16f32(<16 x float> undef)
; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32 = call fast float @llvm.vector.reduce.fmax.v32f32(<32 x float> undef)
; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512-LABEL: 'reduce_f32_fast'
; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef)
; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef)
; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call fast float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call fast float @llvm.vector.reduce.fmax.v16f32(<16 x float> undef)
; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32 = call fast float @llvm.vector.reduce.fmax.v32f32(<32 x float> undef)
; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
  %V1 = call fast float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef)
  %V2 = call fast float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef)
  %V4 = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
  %V8 = call fast float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
  %V16 = call fast float @llvm.vector.reduce.fmax.v16f32(<16 x float> undef)
  %V32 = call fast float @llvm.vector.reduce.fmax.v32f32(<32 x float> undef)
  ret i32 undef
}

; Declarations of the fmax reduction intrinsics called above: one overload per
; vector type, double element types first, then float.
declare double @llvm.vector.reduce.fmax.v1f64(<1 x double>)
declare double @llvm.vector.reduce.fmax.v2f64(<2 x double>)
declare double @llvm.vector.reduce.fmax.v4f64(<4 x double>)
declare double @llvm.vector.reduce.fmax.v8f64(<8 x double>)
declare double @llvm.vector.reduce.fmax.v16f64(<16 x double>)

declare float @llvm.vector.reduce.fmax.v1f32(<1 x float>)
declare float @llvm.vector.reduce.fmax.v2f32(<2 x float>)
declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>)
declare float @llvm.vector.reduce.fmax.v8f32(<8 x float>)
declare float @llvm.vector.reduce.fmax.v16f32(<16 x float>)
declare float @llvm.vector.reduce.fmax.v32f32(<32 x float>)