xref: /llvm-project/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll (revision 07d284d4ebffd58d4b2934769b4e11fedd0b106e)
1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2; RUN: opt -mtriple=arm64-apple-macosx11.0.0 -passes=slp-vectorizer -S < %s | FileCheck %s
3
4; Test case reported on D134605, where vectorization caused a slowdown because the cost of the extractions was underestimated.
5
6; NOTE: cost of shuffle <4 x float>, <4 x float>, <2 x i32> <i32 2, i32 5> is 12!
7
; @zot is the scalar input handed to the SLP vectorizer by the RUN line.
; Four float values built in %bb (%val13, %val15, %val16, %val17) are forwarded
; through single-incoming phis in %bb18 and %bb25 and then used as multipliers
; in the %bb30 loop. %val16 and %val17 also have direct scalar uses in %bb18
; (%val23, %val24); the expected output below feeds those two uses with
; extractelement instructions taken from the vectorized value.
; %arg4 is not referenced anywhere in the body.
8define fastcc i64 @zot(float %arg, float %arg1, float %arg2, float %arg3, float %arg4, ptr %arg5, i1 %arg6, i1 %arg7, i1 %arg8) {
9; CHECK-LABEL: @zot(
10; CHECK-NEXT:  bb:
11; CHECK-NEXT:    [[VAL9:%.*]] = fmul fast float 0.000000e+00, [[ARG:%.*]]
12; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x float> <float 0.000000e+00, float poison, float poison, float poison>, float [[ARG]], i32 1
13; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float [[ARG3:%.*]], i32 2
14; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
15; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast <4 x float> <float 0.000000e+00, float 0.000000e+00, float 1.000000e+00, float 1.000000e+00>, [[TMP2]]
16; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> <float poison, float 0.000000e+00>, float [[ARG3]], i32 0
17; CHECK-NEXT:    [[TMP5:%.*]] = fadd fast <2 x float> [[TMP4]], <float 1.000000e+00, float 0.000000e+00>
18; CHECK-NEXT:    [[TMP6:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP2]], <2 x float> [[TMP5]], i64 0)
19; CHECK-NEXT:    [[TMP7:%.*]] = fadd fast <4 x float> [[TMP6]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
20; CHECK-NEXT:    br i1 [[ARG6:%.*]], label [[BB18:%.*]], label [[BB57:%.*]]
21; CHECK:       bb18:
22; CHECK-NEXT:    [[TMP8:%.*]] = phi <4 x float> [ [[TMP7]], [[BB:%.*]] ]
23; CHECK-NEXT:    [[VAL16:%.*]] = extractelement <4 x float> [[TMP7]], i32 2
24; CHECK-NEXT:    [[VAL23:%.*]] = fmul fast float [[VAL16]], 2.000000e+00
25; CHECK-NEXT:    [[VAL17:%.*]] = extractelement <4 x float> [[TMP7]], i32 3
26; CHECK-NEXT:    [[VAL24:%.*]] = fmul fast float [[VAL17]], 3.000000e+00
27; CHECK-NEXT:    br i1 [[ARG7:%.*]], label [[BB25:%.*]], label [[BB57]]
28; CHECK:       bb25:
29; CHECK-NEXT:    [[TMP11:%.*]] = phi <4 x float> [ [[TMP8]], [[BB18]] ]
30; CHECK-NEXT:    br label [[BB30:%.*]]
31; CHECK:       bb30:
32; CHECK-NEXT:    [[VAL31:%.*]] = phi float [ [[VAL55:%.*]], [[BB30]] ], [ 0.000000e+00, [[BB25]] ]
33; CHECK-NEXT:    [[VAL32:%.*]] = phi float [ [[VAL9]], [[BB30]] ], [ 0.000000e+00, [[BB25]] ]
34; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i8>, ptr [[ARG5:%.*]], align 1
35; CHECK-NEXT:    [[TMP13:%.*]] = uitofp <4 x i8> [[TMP12]] to <4 x float>
36; CHECK-NEXT:    [[TMP14:%.*]] = fsub fast <4 x float> [[TMP13]], [[TMP3]]
37; CHECK-NEXT:    [[TMP15:%.*]] = fmul fast <4 x float> [[TMP14]], [[TMP11]]
38; CHECK-NEXT:    [[VAL54:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP15]])
39; CHECK-NEXT:    [[VAL55]] = tail call fast float @llvm.minnum.f32(float [[VAL31]], float [[ARG1:%.*]])
40; CHECK-NEXT:    [[VAL56:%.*]] = tail call fast float @llvm.maxnum.f32(float [[ARG2:%.*]], float [[VAL54]])
41; CHECK-NEXT:    call void @ham(float [[VAL55]], float [[VAL56]])
42; CHECK-NEXT:    br i1 [[ARG8:%.*]], label [[BB30]], label [[BB57]]
43; CHECK:       bb57:
44; CHECK-NEXT:    ret i64 0
45;
46bb:
  ; Subtrahends later consumed by the fsub group in %bb30, one per lane.
47  %val = fmul fast float 0.000000e+00, 0.000000e+00
48  %val9 = fmul fast float 0.000000e+00, %arg
49  %val10 = fmul fast float %arg3, 1.000000e+00
50  %val11 = fmul fast float %arg3, 1.000000e+00
  ; Multipliers later consumed (via the phis in %bb18/%bb25) by the fmul group
  ; in %bb30: %val13, %val15, %val16, %val17.
51  %val12 = fadd fast float %arg3, 1.000000e+00
52  %val13 = fadd fast float %val12, 2.000000e+00
53  %val14 = fadd fast float 0.000000e+00, 0.000000e+00
54  %val15 = fadd fast float %val14, 1.000000e+00
55  %val16 = fadd fast float %arg3, 1.000000e+00
56  %val17 = fadd fast float %arg3, 1.000000e+00
57  br i1 %arg6, label %bb18, label %bb57
58
59bb18:                                             ; preds = %bb
  ; Single-incoming phis that only forward the four multipliers from %bb.
60  %val19 = phi float [ %val13, %bb ]
61  %val20 = phi float [ %val15, %bb ]
62  %val21 = phi float [ %val16, %bb ]
63  %val22 = phi float [ %val17, %bb ]
  ; Direct scalar uses of %val16 and %val17 (not of the phis above). The
  ; results %val23 and %val24 have no further uses in this function.
64  %val23 = fmul fast float %val16, 2.000000e+00
65  %val24 = fmul fast float %val17, 3.000000e+00
66  br i1 %arg7, label %bb25, label %bb57
67
68bb25:                                             ; preds = %bb18
  ; Second layer of single-incoming phis forwarding the values from %bb18.
69  %val26 = phi float [ %val19, %bb18 ]
70  %val27 = phi float [ %val20, %bb18 ]
71  %val28 = phi float [ %val21, %bb18 ]
72  %val29 = phi float [ %val22, %bb18 ]
73  br label %bb30
74
75bb30:                                             ; preds = %bb30, %bb25
  ; Self-looping block. %val31 carries the previous iteration's minnum result;
  ; %val32 has no uses in this function.
76  %val31 = phi float [ %val55, %bb30 ], [ 0.000000e+00, %bb25 ]
77  %val32 = phi float [ %val9, %bb30 ], [ 0.000000e+00, %bb25 ]
  ; Load the four consecutive bytes at %arg5 + 0..3 and convert each to float.
78  %val33 = load i8, ptr %arg5, align 1
79  %val34 = uitofp i8 %val33 to float
80  %val35 = getelementptr inbounds i8, ptr %arg5, i64 1
81  %val36 = load i8, ptr %val35, align 1
82  %val37 = uitofp i8 %val36 to float
83  %val38 = getelementptr inbounds i8, ptr %arg5, i64 2
84  %val39 = load i8, ptr %val38, align 1
85  %val40 = uitofp i8 %val39 to float
86  %val41 = getelementptr inbounds i8, ptr %arg5, i64 3
87  %val42 = load i8, ptr %val41, align 1
88  %val43 = uitofp i8 %val42 to float
  ; Per-lane subtraction of the values computed in %bb.
89  %val44 = fsub fast float %val34, %val
90  %val45 = fsub fast float %val37, %val9
91  %val46 = fsub fast float %val40, %val10
92  %val47 = fsub fast float %val43, %val11
  ; Multiply each lane by its forwarded multiplier and sum the four products
  ; into %val54 through a chain of fast fadds. The expected output replaces
  ; this chain with one @llvm.vector.reduce.fadd.v4f32 call.
93  %val48 = fmul fast float %val44, %val26
94  %val49 = fmul fast float %val45, %val27
95  %val50 = fadd fast float %val49, %val48
96  %val51 = fmul fast float %val46, %val28
97  %val52 = fadd fast float %val50, %val51
98  %val53 = fmul fast float %val47, %val29
99  %val54 = fadd fast float %val52, %val53
  ; Scalar tail: these calls stay scalar in the expected output.
100  %val55 = tail call fast float @llvm.minnum.f32(float %val31, float %arg1)
101  %val56 = tail call fast float @llvm.maxnum.f32(float %arg2, float %val54)
102  call void @ham(float %val55, float %val56)
103  br i1 %arg8, label %bb30, label %bb57
104
105bb57:                                             ; preds = %bb30, %bb18, %bb
  ; Common exit; the function always returns 0.
106  ret i64 0
107}
108
; Declarations for the callees used in the %bb30 loop of @zot: the two
; floating-point min/max intrinsics and @ham, which has no definition in this file.
109declare float @llvm.maxnum.f32(float, float)
110declare float @llvm.minnum.f32(float, float)
111declare void @ham(float, float)
112