; Loop-vectorizer test for AArch64 SVE covering loops that contain calls, both
; with and without a "vector-function-abi-variant" mapping on the call site.
; The same expectations are verified twice: once without and once with
; -force-target-instruction-cost=1. In each run stderr (the missed-optimisation
; remarks requested via -pass-remarks-missed) is redirected to %t and verified
; by a separate FileCheck invocation.
; RUN: opt -S -passes=loop-vectorize,instcombine -force-vector-interleave=1 -mattr=+sve -mtriple aarch64-unknown-linux-gnu \
; RUN: -prefer-predicate-over-epilogue=scalar-epilogue -pass-remarks-missed=loop-vectorize < %s 2>%t | FileCheck %s
; RUN: cat %t | FileCheck %s --check-prefix=CHECK-REMARKS
; RUN: opt -S -passes=loop-vectorize,instcombine -force-vector-interleave=1 -force-target-instruction-cost=1 -mattr=+sve -mtriple aarch64-unknown-linux-gnu \
; RUN: -prefer-predicate-over-epilogue=scalar-epilogue -pass-remarks-missed=loop-vectorize < %s 2>%t | FileCheck %s
; RUN: cat %t | FileCheck %s --check-prefix=CHECK-REMARKS

; Loads a double from %b, passes it to @foo (attribute group #0), adds 1.0 and
; stores the result to %a. The expectations below require a scalable vector
; load in vector.body whose result feeds a call to @foo_vec.
define void @vec_load(i64 %N, ptr nocapture %a, ptr nocapture readonly %b) {
; CHECK-LABEL: @vec_load
; CHECK: vector.body:
; CHECK: %[[LOAD:.*]] = load <vscale x 2 x double>, ptr
; CHECK: call <vscale x 2 x double> @foo_vec(<vscale x 2 x double> %[[LOAD]])
entry:
  ; Guard: the loop is skipped entirely unless %N > 0.
  %cmp7 = icmp sgt i64 %N, 0
  br i1 %cmp7, label %for.body, label %for.end

for.body:                                         ; preds = %entry, %for.body
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %arrayidx = getelementptr inbounds double, ptr %b, i64 %iv
  %0 = load double, ptr %arrayidx, align 8
  %1 = call double @foo(double %0) #0
  %add = fadd double %1, 1.000000e+00
  %arrayidx2 = getelementptr inbounds double, ptr %a, i64 %iv
  store double %add, ptr %arrayidx2, align 8
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %N
  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !1

for.end:                                          ; preds = %for.body, %entry
  ret void
}

; Calls @foo (attribute group #0) on the loop-invariant constant 10.0,
; subtracts 1.0 and stores the result to %a. The expectations require a call
; to @foo_vec on a splat of that constant in vector.body.
define void @vec_scalar(i64 %N, ptr nocapture %a) {
; CHECK-LABEL: @vec_scalar
; CHECK: vector.body:
; CHECK: call <vscale x 2 x double> @foo_vec(<vscale x 2 x double> splat (double 1.000000e+01))
entry:
  ; Guard: the loop is skipped entirely unless %N > 0.
  %cmp7 = icmp sgt i64 %N, 0
  br i1 %cmp7, label %for.body, label %for.end

for.body:                                         ; preds = %entry, %for.body
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %0 = call double @foo(double 10.0) #0
  %sub = fsub double %0, 1.000000e+00
  %arrayidx = getelementptr inbounds double, ptr %a, i64 %iv
  store double %sub, ptr %arrayidx, align 8
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %N
  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !1

for.end:                                          ; preds = %for.body, %entry
  ret void
}

; Loads a pointer from %b, passes it to @bar (attribute group #1) and stores
; the i64 result to %a. The expectations match the scalar pointer load and the
; scalar call to @bar inside a block named for.body, i.e. the call is expected
; to remain in scalar form.
; NOTE(review): %b carries `readnone` yet the loop loads through it; confirm
; this attribute is intentional.
define void @vec_ptr(i64 %N, ptr noalias %a, ptr readnone %b) {
; CHECK-LABEL: @vec_ptr
; CHECK: for.body:
; CHECK: %[[LOAD:.*]] = load ptr, ptr
; CHECK: call i64 @bar(ptr %[[LOAD]])
entry:
  ; %N only guards entry to the loop; the trip count itself is the constant
  ; 1024 used by the exit compare below.
  %cmp7 = icmp sgt i64 %N, 0
  br i1 %cmp7, label %for.body, label %for.end

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %gep = getelementptr ptr, ptr %b, i64 %iv
  %load = load ptr, ptr %gep
  %call = call i64 @bar(ptr %load) #1
  %arrayidx = getelementptr inbounds i64, ptr %a, i64 %iv
  store i64 %call, ptr %arrayidx
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, 1024
  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !1

for.end:
  ret void
}

; Updates %a in place: loads a double, applies llvm.sin.f64 (attribute group
; #2), adds 1.0 and stores back to the same address. The expectations require
; a scalable vector load feeding a `fast` call to @sin_vec_nxv2f64.
; NOTE(review): %a carries `readonly` yet the loop stores through it; confirm
; this attribute is intentional.
define void @vec_intrinsic(i64 %N, ptr nocapture readonly %a) {
; CHECK-LABEL: @vec_intrinsic
; CHECK: vector.body:
; CHECK: %[[LOAD:.*]] = load <vscale x 2 x double>, ptr
; CHECK: call fast <vscale x 2 x double> @sin_vec_nxv2f64(<vscale x 2 x double> %[[LOAD]])
entry:
  ; Guard: the loop is skipped entirely unless %N > 0.
  %cmp7 = icmp sgt i64 %N, 0
  br i1 %cmp7, label %for.body, label %for.end

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %arrayidx = getelementptr inbounds double, ptr %a, i64 %iv
  %0 = load double, ptr %arrayidx, align 8
  %1 = call fast double @llvm.sin.f64(double %0) #2
  %add = fadd fast double %1, 1.000000e+00
  ; The store reuses %arrayidx, so each element is overwritten in place.
  store double %add, ptr %arrayidx, align 8
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, %N
  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !1

for.end:
  ret void
}

; Expected remarks for @vec_sin_no_mapping. The t.c:3:10, t.c:3:20 and t.c:3:30
; positions are the debug locations !11, !12 and !13 attached to the load, the
; llvm.sin call and the store in the loop body.
; CHECK-REMARKS: UserVF ignored because of invalid costs.
; CHECK-REMARKS-NEXT: t.c:3:10: Recipe with invalid costs prevented vectorization at VF=(vscale x 1): load
; CHECK-REMARKS-NEXT: t.c:3:20: Recipe with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2): call to llvm.sin
; CHECK-REMARKS-NEXT: t.c:3:30: Recipe with invalid costs prevented vectorization at VF=(vscale x 1): store
; The llvm.sin.f32 call here has no attribute group, so no vector function
; mapping is attached. The expectations require a fixed-width
; @llvm.sin.v2f32 call and forbid any scalable vector type after it.
define void @vec_sin_no_mapping(ptr noalias nocapture %dst, ptr noalias nocapture readonly %src, i64 %n) {
; CHECK: @vec_sin_no_mapping
; CHECK: call fast <2 x float> @llvm.sin.v2f32
; CHECK-NOT: <vscale x
entry:
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  %i.07 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds float, ptr %src, i64 %i.07
  %0 = load float, ptr %arrayidx, align 4, !dbg !11
  %1 = tail call fast float @llvm.sin.f32(float %0), !dbg !12
  %arrayidx1 = getelementptr inbounds float, ptr %dst, i64 %i.07
  store float %1, ptr %arrayidx1, align 4, !dbg !13
  %inc = add nuw nsw i64 %i.07, 1
  %exitcond.not = icmp eq i64 %inc, %n
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !1

for.cond.cleanup:                                 ; preds = %for.body
  ret void
}

; Expected remarks for @vec_sin_no_mapping_ite (the list continues below).
; CHECK-REMARKS: UserVF ignored because of invalid costs.
; Remaining expected remarks for @vec_sin_no_mapping_ite. The positions map to
; the debug locations used in its loop body: 3:10 (!11) on the load, 3:30 (!13)
; on the fadd and on the llvm.sin call in if.else, 3:20 (!12) on the llvm.sin
; call in if.then, and 3:40 (!14) on the store.
; CHECK-REMARKS-NEXT: t.c:3:10: Recipe with invalid costs prevented vectorization at VF=(vscale x 1): load
; CHECK-REMARKS-NEXT: t.c:3:30: Recipe with invalid costs prevented vectorization at VF=(vscale x 1): fadd
; CHECK-REMARKS-NEXT: t.c:3:30: Recipe with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2): call to llvm.sin
; CHECK-REMARKS-NEXT: t.c:3:20: Recipe with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2): call to llvm.sin
; CHECK-REMARKS-NEXT: t.c:3:40: Recipe with invalid costs prevented vectorization at VF=(vscale x 1): store
; If/then/else variant: both arms call llvm.sin.f32 without any attribute
; group (the else arm first adds 12.0 to the loaded value), and the merged
; result is stored to %dst. The expectations forbid any scalable vector type
; between the function name and the next `ret`.
define void @vec_sin_no_mapping_ite(ptr noalias nocapture %dst, ptr noalias nocapture readonly %src, i64 %n) {
; CHECK: @vec_sin_no_mapping_ite
; CHECK-NOT: <vscale x
; CHECK: ret
entry:
  br label %for.body

for.body:                                         ; preds = %entry, %if.end
  %i.07 = phi i64 [ %inc, %if.end ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds float, ptr %src, i64 %i.07
  %0 = load float, ptr %arrayidx, align 4, !dbg !11
  ; `ugt` is an unordered compare, so it is also true when %0 is NaN.
  %cmp = fcmp ugt float %0, 0.0000
  br i1 %cmp, label %if.then, label %if.else
if.then:
  %1 = tail call fast float @llvm.sin.f32(float %0), !dbg !12
  br label %if.end
if.else:
  ; The fadd and the call below share debug location !13.
  %add = fadd float %0, 12.0, !dbg !13
  %2 = tail call fast float @llvm.sin.f32(float %add), !dbg !13
  br label %if.end
if.end:
  %3 = phi float [%1, %if.then], [%2, %if.else]
  %arrayidx1 = getelementptr inbounds float, ptr %dst, i64 %i.07
  store float %3, ptr %arrayidx1, align 4, !dbg !14
  %inc = add nuw nsw i64 %i.07, 1
  %exitcond.not = icmp eq i64 %inc, %n
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !1

for.cond.cleanup:                                 ; preds = %if.end
  ret void
}

; Expected remarks for @vec_sin_fixed_mapping; positions 3:10, 3:20 and 3:30
; are the debug locations !11, !12 and !13 on its load, call and store.
; CHECK-REMARKS: UserVF ignored because of invalid costs.
; CHECK-REMARKS-NEXT: t.c:3:10: Recipe with invalid costs prevented vectorization at VF=(vscale x 1): load
; CHECK-REMARKS-NEXT: t.c:3:20: Recipe with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2): call to llvm.sin
; CHECK-REMARKS-NEXT: t.c:3:30: Recipe with invalid costs prevented vectorization at VF=(vscale x 1): store
; Same loop as @vec_sin_no_mapping except that the llvm.sin.f32 call carries
; attribute group #3. The expectations still require the fixed-width
; @llvm.sin.v2f32 intrinsic and forbid any scalable vector type after it.
; NOTE(review): attribute group #3 names llvm.sin.f64 / @sin_vec_v2f64
; (<2 x double>) while this call operates on float; confirm the mismatch is
; intentional.
define void @vec_sin_fixed_mapping(ptr noalias nocapture %dst, ptr noalias nocapture readonly %src, i64 %n) {
; CHECK: @vec_sin_fixed_mapping
; CHECK: call fast <2 x float> @llvm.sin.v2f32
; CHECK-NOT: <vscale x
entry:
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  %i.07 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds float, ptr %src, i64 %i.07
  %0 = load float, ptr %arrayidx, align 4, !dbg !11
  %1 = tail call fast float @llvm.sin.f32(float %0) #3, !dbg !12
  %arrayidx1 = getelementptr inbounds float, ptr %dst, i64 %i.07
  store float %1, ptr %arrayidx1, align 4, !dbg !13
  %inc = add nuw nsw i64 %i.07, 1
  %exitcond.not = icmp eq i64 %inc, %n
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !1

for.cond.cleanup:                                 ; preds = %for.body
  ret void
}

; Even though there are no function mappings attached to the call
; in the loop below, we can still vectorize the loop because SVE has
; hardware support in the form of the 'fsqrt' instruction.
; Loads a float from %src, applies llvm.sqrt.f32 (no attribute group on the
; call) and stores the result to %dst. The expectations require a scalable
; @llvm.sqrt.nxv2f32 intrinsic call in the output.
define void @vec_sqrt_no_mapping(ptr noalias nocapture %dst, ptr noalias nocapture readonly %src, i64 %n) {
; CHECK: @vec_sqrt_no_mapping
; CHECK: call fast <vscale x 2 x float> @llvm.sqrt.nxv2f32
entry:
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  %i.07 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds float, ptr %src, i64 %i.07
  %0 = load float, ptr %arrayidx, align 4
  %1 = tail call fast float @llvm.sqrt.f32(float %0)
  %arrayidx1 = getelementptr inbounds float, ptr %dst, i64 %i.07
  store float %1, ptr %arrayidx1, align 4
  %inc = add nuw nsw i64 %i.07, 1
  %exitcond.not = icmp eq i64 %inc, %n
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !1

for.cond.cleanup:                                 ; preds = %for.body
  ret void
}


; Scalar callees used by the loops above.
declare double @foo(double)
declare i64 @bar(ptr)
declare double @llvm.sin.f64(double)
declare float @llvm.sin.f32(float)
declare float @llvm.sqrt.f32(float)

; Vector functions named by the attribute groups below.
declare <vscale x 2 x double> @foo_vec(<vscale x 2 x double>)
declare <vscale x 2 x i64> @bar_vec(<vscale x 2 x ptr>)
declare <vscale x 2 x double> @sin_vec_nxv2f64(<vscale x 2 x double>)
declare <2 x double> @sin_vec_v2f64(<2 x double>)

; Call-site attribute groups associating a scalar callee with a vector
; function: #0 foo -> foo_vec, #1 bar -> bar_vec, #2 llvm.sin.f64 ->
; sin_vec_nxv2f64, #3 llvm.sin.f64 -> sin_vec_v2f64.
attributes #0 = { "vector-function-abi-variant"="_ZGVsNxv_foo(foo_vec)" }
attributes #1 = { "vector-function-abi-variant"="_ZGVsNxv_bar(bar_vec)" }
attributes #2 = { "vector-function-abi-variant"="_ZGVsNxv_llvm.sin.f64(sin_vec_nxv2f64)" }
attributes #3 = { "vector-function-abi-variant"="_ZGV_LLVM_N2v_llvm.sin.f64(sin_vec_v2f64)" }

; Loop metadata shared by every loop in this file: a vectorize width of 2 with
; scalable vectorization enabled.
!1 = distinct !{!1, !2, !3}
!2 = !{!"llvm.loop.vectorize.width", i32 2}
!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}

!llvm.dbg.cu = !{!4}
!llvm.module.flags = !{!7}
!llvm.ident = !{!8}

; Debug info supplying the t.c line/column positions that appear in the
; expected remarks; !11-!14 are all on line 3, at columns 10, 20, 30 and 40.
!4 = distinct !DICompileUnit(language: DW_LANG_C99, file: !5, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug, enums: !6, splitDebugInlining: false, nameTableKind: None)
!5 = !DIFile(filename: "t.c", directory: "somedir")
!6 = !{}
!7 = !{i32 2, !"Debug Info Version", i32 3}
!8 = !{!"clang"}
!9 = distinct !DISubprogram(name: "foo", scope: !5, file: !5, line: 2, type: !10, scopeLine: 2, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !4, retainedNodes: !6)
!10 = !DISubroutineType(types: !6)
!11 = !DILocation(line: 3, column: 10, scope: !9)
!12 = !DILocation(line: 3, column: 20, scope: !9)
!13 = !DILocation(line: 3, column: 30, scope: !9)
!14 = !DILocation(line: 3, column: 40, scope: !9)