; RUN: opt < %s -S -mtriple=aarch64-none-linux-gnu -mattr=+neon -passes=early-cse -earlycse-debug-hash | FileCheck %s
; RUN: opt < %s -S -mtriple=aarch64-none-linux-gnu -mattr=+neon -aa-pipeline=basic-aa -passes='early-cse<memssa>' | FileCheck %s

define <4 x i32> @test_cse(ptr %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that @llvm.aarch64.neon.ld2 is optimized away by Early CSE.
; CHECK-LABEL: @test_cse
; CHECK-NOT: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
  %1 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
  %2 = bitcast <16 x i8> %0 to <4 x i32>
  %3 = bitcast <16 x i8> %1 to <4 x i32>
  call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> %2, <4 x i32> %3, ptr %a)
  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr %a)
  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld2.fca.0.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}

define <4 x i32> @test_cse2(ptr %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that the first @llvm.aarch64.neon.st2 is optimized away by Early CSE.
; CHECK-LABEL: @test_cse2
; CHECK-NOT: call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> %3, <4 x i32> %3, ptr %0)
; CHECK: call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> %s.coerce.fca.0.extract, <4 x i32> %s.coerce.fca.1.extract, ptr %a)
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
  %1 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
  %2 = bitcast <16 x i8> %0 to <4 x i32>
  %3 = bitcast <16 x i8> %1 to <4 x i32>
  call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> %2, <4 x i32> %2, ptr %a)
  call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> %2, <4 x i32> %3, ptr %a)
  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr %a)
  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld2.fca.0.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}

define <4 x i32> @test_cse3(ptr %a, [2 x <4 x i32>] %s.coerce, i32 %n) #0 {
entry:
; Check that the second (redundant) @llvm.aarch64.neon.ld2 is optimized away
; by Early CSE, leaving a single ld2.
; CHECK-LABEL: @test_cse3
; CHECK: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0
; CHECK-NOT: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr %a)
  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
  %vld22 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr %a)
  %vld22.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld22, 0
  %vld22.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld22, 1
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld22.fca.0.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}


define <4 x i32> @test_nocse(ptr %a, ptr %b, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that the store prevents @llvm.aarch64.neon.ld2 from being optimized
; away by Early CSE.
; CHECK-LABEL: @test_nocse
; CHECK: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
  %1 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
  %2 = bitcast <16 x i8> %0 to <4 x i32>
  %3 = bitcast <16 x i8> %1 to <4 x i32>
  call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> %2, <4 x i32> %3, ptr %a)
  store i32 0, ptr %b, align 4
  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr %a)
  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld2.fca.0.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}

define <4 x i32> @test_nocse2(ptr %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that @llvm.aarch64.neon.ld3 is not optimized away by Early CSE due
; to mismatch between st2 and ld3.
; CHECK-LABEL: @test_nocse2
; CHECK: call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
  %1 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
  %2 = bitcast <16 x i8> %0 to <4 x i32>
  %3 = bitcast <16 x i8> %1 to <4 x i32>
  call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> %2, <4 x i32> %3, ptr %a)
  %vld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0(ptr %a)
  %vld3.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 0
  %vld3.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 2
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld3.fca.0.extract, <4 x i32> %vld3.fca.2.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}

define <4 x i32> @test_nocse3(ptr %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that @llvm.aarch64.neon.st3 is not optimized away by Early CSE due to
; mismatch between st2 and st3.
; CHECK-LABEL: @test_nocse3
; CHECK: call void @llvm.aarch64.neon.st3.v4i32.p0
; CHECK: call void @llvm.aarch64.neon.st2.v4i32.p0
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
  %1 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
  %2 = bitcast <16 x i8> %0 to <4 x i32>
  %3 = bitcast <16 x i8> %1 to <4 x i32>
  call void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32> %3, <4 x i32> %2, <4 x i32> %2, ptr %a)
  call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> %2, <4 x i32> %2, ptr %a)
  %vld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0(ptr %a)
  %vld3.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 0
  %vld3.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 1
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld3.fca.0.extract, <4 x i32> %vld3.fca.0.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}

; Function Attrs: nounwind
declare void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32>, <4 x i32>, ptr nocapture)

; Function Attrs: nounwind
declare void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32>, <4 x i32>, <4 x i32>, ptr nocapture)

; Function Attrs: nounwind readonly
declare { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr)

; Function Attrs: nounwind readonly
declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0(ptr)

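; @vaddq_s32 below is a plain lane-wise <4 x i32> add (named after the ACLE
; vaddq_s32 intrinsic); it is called from each loop body above so the values
; produced by the ld2/ld3 intrinsics have a use.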
define internal fastcc <4 x i32> @vaddq_s32(<4 x i32> %__p0, <4 x i32> %__p1) {
entry:
  %add = add <4 x i32> %__p0, %__p1
  ret <4 x i32> %add
}
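; @test_cse3 references attribute group #0, which is not defined elsewhere in
; this file as given; nounwind is assumed here, matching the "Function Attrs"
; comments on the intrinsic declarations.
attributes #0 = { nounwind }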