; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake-avx512 | FileCheck %s

%struct.S = type { [8 x i32], [8 x i32], [16 x i32] }

; Eight scalar adds whose results are stored to consecutive elements 0..7 of
; field 0 of %struct.S. One operand of each add is loaded from consecutive
; elements 0..7 of field 1; the other is loaded from field 2 at element
; indices 15, 7, 6, 4, 12, 13, 14, 5 (in store order).
; The expected output is a single <8 x i32> add and store, where the field 2
; operand is assembled from one <4 x i32> load (elements 4..7) and two
; <2 x i32> loads (elements 12..13 and 14..15) by shuffles and a subvector
; insert.
define dso_local void @_Z4testP1S(ptr %p) local_unnamed_addr {
; CHECK-LABEL: @_Z4testP1S(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[P:%.*]], i64 0, i32 1, i64 0
; CHECK-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 4
; CHECK-NEXT:    [[ARRAYIDX27:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 12
; CHECK-NEXT:    [[ARRAYIDX41:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 14
; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[ARRAYIDX27]], align 4
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAYIDX41]], align 4
; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX20]], align 4
; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP3]], <8 x i32> <i32 1, i32 7, i32 6, i32 4, i32 poison, i32 poison, i32 0, i32 5>
; CHECK-NEXT:    [[TMP6:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP5]], <2 x i32> [[TMP0]], i64 4)
; CHECK-NEXT:    [[TMP7:%.*]] = add nsw <8 x i32> [[TMP6]], [[TMP2]]
; CHECK-NEXT:    store <8 x i32> [[TMP7]], ptr [[P]], align 4
; CHECK-NEXT:    ret void
;
entry:
  %arrayidx = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 1, i64 0
  %i = load i32, ptr %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 2, i64 15
  %i1 = load i32, ptr %arrayidx1, align 4
  %add = add nsw i32 %i1, %i
  store i32 %add, ptr %p, align 4
  %arrayidx4 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 1, i64 1
  %i2 = load i32, ptr %arrayidx4, align 4
  %arrayidx6 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 2, i64 7
  %i3 = load i32, ptr %arrayidx6, align 4
  %add7 = add nsw i32 %i3, %i2
  %arrayidx9 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 0, i64 1
  store i32 %add7, ptr %arrayidx9, align 4
  %arrayidx11 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 1, i64 2
  %i4 = load i32, ptr %arrayidx11, align 4
  %arrayidx13 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 2, i64 6
  %i5 = load i32, ptr %arrayidx13, align 4
  %add14 = add nsw i32 %i5, %i4
  %arrayidx16 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 0, i64 2
  store i32 %add14, ptr %arrayidx16, align 4
  %arrayidx18 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 1, i64 3
  %i6 = load i32, ptr %arrayidx18, align 4
  %arrayidx20 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 2, i64 4
  %i7 = load i32, ptr %arrayidx20, align 4
  %add21 = add nsw i32 %i7, %i6
  %arrayidx23 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 0, i64 3
  store i32 %add21, ptr %arrayidx23, align 4
  %arrayidx25 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 1, i64 4
  %i8 = load i32, ptr %arrayidx25, align 4
  %arrayidx27 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 2, i64 12
  %i9 = load i32, ptr %arrayidx27, align 4
  %add28 = add nsw i32 %i9, %i8
  %arrayidx30 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 0, i64 4
  store i32 %add28, ptr %arrayidx30, align 4
  %arrayidx32 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 1, i64 5
  %i10 = load i32, ptr %arrayidx32, align 4
  %arrayidx34 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 2, i64 13
  %i11 = load i32, ptr %arrayidx34, align 4
  %add35 = add nsw i32 %i11, %i10
  %arrayidx37 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 0, i64 5
  store i32 %add35, ptr %arrayidx37, align 4
  %arrayidx39 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 1, i64 6
  %i12 = load i32, ptr %arrayidx39, align 4
  %arrayidx41 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 2, i64 14
  %i13 = load i32, ptr %arrayidx41, align 4
  %add42 = add nsw i32 %i13, %i12
  %arrayidx44 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 0, i64 6
  store i32 %add42, ptr %arrayidx44, align 4
  %arrayidx46 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 1, i64 7
  %i14 = load i32, ptr %arrayidx46, align 4
  %arrayidx48 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 2, i64 5
  %i15 = load i32, ptr %arrayidx48, align 4
  %add49 = add nsw i32 %i15, %i14
  %arrayidx51 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 0, i64 7
  store i32 %add49, ptr %arrayidx51, align 4
  ret void
}

; Test for 2 load groups of 4 elements each, against different base pointers.
; Neither group is loaded in order, so there are a few specific points to
; verify: (1) these groups are detected, (2) reordering shuffles are generated,
; and (3) these loads are vectorized as part of a tree that is seeded by
; stores, with VF=8.
define dso_local void @test_unordered_splits(ptr nocapture %p) local_unnamed_addr {
; CHECK-LABEL: @test_unordered_splits(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[P1:%.*]] = alloca [16 x i32], align 16
; CHECK-NEXT:    [[P2:%.*]] = alloca [16 x i32], align 16
; CHECK-NEXT:    [[G10:%.*]] = getelementptr inbounds [16 x i32], ptr [[P1]], i32 0, i64 4
; CHECK-NEXT:    [[G20:%.*]] = getelementptr inbounds [16 x i32], ptr [[P2]], i32 0, i64 12
; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[G10]], align 4
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[G20]], align 4
; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> poison, <4 x i32> [[TMP0]], i64 0)
; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP2]], <4 x i32> [[TMP1]], i64 4)
; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 7, i32 5, i32 6, i32 4>
; CHECK-NEXT:    store <8 x i32> [[TMP4]], ptr [[P:%.*]], align 4
; CHECK-NEXT:    ret void
;
entry:
  %p1 = alloca [16 x i32], align 16
  %p2 = alloca [16 x i32], align 16
  %g10 = getelementptr inbounds [16 x i32], ptr %p1, i32 0, i64 4
  %g11 = getelementptr inbounds [16 x i32], ptr %p1, i32 0, i64 5
  %g12 = getelementptr inbounds [16 x i32], ptr %p1, i32 0, i64 6
  %g13 = getelementptr inbounds [16 x i32], ptr %p1, i32 0, i64 7
  %g20 = getelementptr inbounds [16 x i32], ptr %p2, i32 0, i64 12
  %g21 = getelementptr inbounds [16 x i32], ptr %p2, i32 0, i64 13
  %g22 = getelementptr inbounds [16 x i32], ptr %p2, i32 0, i64 14
  %g23 = getelementptr inbounds [16 x i32], ptr %p2, i32 0, i64 15
  ; Elements of %p1 are read in the order 5, 4, 6, 7 and stored to
  ; consecutive destination elements 0..3.
  %i1 = load i32, ptr %g11, align 4
  store i32 %i1, ptr %p, align 4
  %i3 = load i32, ptr %g10, align 4
  %arrayidx9 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 0, i64 1
  store i32 %i3, ptr %arrayidx9, align 4
  %i5 = load i32, ptr %g12, align 4
  %arrayidx16 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 0, i64 2
  store i32 %i5, ptr %arrayidx16, align 4
  %i7 = load i32, ptr %g13, align 4
  %arrayidx23 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 0, i64 3
  store i32 %i7, ptr %arrayidx23, align 4
  ; Elements of %p2 are read in the order 15, 13, 14, 12 and stored to
  ; consecutive destination elements 4..7.
  %i9 = load i32, ptr %g23, align 4
  %arrayidx30 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 0, i64 4
  store i32 %i9, ptr %arrayidx30, align 4
  %i11 = load i32, ptr %g21, align 4
  %arrayidx37 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 0, i64 5
  store i32 %i11, ptr %arrayidx37, align 4
  %i13 = load i32, ptr %g22, align 4
  %arrayidx44 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 0, i64 6
  store i32 %i13, ptr %arrayidx44, align 4
  %i15 = load i32, ptr %g20, align 4
  %arrayidx51 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 0, i64 7
  store i32 %i15, ptr %arrayidx51, align 4
  ret void
}

; Test for 4 load groups of 2 elements each, every group against a different
; alloca. Within each group the loads are in element order and the groups are
; stored back to back, so the expected output has no reordering shuffle: four
; <2 x i32> loads are combined with llvm.vector.insert at offsets 0, 2, 4, 6
; and written with a single <8 x i32> store.
define dso_local void @test_cost_splits(ptr nocapture %p) local_unnamed_addr {
; CHECK-LABEL: @test_cost_splits(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[P1:%.*]] = alloca [16 x i32], align 16
; CHECK-NEXT:    [[P2:%.*]] = alloca [16 x i32], align 16
; CHECK-NEXT:    [[P3:%.*]] = alloca [16 x i32], align 16
; CHECK-NEXT:    [[P4:%.*]] = alloca [16 x i32], align 16
; CHECK-NEXT:    [[G10:%.*]] = getelementptr inbounds [16 x i32], ptr [[P1]], i32 0, i64 4
; CHECK-NEXT:    [[G12:%.*]] = getelementptr inbounds [16 x i32], ptr [[P2]], i32 0, i64 6
; CHECK-NEXT:    [[G20:%.*]] = getelementptr inbounds [16 x i32], ptr [[P3]], i32 0, i64 12
; CHECK-NEXT:    [[G22:%.*]] = getelementptr inbounds [16 x i32], ptr [[P4]], i32 0, i64 14
; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[G10]], align 4
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[G12]], align 4
; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[G20]], align 4
; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, ptr [[G22]], align 4
; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> poison, <2 x i32> [[TMP0]], i64 0)
; CHECK-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP4]], <2 x i32> [[TMP1]], i64 2)
; CHECK-NEXT:    [[TMP6:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP5]], <2 x i32> [[TMP2]], i64 4)
; CHECK-NEXT:    [[TMP7:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP6]], <2 x i32> [[TMP3]], i64 6)
; CHECK-NEXT:    store <8 x i32> [[TMP7]], ptr [[P:%.*]], align 4
; CHECK-NEXT:    ret void
;
entry:
  %p1 = alloca [16 x i32], align 16
  %p2 = alloca [16 x i32], align 16
  %p3 = alloca [16 x i32], align 16
  %p4 = alloca [16 x i32], align 16
  %g10 = getelementptr inbounds [16 x i32], ptr %p1, i32 0, i64 4
  %g11 = getelementptr inbounds [16 x i32], ptr %p1, i32 0, i64 5
  %g12 = getelementptr inbounds [16 x i32], ptr %p2, i32 0, i64 6
  %g13 = getelementptr inbounds [16 x i32], ptr %p2, i32 0, i64 7
  %g20 = getelementptr inbounds [16 x i32], ptr %p3, i32 0, i64 12
  %g21 = getelementptr inbounds [16 x i32], ptr %p3, i32 0, i64 13
  %g22 = getelementptr inbounds [16 x i32], ptr %p4, i32 0, i64 14
  %g23 = getelementptr inbounds [16 x i32], ptr %p4, i32 0, i64 15
  %i1 = load i32, ptr %g10, align 4
  store i32 %i1, ptr %p, align 4
  %i3 = load i32, ptr %g11, align 4
  %arrayidx9 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 0, i64 1
  store i32 %i3, ptr %arrayidx9, align 4
  %i5 = load i32, ptr %g12, align 4
  %arrayidx16 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 0, i64 2
  store i32 %i5, ptr %arrayidx16, align 4
  %i7 = load i32, ptr %g13, align 4
  %arrayidx23 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 0, i64 3
  store i32 %i7, ptr %arrayidx23, align 4
  %i9 = load i32, ptr %g20, align 4
  %arrayidx30 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 0, i64 4
  store i32 %i9, ptr %arrayidx30, align 4
  %i11 = load i32, ptr %g21, align 4
  %arrayidx37 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 0, i64 5
  store i32 %i11, ptr %arrayidx37, align 4
  %i13 = load i32, ptr %g22, align 4
  %arrayidx44 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 0, i64 6
  store i32 %i13, ptr %arrayidx44, align 4
  %i15 = load i32, ptr %g23, align 4
  %arrayidx51 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 0, i64 7
  store i32 %i15, ptr %arrayidx51, align 4
  ret void
}