; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=slp-vectorizer,instcombine -S | FileCheck %s

; Each function below computes lane-wise binary operations on its inputs in
; scalar form and then adds neighbouring lanes of those results together.
; The assertions record the vector binops, shuffles and final vector op that
; the pass pipeline in the RUN line is expected to emit for each one.

target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64--linux-gnu"

; Result lane 0 is (v0+v1)[0] + (v0+v1)[1]; lane 1 is (v0-v1)[0] + (v0-v1)[1].
; The result is assembled with an insertelement chain.
define <2 x i64> @build_vec_v2i64(<2 x i64> %v0, <2 x i64> %v1) {
; CHECK-LABEL: @build_vec_v2i64(
; CHECK-NEXT:    [[TMP1:%.*]] = add <2 x i64> [[V0:%.*]], [[V1:%.*]]
; CHECK-NEXT:    [[TMP2:%.*]] = sub <2 x i64> [[V0]], [[V1]]
; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32> <i32 1, i32 2>
; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP4]], [[TMP3]]
; CHECK-NEXT:    ret <2 x i64> [[TMP5]]
;
  %v0.0 = extractelement <2 x i64> %v0, i32 0
  %v0.1 = extractelement <2 x i64> %v0, i32 1
  %v1.0 = extractelement <2 x i64> %v1, i32 0
  %v1.1 = extractelement <2 x i64> %v1, i32 1
  %tmp0.0 = add i64 %v0.0, %v1.0
  %tmp0.1 = add i64 %v0.1, %v1.1
  %tmp1.0 = sub i64 %v0.0, %v1.0
  %tmp1.1 = sub i64 %v0.1, %v1.1
  %tmp2.0 = add i64 %tmp0.0, %tmp0.1
  %tmp2.1 = add i64 %tmp1.0, %tmp1.1
  %tmp3.0 = insertelement <2 x i64> undef, i64 %tmp2.0, i32 0
  %tmp3.1 = insertelement <2 x i64> %tmp3.0, i64 %tmp2.1, i32 1
  ret <2 x i64> %tmp3.1
}

; Same arithmetic as @build_vec_v2i64, but the operands are loaded from two
; consecutive i64 slots at %a and %b, and the two results are stored to two
; consecutive i64 slots at %c instead of being returned.
define void @store_chain_v2i64(ptr %a, ptr %b, ptr %c) {
; CHECK-LABEL: @store_chain_v2i64(
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[B:%.*]], align 8
; CHECK-NEXT:    [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = sub <2 x i64> [[TMP1]], [[TMP2]]
; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i32> <i32 1, i32 2>
; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT:    [[TMP7:%.*]] = add <2 x i64> [[TMP6]], [[TMP5]]
; CHECK-NEXT:    store <2 x i64> [[TMP7]], ptr [[C:%.*]], align 8
; CHECK-NEXT:    ret void
;
  %a.1 = getelementptr i64, ptr %a, i64 1
  %b.1 = getelementptr i64, ptr %b, i64 1
  %c.1 = getelementptr i64, ptr %c, i64 1
  %v0.0 = load i64, ptr %a, align 8
  %v0.1 = load i64, ptr %a.1, align 8
  %v1.0 = load i64, ptr %b, align 8
  %v1.1 = load i64, ptr %b.1, align 8
  %tmp0.0 = add i64 %v0.0, %v1.0
  %tmp0.1 = add i64 %v0.1, %v1.1
  %tmp1.0 = sub i64 %v0.0, %v1.0
  %tmp1.1 = sub i64 %v0.1, %v1.1
  %tmp2.0 = add i64 %tmp0.0, %tmp0.1
  %tmp2.1 = add i64 %tmp1.0, %tmp1.1
  store i64 %tmp2.0, ptr %c, align 8
  store i64 %tmp2.1, ptr %c.1, align 8
  ret void
}

; Four-lane i32 variant: result lanes are, in order, add[0]+add[1],
; sub[0]+sub[1], add[2]+add[3] and sub[2]+sub[3], where add/sub are the
; lane-wise sum and difference of %v0 and %v1.
define <4 x i32> @build_vec_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
; CHECK-LABEL: @build_vec_v4i32(
; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[V0:%.*]], [[V1:%.*]]
; CHECK-NEXT:    [[TMP2:%.*]] = sub <4 x i32> [[V0]], [[V1]]
; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 3, i32 6>
; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 1, i32 4, i32 2, i32 7>
; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[TMP3]]
; CHECK-NEXT:    ret <4 x i32> [[TMP5]]
;
  %v0.0 = extractelement <4 x i32> %v0, i32 0
  %v0.1 = extractelement <4 x i32> %v0, i32 1
  %v0.2 = extractelement <4 x i32> %v0, i32 2
  %v0.3 = extractelement <4 x i32> %v0, i32 3
  %v1.0 = extractelement <4 x i32> %v1, i32 0
  %v1.1 = extractelement <4 x i32> %v1, i32 1
  %v1.2 = extractelement <4 x i32> %v1, i32 2
  %v1.3 = extractelement <4 x i32> %v1, i32 3
  %tmp0.0 = add i32 %v0.0, %v1.0
  %tmp0.1 = add i32 %v0.1, %v1.1
  %tmp0.2 = add i32 %v0.2, %v1.2
  %tmp0.3 = add i32 %v0.3, %v1.3
  %tmp1.0 = sub i32 %v0.0, %v1.0
  %tmp1.1 = sub i32 %v0.1, %v1.1
  %tmp1.2 = sub i32 %v0.2, %v1.2
  %tmp1.3 = sub i32 %v0.3, %v1.3
  %tmp2.0 = add i32 %tmp0.0, %tmp0.1
  %tmp2.1 = add i32 %tmp1.0, %tmp1.1
  %tmp2.2 = add i32 %tmp0.2, %tmp0.3
  %tmp2.3 = add i32 %tmp1.2, %tmp1.3
  %tmp3.0 = insertelement <4 x i32> undef, i32 %tmp2.0, i32 0
  %tmp3.1 = insertelement <4 x i32> %tmp3.0, i32 %tmp2.1, i32 1
  %tmp3.2 = insertelement <4 x i32> %tmp3.1, i32 %tmp2.2, i32 2
  %tmp3.3 = insertelement <4 x i32> %tmp3.2, i32 %tmp2.3, i32 3
  ret <4 x i32> %tmp3.3
}

; Two-lane inputs, four-lane result: the two scalar sums %tmp2.0 and %tmp2.1
; are each inserted twice (lanes 0/2 and lanes 1/3), so the expected output
; computes a <2 x i32> value and widens it with a <0, 1, 0, 1> shuffle.
define <4 x i32> @build_vec_v4i32_reuse_0(<2 x i32> %v0, <2 x i32> %v1) {
; CHECK-LABEL: @build_vec_v4i32_reuse_0(
; CHECK-NEXT:    [[TMP1:%.*]] = add <2 x i32> [[V0:%.*]], [[V1:%.*]]
; CHECK-NEXT:    [[TMP2:%.*]] = sub <2 x i32> [[V0]], [[V1]]
; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 1, i32 2>
; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i32> [[TMP4]], [[TMP3]]
; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; CHECK-NEXT:    ret <4 x i32> [[TMP6]]
;
  %v0.0 = extractelement <2 x i32> %v0, i32 0
  %v0.1 = extractelement <2 x i32> %v0, i32 1
  %v1.0 = extractelement <2 x i32> %v1, i32 0
  %v1.1 = extractelement <2 x i32> %v1, i32 1
  %tmp0.0 = add i32 %v0.0, %v1.0
  %tmp0.1 = add i32 %v0.1, %v1.1
  %tmp1.0 = sub i32 %v0.0, %v1.0
  %tmp1.1 = sub i32 %v0.1, %v1.1
  %tmp2.0 = add i32 %tmp0.0, %tmp0.1
  %tmp2.1 = add i32 %tmp1.0, %tmp1.1
  %tmp3.0 = insertelement <4 x i32> undef, i32 %tmp2.0, i32 0
  %tmp3.1 = insertelement <4 x i32> %tmp3.0, i32 %tmp2.1, i32 1
  %tmp3.2 = insertelement <4 x i32> %tmp3.1, i32 %tmp2.0, i32 2
  %tmp3.3 = insertelement <4 x i32> %tmp3.2, i32 %tmp2.1, i32 3
  ret <4 x i32> %tmp3.3
}

; Lane-wise add and xor of the two-lane inputs feed four subtractions:
; lanes 0 and 1 are both add[0]-add[1], lane 2 is xor[0]-xor[1] and lane 3 is
; xor[1]-xor[0]. The expected output keeps one scalar add (of lane 1) that is
; inserted back into the vector operand of the final sub.
define <4 x i32> @build_vec_v4i32_reuse_1(<2 x i32> %v0, <2 x i32> %v1) {
; CHECK-LABEL: @build_vec_v4i32_reuse_1(
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
; CHECK-NEXT:    [[V0_1:%.*]] = extractelement <2 x i32> [[V0]], i64 1
; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[V1:%.*]], <2 x i32> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
; CHECK-NEXT:    [[V1_1:%.*]] = extractelement <2 x i32> [[V1]], i64 1
; CHECK-NEXT:    [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]]
; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i32> [[TMP1]], [[TMP2]]
; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 2>
; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP0_1]], i64 0
; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
; CHECK-NEXT:    [[TMP9:%.*]] = sub <4 x i32> [[TMP5]], [[TMP8]]
; CHECK-NEXT:    ret <4 x i32> [[TMP9]]
;
  %v0.0 = extractelement <2 x i32> %v0, i32 0
  %v0.1 = extractelement <2 x i32> %v0, i32 1
  %v1.0 = extractelement <2 x i32> %v1, i32 0
  %v1.1 = extractelement <2 x i32> %v1, i32 1
  %tmp0.0 = add i32 %v0.0, %v1.0
  %tmp0.1 = add i32 %v0.1, %v1.1
  %tmp0.2 = xor i32 %v0.0, %v1.0
  %tmp0.3 = xor i32 %v0.1, %v1.1
  %tmp1.0 = sub i32 %tmp0.0, %tmp0.1
  %tmp1.1 = sub i32 %tmp0.0, %tmp0.1
  %tmp1.2 = sub i32 %tmp0.2, %tmp0.3
  %tmp1.3 = sub i32 %tmp0.3, %tmp0.2
  %tmp2.0 = insertelement <4 x i32> undef, i32 %tmp1.0, i32 0
  %tmp2.1 = insertelement <4 x i32> %tmp2.0, i32 %tmp1.1, i32 1
  %tmp2.2 = insertelement <4 x i32> %tmp2.1, i32 %tmp1.2, i32 2
  %tmp2.3 = insertelement <4 x i32> %tmp2.2, i32 %tmp1.3, i32 3
  ret <4 x i32> %tmp2.3
}

; Three different opcodes (add, mul, xor) on two-lane inputs. Result lanes are
; add[0]+add[1], mul[0]+mul[1], and xor[0]+xor[1] twice (the xor pair is
; computed by two separate sets of scalar instructions).
define <4 x i32> @build_vec_v4i32_3_binops(<2 x i32> %v0, <2 x i32> %v1) {
; CHECK-LABEL: @build_vec_v4i32_3_binops(
; CHECK-NEXT:    [[TMP1:%.*]] = add <2 x i32> [[V0:%.*]], [[V1:%.*]]
; CHECK-NEXT:    [[TMP2:%.*]] = mul <2 x i32> [[V0]], [[V1]]
; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 1, i32 2>
; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i32> [[TMP4]], [[TMP3]]
; CHECK-NEXT:    [[TMP6:%.*]] = xor <2 x i32> [[V0]], [[V1]]
; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
; CHECK-NEXT:    [[TMP8:%.*]] = xor <2 x i32> [[V0]], [[V1]]
; CHECK-NEXT:    [[TMP9:%.*]] = add <2 x i32> [[TMP7]], [[TMP8]]
; CHECK-NEXT:    [[TMP3_31:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT:    ret <4 x i32> [[TMP3_31]]
;
  %v0.0 = extractelement <2 x i32> %v0, i32 0
  %v0.1 = extractelement <2 x i32> %v0, i32 1
  %v1.0 = extractelement <2 x i32> %v1, i32 0
  %v1.1 = extractelement <2 x i32> %v1, i32 1
  %tmp0.0 = add i32 %v0.0, %v1.0
  %tmp0.1 = add i32 %v0.1, %v1.1
  %tmp0.2 = xor i32 %v0.0, %v1.0
  %tmp0.3 = xor i32 %v0.1, %v1.1
  %tmp1.0 = mul i32 %v0.0, %v1.0
  %tmp1.1 = mul i32 %v0.1, %v1.1
  %tmp1.2 = xor i32 %v0.0, %v1.0
  %tmp1.3 = xor i32 %v0.1, %v1.1
  %tmp2.0 = add i32 %tmp0.0, %tmp0.1
  %tmp2.1 = add i32 %tmp1.0, %tmp1.1
  %tmp2.2 = add i32 %tmp0.2, %tmp0.3
  %tmp2.3 = add i32 %tmp1.2, %tmp1.3
  %tmp3.0 = insertelement <4 x i32> undef, i32 %tmp2.0, i32 0
  %tmp3.1 = insertelement <4 x i32> %tmp3.0, i32 %tmp2.1, i32 1
  %tmp3.2 = insertelement <4 x i32> %tmp3.1, i32 %tmp2.2, i32 2
  %tmp3.3 = insertelement <4 x i32> %tmp3.2, i32 %tmp2.3, i32 3
  ret <4 x i32> %tmp3.3
}

; Same add/sub pair sums as @build_vec_v4i32, followed per lane by
; lshr 15, and 65537, mul nuw 65535, an add with the pair sum and an xor with
; the product; the four lane results are then summed into a single i32.
define i32 @reduction_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
; CHECK-LABEL: @reduction_v4i32(
; CHECK-NEXT:    [[TMP1:%.*]] = sub <4 x i32> [[V0:%.*]], [[V1:%.*]]
; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[V0]], [[V1]]
; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 7, i32 2>
; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 1, i32 4, i32 6, i32 3>
; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[TMP3]]
; CHECK-NEXT:    [[TMP6:%.*]] = lshr <4 x i32> [[TMP5]], splat (i32 15)
; CHECK-NEXT:    [[TMP7:%.*]] = and <4 x i32> [[TMP6]], splat (i32 65537)
; CHECK-NEXT:    [[TMP8:%.*]] = mul nuw <4 x i32> [[TMP7]], splat (i32 65535)
; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i32> [[TMP8]], [[TMP5]]
; CHECK-NEXT:    [[TMP10:%.*]] = xor <4 x i32> [[TMP9]], [[TMP8]]
; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP10]])
; CHECK-NEXT:    ret i32 [[TMP11]]
;
  %v0.0 = extractelement <4 x i32> %v0, i32 0
  %v0.1 = extractelement <4 x i32> %v0, i32 1
  %v0.2 = extractelement <4 x i32> %v0, i32 2
  %v0.3 = extractelement <4 x i32> %v0, i32 3
  %v1.0 = extractelement <4 x i32> %v1, i32 0
  %v1.1 = extractelement <4 x i32> %v1, i32 1
  %v1.2 = extractelement <4 x i32> %v1, i32 2
  %v1.3 = extractelement <4 x i32> %v1, i32 3
  %tmp0.0 = add i32 %v0.0, %v1.0
  %tmp0.1 = add i32 %v0.1, %v1.1
  %tmp0.2 = add i32 %v0.2, %v1.2
  %tmp0.3 = add i32 %v0.3, %v1.3
  %tmp1.0 = sub i32 %v0.0, %v1.0
  %tmp1.1 = sub i32 %v0.1, %v1.1
  %tmp1.2 = sub i32 %v0.2, %v1.2
  %tmp1.3 = sub i32 %v0.3, %v1.3
  %tmp2.0 = add i32 %tmp0.0, %tmp0.1
  %tmp2.1 = add i32 %tmp1.0, %tmp1.1
  %tmp2.2 = add i32 %tmp0.2, %tmp0.3
  %tmp2.3 = add i32 %tmp1.2, %tmp1.3
  %tmp3.0 = lshr i32 %tmp2.0, 15
  %tmp3.1 = lshr i32 %tmp2.1, 15
  %tmp3.2 = lshr i32 %tmp2.2, 15
  %tmp3.3 = lshr i32 %tmp2.3, 15
  %tmp4.0 = and i32 %tmp3.0, 65537
  %tmp4.1 = and i32 %tmp3.1, 65537
  %tmp4.2 = and i32 %tmp3.2, 65537
  %tmp4.3 = and i32 %tmp3.3, 65537
  %tmp5.0 = mul nuw i32 %tmp4.0, 65535
  %tmp5.1 = mul nuw i32 %tmp4.1, 65535
  %tmp5.2 = mul nuw i32 %tmp4.2, 65535
  %tmp5.3 = mul nuw i32 %tmp4.3, 65535
  %tmp6.0 = add i32 %tmp5.0, %tmp2.0
  %tmp6.1 = add i32 %tmp5.1, %tmp2.1
  %tmp6.2 = add i32 %tmp5.2, %tmp2.2
  %tmp6.3 = add i32 %tmp5.3, %tmp2.3
  %tmp7.0 = xor i32 %tmp6.0, %tmp5.0
  %tmp7.1 = xor i32 %tmp6.1, %tmp5.1
  %tmp7.2 = xor i32 %tmp6.2, %tmp5.2
  %tmp7.3 = xor i32 %tmp6.3, %tmp5.3
  %reduce.0 = add i32 %tmp7.1, %tmp7.0
  %reduce.1 = add i32 %reduce.0, %tmp7.2
  %reduce.2 = add i32 %reduce.1, %tmp7.3
  ret i32 %reduce.2
}