; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2 -mtriple=riscv64 -mattr=+v -S %s | FileCheck --check-prefixes=CHECK,NON-POW2 %s
; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -mtriple=riscv64 -mattr=+v -S %s | FileCheck --check-prefixes=CHECK,POW2-ONLY %s

; Load three consecutive i32s, multiply each by a constant and store the
; results. With non-power-of-2 vectorization this becomes a single <3 x i32>
; chain; otherwise a <2 x i32> part plus a scalar tail element.
define void @v3_load_i32_mul_by_constant_store(ptr %src, ptr %dst) {
; NON-POW2-LABEL: @v3_load_i32_mul_by_constant_store(
; NON-POW2-NEXT:  entry:
; NON-POW2-NEXT:    [[GEP_SRC_0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 0
; NON-POW2-NEXT:    [[TMP0:%.*]] = load <3 x i32>, ptr [[GEP_SRC_0]], align 4
; NON-POW2-NEXT:    [[TMP1:%.*]] = mul nsw <3 x i32> [[TMP0]], splat (i32 10)
; NON-POW2-NEXT:    store <3 x i32> [[TMP1]], ptr [[DST:%.*]], align 4
; NON-POW2-NEXT:    ret void
;
; POW2-ONLY-LABEL: @v3_load_i32_mul_by_constant_store(
; POW2-ONLY-NEXT:  entry:
; POW2-ONLY-NEXT:    [[GEP_SRC_0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 0
; POW2-ONLY-NEXT:    [[GEP_SRC_2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 2
; POW2-ONLY-NEXT:    [[L_SRC_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 4
; POW2-ONLY-NEXT:    [[MUL_2:%.*]] = mul nsw i32 [[L_SRC_2]], 10
; POW2-ONLY-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[GEP_SRC_0]], align 4
; POW2-ONLY-NEXT:    [[TMP1:%.*]] = mul nsw <2 x i32> [[TMP0]], splat (i32 10)
; POW2-ONLY-NEXT:    store <2 x i32> [[TMP1]], ptr [[DST:%.*]], align 4
; POW2-ONLY-NEXT:    [[DST_2:%.*]] = getelementptr i32, ptr [[DST]], i32 2
; POW2-ONLY-NEXT:    store i32 [[MUL_2]], ptr [[DST_2]], align 4
; POW2-ONLY-NEXT:    ret void
;
entry:
  %gep.src.0 = getelementptr inbounds i32, ptr %src, i32 0
  %l.src.0 = load i32, ptr %gep.src.0, align 4
  %mul.0 = mul nsw i32 %l.src.0, 10

  %gep.src.1 = getelementptr inbounds i32, ptr %src, i32 1
  %l.src.1 = load i32, ptr %gep.src.1, align 4
  %mul.1 = mul nsw i32 %l.src.1, 10

  %gep.src.2 = getelementptr inbounds i32, ptr %src, i32 2
  %l.src.2 = load i32, ptr %gep.src.2, align 4
  %mul.2 = mul nsw i32 %l.src.2, 10

  store i32 %mul.0, ptr %dst

  %dst.1 = getelementptr i32, ptr %dst, i32 1
  store i32 %mul.1, ptr %dst.1

  %dst.2 = getelementptr i32, ptr %dst, i32 2
  store i32 %mul.2, ptr %dst.2

  ret void
}

; Should not be vectorized with an undef/poison element as padding, as
; division by undef/poison may cause UB. Must use VL predication or
; masking instead, where RISCV wins.
define void @v3_load_i32_udiv_by_constant_store(ptr %src, ptr %dst) {
; NON-POW2-LABEL: @v3_load_i32_udiv_by_constant_store(
; NON-POW2-NEXT:  entry:
; NON-POW2-NEXT:    [[GEP_SRC_0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 0
; NON-POW2-NEXT:    [[TMP0:%.*]] = load <3 x i32>, ptr [[GEP_SRC_0]], align 4
; NON-POW2-NEXT:    [[TMP1:%.*]] = udiv <3 x i32> splat (i32 10), [[TMP0]]
; NON-POW2-NEXT:    store <3 x i32> [[TMP1]], ptr [[DST:%.*]], align 4
; NON-POW2-NEXT:    ret void
;
; POW2-ONLY-LABEL: @v3_load_i32_udiv_by_constant_store(
; POW2-ONLY-NEXT:  entry:
; POW2-ONLY-NEXT:    [[GEP_SRC_0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 0
; POW2-ONLY-NEXT:    [[L_SRC_0:%.*]] = load i32, ptr [[GEP_SRC_0]], align 4
; POW2-ONLY-NEXT:    [[MUL_0:%.*]] = udiv i32 10, [[L_SRC_0]]
; POW2-ONLY-NEXT:    [[GEP_SRC_1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 1
; POW2-ONLY-NEXT:    [[L_SRC_1:%.*]] = load i32, ptr [[GEP_SRC_1]], align 4
; POW2-ONLY-NEXT:    [[MUL_1:%.*]] = udiv i32 10, [[L_SRC_1]]
; POW2-ONLY-NEXT:    [[GEP_SRC_2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 2
; POW2-ONLY-NEXT:    [[L_SRC_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 4
; POW2-ONLY-NEXT:    [[MUL_2:%.*]] = udiv i32 10, [[L_SRC_2]]
; POW2-ONLY-NEXT:    store i32 [[MUL_0]], ptr [[DST:%.*]], align 4
; POW2-ONLY-NEXT:    [[DST_1:%.*]] = getelementptr i32, ptr [[DST]], i32 1
; POW2-ONLY-NEXT:    store i32 [[MUL_1]], ptr [[DST_1]], align 4
; POW2-ONLY-NEXT:    [[DST_2:%.*]] = getelementptr i32, ptr [[DST]], i32 2
; POW2-ONLY-NEXT:    store i32 [[MUL_2]], ptr [[DST_2]], align 4
; POW2-ONLY-NEXT:    ret void
;
entry:
  %gep.src.0 = getelementptr inbounds i32, ptr %src, i32 0
  %l.src.0 = load i32, ptr %gep.src.0, align 4
  %mul.0 = udiv i32 10, %l.src.0

  %gep.src.1 = getelementptr inbounds i32, ptr %src, i32 1
  %l.src.1 = load i32, ptr %gep.src.1, align 4
  %mul.1 = udiv i32 10, %l.src.1

  %gep.src.2 = getelementptr inbounds i32, ptr %src, i32 2
  %l.src.2 = load i32, ptr %gep.src.2, align 4
  %mul.2 = udiv i32 10, %l.src.2

  store i32 %mul.0, ptr %dst

  %dst.1 = getelementptr i32, ptr %dst, i32 1
  store i32 %mul.1, ptr %dst.1

  %dst.2 = getelementptr i32, ptr %dst, i32 2
  store i32 %mul.2, ptr %dst.2

  ret void
}



; Same as @v3_load_i32_mul_by_constant_store, but with a variable (loaded)
; multiplier from a second source pointer.
define void @v3_load_i32_mul_store(ptr %src.1, ptr %src.2, ptr %dst) {
; NON-POW2-LABEL: @v3_load_i32_mul_store(
; NON-POW2-NEXT:  entry:
; NON-POW2-NEXT:    [[GEP_SRC_1_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_1:%.*]], i32 0
; NON-POW2-NEXT:    [[GEP_SRC_2_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_2:%.*]], i32 0
; NON-POW2-NEXT:    [[TMP0:%.*]] = load <3 x i32>, ptr [[GEP_SRC_1_0]], align 4
; NON-POW2-NEXT:    [[TMP1:%.*]] = load <3 x i32>, ptr [[GEP_SRC_2_0]], align 4
; NON-POW2-NEXT:    [[TMP2:%.*]] = mul nsw <3 x i32> [[TMP0]], [[TMP1]]
; NON-POW2-NEXT:    store <3 x i32> [[TMP2]], ptr [[DST:%.*]], align 4
; NON-POW2-NEXT:    ret void
;
; POW2-ONLY-LABEL: @v3_load_i32_mul_store(
; POW2-ONLY-NEXT:  entry:
; POW2-ONLY-NEXT:    [[GEP_SRC_1_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_1:%.*]], i32 0
; POW2-ONLY-NEXT:    [[GEP_SRC_2_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_2:%.*]], i32 0
; POW2-ONLY-NEXT:    [[GEP_SRC_1_2:%.*]] = getelementptr inbounds i32, ptr [[SRC_1]], i32 2
; POW2-ONLY-NEXT:    [[L_SRC_1_2:%.*]] = load i32, ptr [[GEP_SRC_1_2]], align 4
; POW2-ONLY-NEXT:    [[GEP_SRC_2_2:%.*]] = getelementptr inbounds i32, ptr [[SRC_2]], i32 2
; POW2-ONLY-NEXT:    [[L_SRC_2_2:%.*]] = load i32, ptr [[GEP_SRC_2_2]], align 4
; POW2-ONLY-NEXT:    [[MUL_2:%.*]] = mul nsw i32 [[L_SRC_1_2]], [[L_SRC_2_2]]
; POW2-ONLY-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[GEP_SRC_1_0]], align 4
; POW2-ONLY-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[GEP_SRC_2_0]], align 4
; POW2-ONLY-NEXT:    [[TMP2:%.*]] = mul nsw <2 x i32> [[TMP0]], [[TMP1]]
; POW2-ONLY-NEXT:    store <2 x i32> [[TMP2]], ptr [[DST:%.*]], align 4
; POW2-ONLY-NEXT:    [[DST_2:%.*]] = getelementptr i32, ptr [[DST]], i32 2
; POW2-ONLY-NEXT:    store i32 [[MUL_2]], ptr [[DST_2]], align 4
; POW2-ONLY-NEXT:    ret void
;
entry:
  %gep.src.1.0 = getelementptr inbounds i32, ptr %src.1, i32 0
  %l.src.1.0 = load i32, ptr %gep.src.1.0, align 4
  %gep.src.2.0 = getelementptr inbounds i32, ptr %src.2, i32 0
  %l.src.2.0 = load i32, ptr %gep.src.2.0, align 4
  %mul.0 = mul nsw i32 %l.src.1.0, %l.src.2.0

  %gep.src.1.1 = getelementptr inbounds i32, ptr %src.1, i32 1
  %l.src.1.1 = load i32, ptr %gep.src.1.1, align 4
  %gep.src.2.1 = getelementptr inbounds i32, ptr %src.2, i32 1
  %l.src.2.1 = load i32, ptr %gep.src.2.1, align 4
  %mul.1 = mul nsw i32 %l.src.1.1, %l.src.2.1

  %gep.src.1.2 = getelementptr inbounds i32, ptr %src.1, i32 2
  %l.src.1.2 = load i32, ptr %gep.src.1.2, align 4
  %gep.src.2.2 = getelementptr inbounds i32, ptr %src.2, i32 2
  %l.src.2.2 = load i32, ptr %gep.src.2.2, align 4
  %mul.2 = mul nsw i32 %l.src.1.2, %l.src.2.2

  store i32 %mul.0, ptr %dst

  %dst.1 = getelementptr i32, ptr %dst, i32 1
  store i32 %mul.1, ptr %dst.1

  %dst.2 = getelementptr i32, ptr %dst, i32 2
  store i32 %mul.2, ptr %dst.2

  ret void
}

; mul followed by an add of a constant; both operations vectorize together.
define void @v3_load_i32_mul_add_const_store(ptr %src.1, ptr %src.2, ptr %dst) {
; NON-POW2-LABEL: @v3_load_i32_mul_add_const_store(
; NON-POW2-NEXT:  entry:
; NON-POW2-NEXT:    [[GEP_SRC_1_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_1:%.*]], i32 0
; NON-POW2-NEXT:    [[GEP_SRC_2_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_2:%.*]], i32 0
; NON-POW2-NEXT:    [[TMP0:%.*]] = load <3 x i32>, ptr [[GEP_SRC_1_0]], align 4
; NON-POW2-NEXT:    [[TMP1:%.*]] = load <3 x i32>, ptr [[GEP_SRC_2_0]], align 4
; NON-POW2-NEXT:    [[TMP2:%.*]] = mul nsw <3 x i32> [[TMP0]], [[TMP1]]
; NON-POW2-NEXT:    [[TMP3:%.*]] = add <3 x i32> [[TMP2]], splat (i32 9)
; NON-POW2-NEXT:    store <3 x i32> [[TMP3]], ptr [[DST:%.*]], align 4
; NON-POW2-NEXT:    ret void
;
; POW2-ONLY-LABEL: @v3_load_i32_mul_add_const_store(
; POW2-ONLY-NEXT:  entry:
; POW2-ONLY-NEXT:    [[GEP_SRC_1_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_1:%.*]], i32 0
; POW2-ONLY-NEXT:    [[GEP_SRC_2_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_2:%.*]], i32 0
; POW2-ONLY-NEXT:    [[GEP_SRC_1_2:%.*]] = getelementptr inbounds i32, ptr [[SRC_1]], i32 2
; POW2-ONLY-NEXT:    [[L_SRC_1_2:%.*]] = load i32, ptr [[GEP_SRC_1_2]], align 4
; POW2-ONLY-NEXT:    [[GEP_SRC_2_2:%.*]] = getelementptr inbounds i32, ptr [[SRC_2]], i32 2
; POW2-ONLY-NEXT:    [[L_SRC_2_2:%.*]] = load i32, ptr [[GEP_SRC_2_2]], align 4
; POW2-ONLY-NEXT:    [[MUL_2:%.*]] = mul nsw i32 [[L_SRC_1_2]], [[L_SRC_2_2]]
; POW2-ONLY-NEXT:    [[ADD_2:%.*]] = add i32 [[MUL_2]], 9
; POW2-ONLY-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[GEP_SRC_1_0]], align 4
; POW2-ONLY-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[GEP_SRC_2_0]], align 4
; POW2-ONLY-NEXT:    [[TMP2:%.*]] = mul nsw <2 x i32> [[TMP0]], [[TMP1]]
; POW2-ONLY-NEXT:    [[TMP3:%.*]] = add <2 x i32> [[TMP2]], splat (i32 9)
; POW2-ONLY-NEXT:    store <2 x i32> [[TMP3]], ptr [[DST:%.*]], align 4
; POW2-ONLY-NEXT:    [[DST_2:%.*]] = getelementptr i32, ptr [[DST]], i32 2
; POW2-ONLY-NEXT:    store i32 [[ADD_2]], ptr [[DST_2]], align 4
; POW2-ONLY-NEXT:    ret void
;
entry:
  %gep.src.1.0 = getelementptr inbounds i32, ptr %src.1, i32 0
  %l.src.1.0 = load i32, ptr %gep.src.1.0, align 4
  %gep.src.2.0 = getelementptr inbounds i32, ptr %src.2, i32 0
  %l.src.2.0 = load i32, ptr %gep.src.2.0, align 4
  %mul.0 = mul nsw i32 %l.src.1.0, %l.src.2.0
  %add.0 = add i32 %mul.0, 9

  %gep.src.1.1 = getelementptr inbounds i32, ptr %src.1, i32 1
  %l.src.1.1 = load i32, ptr %gep.src.1.1, align 4
  %gep.src.2.1 = getelementptr inbounds i32, ptr %src.2, i32 1
  %l.src.2.1 = load i32, ptr %gep.src.2.1, align 4
  %mul.1 = mul nsw i32 %l.src.1.1, %l.src.2.1
  %add.1 = add i32 %mul.1, 9

  %gep.src.1.2 = getelementptr inbounds i32, ptr %src.1, i32 2
  %l.src.1.2 = load i32, ptr %gep.src.1.2, align 4
  %gep.src.2.2 = getelementptr inbounds i32, ptr %src.2, i32 2
  %l.src.2.2 = load i32, ptr %gep.src.2.2, align 4
  %mul.2 = mul nsw i32 %l.src.1.2, %l.src.2.2
  %add.2 = add i32 %mul.2, 9

  store i32 %add.0, ptr %dst

  %dst.1 = getelementptr i32, ptr %dst, i32 1
  store i32 %add.1, ptr %dst.1

  %dst.2 = getelementptr i32, ptr %dst, i32 2
  store i32 %add.2, ptr %dst.2

  ret void
}

; Floating-point variant: fadd with a constant splat on 3 loaded floats.
define void @v3_load_f32_fadd_fadd_by_constant_store(ptr %src, ptr %dst) {
; NON-POW2-LABEL: @v3_load_f32_fadd_fadd_by_constant_store(
; NON-POW2-NEXT:  entry:
; NON-POW2-NEXT:    [[GEP_SRC_0:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i32 0
; NON-POW2-NEXT:    [[TMP0:%.*]] = load <3 x float>, ptr [[GEP_SRC_0]], align 4
; NON-POW2-NEXT:    [[TMP1:%.*]] = fadd <3 x float> [[TMP0]], splat (float 1.000000e+01)
; NON-POW2-NEXT:    store <3 x float> [[TMP1]], ptr [[DST:%.*]], align 4
; NON-POW2-NEXT:    ret void
;
; POW2-ONLY-LABEL: @v3_load_f32_fadd_fadd_by_constant_store(
; POW2-ONLY-NEXT:  entry:
; POW2-ONLY-NEXT:    [[GEP_SRC_0:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i32 0
; POW2-ONLY-NEXT:    [[GEP_SRC_2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i32 2
; POW2-ONLY-NEXT:    [[L_SRC_2:%.*]] = load float, ptr [[GEP_SRC_2]], align 4
; POW2-ONLY-NEXT:    [[FADD_2:%.*]] = fadd float [[L_SRC_2]], 1.000000e+01
; POW2-ONLY-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[GEP_SRC_0]], align 4
; POW2-ONLY-NEXT:    [[TMP1:%.*]] = fadd <2 x float> [[TMP0]], splat (float 1.000000e+01)
; POW2-ONLY-NEXT:    store <2 x float> [[TMP1]], ptr [[DST:%.*]], align 4
; POW2-ONLY-NEXT:    [[DST_2:%.*]] = getelementptr float, ptr [[DST]], i32 2
; POW2-ONLY-NEXT:    store float [[FADD_2]], ptr [[DST_2]], align 4
; POW2-ONLY-NEXT:    ret void
;
entry:
  %gep.src.0 = getelementptr inbounds float, ptr %src, i32 0
  %l.src.0 = load float , ptr %gep.src.0, align 4
  %fadd.0 = fadd float %l.src.0, 10.0

  %gep.src.1 = getelementptr inbounds float , ptr %src, i32 1
  %l.src.1 = load float, ptr %gep.src.1, align 4
  %fadd.1 = fadd float %l.src.1, 10.0

  %gep.src.2 = getelementptr inbounds float, ptr %src, i32 2
  %l.src.2 = load float, ptr %gep.src.2, align 4
  %fadd.2 = fadd float %l.src.2, 10.0

  store float %fadd.0, ptr %dst

  %dst.1 = getelementptr float, ptr %dst, i32 1
  store float %fadd.1, ptr %dst.1

  %dst.2 = getelementptr float, ptr %dst, i32 2
  store float %fadd.2, ptr %dst.2

  ret void
}

; Stores of phi values: a single <3 x i32> phi with NON-POW2, a <2 x i32>
; phi plus a scalar phi with POW2-ONLY.
define void @phi_store3(ptr %dst) {
; NON-POW2-LABEL: @phi_store3(
; NON-POW2-NEXT:  entry:
; NON-POW2-NEXT:    br label [[EXIT:%.*]]
; NON-POW2:       invoke.cont8.loopexit:
; NON-POW2-NEXT:    br label [[EXIT]]
; NON-POW2:       exit:
; NON-POW2-NEXT:    [[TMP0:%.*]] = phi <3 x i32> [ <i32 1, i32 2, i32 3>, [[ENTRY:%.*]] ], [ poison, [[INVOKE_CONT8_LOOPEXIT:%.*]] ]
; NON-POW2-NEXT:    store <3 x i32> [[TMP0]], ptr [[DST:%.*]], align 4
; NON-POW2-NEXT:    ret void
;
; POW2-ONLY-LABEL: @phi_store3(
; POW2-ONLY-NEXT:  entry:
; POW2-ONLY-NEXT:    br label [[EXIT:%.*]]
; POW2-ONLY:       invoke.cont8.loopexit:
; POW2-ONLY-NEXT:    br label [[EXIT]]
; POW2-ONLY:       exit:
; POW2-ONLY-NEXT:    [[P_2:%.*]] = phi i32 [ 3, [[ENTRY:%.*]] ], [ 0, [[INVOKE_CONT8_LOOPEXIT:%.*]] ]
; POW2-ONLY-NEXT:    [[TMP0:%.*]] = phi <2 x i32> [ <i32 1, i32 2>, [[ENTRY]] ], [ poison, [[INVOKE_CONT8_LOOPEXIT]] ]
; POW2-ONLY-NEXT:    [[DST_2:%.*]] = getelementptr i32, ptr [[DST:%.*]], i32 2
; POW2-ONLY-NEXT:    store <2 x i32> [[TMP0]], ptr [[DST]], align 4
; POW2-ONLY-NEXT:    store i32 [[P_2]], ptr [[DST_2]], align 4
; POW2-ONLY-NEXT:    ret void
;
entry:
  br label %exit

invoke.cont8.loopexit:                            ; No predecessors!
  br label %exit

exit:
  %p.0 = phi i32 [ 1, %entry ], [ 0, %invoke.cont8.loopexit ]
  %p.1 = phi i32 [ 2, %entry ], [ 0, %invoke.cont8.loopexit ]
  %p.2 = phi i32 [ 3, %entry ], [ 0, %invoke.cont8.loopexit ]

  %dst.1 = getelementptr i32, ptr %dst, i32 1
  %dst.2 = getelementptr i32, ptr %dst, i32 2

  store i32 %p.0, ptr %dst, align 4
  store i32 %p.1, ptr %dst.1, align 4
  store i32 %p.2, ptr %dst.2, align 4
  ret void
}

; Stores of constant-folded zeros; NON-POW2 merges all three stores into a
; single <3 x i32> zeroinitializer store.
define void @store_try_reorder(ptr %dst) {
; NON-POW2-LABEL: @store_try_reorder(
; NON-POW2-NEXT:  entry:
; NON-POW2-NEXT:    store <3 x i32> zeroinitializer, ptr [[DST:%.*]], align 4
; NON-POW2-NEXT:    ret void
;
; POW2-ONLY-LABEL: @store_try_reorder(
; POW2-ONLY-NEXT:  entry:
; POW2-ONLY-NEXT:    [[ADD:%.*]] = add i32 0, 0
; POW2-ONLY-NEXT:    store i32 [[ADD]], ptr [[DST:%.*]], align 4
; POW2-ONLY-NEXT:    [[ARRAYIDX_I1887:%.*]] = getelementptr i32, ptr [[DST]], i64 1
; POW2-ONLY-NEXT:    store <2 x i32> zeroinitializer, ptr [[ARRAYIDX_I1887]], align 4
; POW2-ONLY-NEXT:    ret void
;
entry:
  %add = add i32 0, 0
  store i32 %add, ptr %dst, align 4
  %add207 = sub i32 0, 0
  %arrayidx.i1887 = getelementptr i32, ptr %dst, i64 1
  store i32 %add207, ptr %arrayidx.i1887, align 4
  %add216 = sub i32 0, 0
  %arrayidx.i1891 = getelementptr i32, ptr %dst, i64 2
  store i32 %add216, ptr %arrayidx.i1891, align 4
  ret void
}

; fpext/fmuladd/fptrunc chains; exercises the vectorization cost of fpext.
define void @vec3_fpext_cost(ptr %Colour, float %0) {
; NON-POW2-LABEL: @vec3_fpext_cost(
; NON-POW2-NEXT:  entry:
; NON-POW2-NEXT:    [[TMP1:%.*]] = insertelement <3 x float> poison, float [[TMP0:%.*]], i32 0
; NON-POW2-NEXT:    [[TMP2:%.*]] = shufflevector <3 x float> [[TMP1]], <3 x float> poison, <3 x i32> zeroinitializer
; NON-POW2-NEXT:    [[TMP3:%.*]] = fpext <3 x float> [[TMP2]] to <3 x double>
; NON-POW2-NEXT:    [[TMP4:%.*]] = call <3 x double> @llvm.fmuladd.v3f64(<3 x double> [[TMP3]], <3 x double> zeroinitializer, <3 x double> zeroinitializer)
; NON-POW2-NEXT:    [[TMP5:%.*]] = fptrunc <3 x double> [[TMP4]] to <3 x float>
; NON-POW2-NEXT:    store <3 x float> [[TMP5]], ptr [[COLOUR:%.*]], align 4
; NON-POW2-NEXT:    ret void
;
; POW2-ONLY-LABEL: @vec3_fpext_cost(
; POW2-ONLY-NEXT:  entry:
; POW2-ONLY-NEXT:    [[ARRAYIDX80:%.*]] = getelementptr float, ptr [[COLOUR:%.*]], i64 2
; POW2-ONLY-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> poison, float [[TMP0:%.*]], i32 0
; POW2-ONLY-NEXT:    [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <2 x i32> zeroinitializer
; POW2-ONLY-NEXT:    [[TMP3:%.*]] = fpext <2 x float> [[TMP2]] to <2 x double>
; POW2-ONLY-NEXT:    [[TMP4:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP3]], <2 x double> zeroinitializer, <2 x double> zeroinitializer)
; POW2-ONLY-NEXT:    [[TMP5:%.*]] = fptrunc <2 x double> [[TMP4]] to <2 x float>
; POW2-ONLY-NEXT:    store <2 x float> [[TMP5]], ptr [[COLOUR]], align 4
; POW2-ONLY-NEXT:    [[CONV78:%.*]] = fpext float [[TMP0]] to double
; POW2-ONLY-NEXT:    [[TMP6:%.*]] = call double @llvm.fmuladd.f64(double [[CONV78]], double 0.000000e+00, double 0.000000e+00)
; POW2-ONLY-NEXT:    [[CONV82:%.*]] = fptrunc double [[TMP6]] to float
; POW2-ONLY-NEXT:    store float [[CONV82]], ptr [[ARRAYIDX80]], align 4
; POW2-ONLY-NEXT:    ret void
;
entry:
  %arrayidx72 = getelementptr float, ptr %Colour, i64 1
  %arrayidx80 = getelementptr float, ptr %Colour, i64 2
  %conv62 = fpext float %0 to double
  %1 = call double @llvm.fmuladd.f64(double %conv62, double 0.000000e+00, double 0.000000e+00)
  %conv66 = fptrunc double %1 to float
  store float %conv66, ptr %Colour, align 4
  %conv70 = fpext float %0 to double
  %2 = call double @llvm.fmuladd.f64(double %conv70, double 0.000000e+00, double 0.000000e+00)
  %conv74 = fptrunc double %2 to float
  store float %conv74, ptr %arrayidx72, align 4
  %conv78 = fpext float %0 to double
  %3 = call double @llvm.fmuladd.f64(double %conv78, double 0.000000e+00, double 0.000000e+00)
  %conv82 = fptrunc double %3 to float
  store float %conv82, ptr %arrayidx80, align 4
  ret void
}

; All three stores reuse the same scalar value; stays scalar in both modes.
define void @fpext_scatter(ptr %dst, double %conv) {
; CHECK-LABEL: @fpext_scatter(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[CONV25:%.*]] = fptrunc double [[CONV:%.*]] to float
; CHECK-NEXT:    [[LENGTHS:%.*]] = getelementptr float, ptr [[DST:%.*]], i64 0
; CHECK-NEXT:    store float [[CONV25]], ptr [[LENGTHS]], align 4
; CHECK-NEXT:    [[ARRAYIDX32:%.*]] = getelementptr float, ptr [[DST]], i64 1
; CHECK-NEXT:    store float [[CONV25]], ptr [[ARRAYIDX32]], align 4
; CHECK-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr float, ptr [[DST]], i64 2
; CHECK-NEXT:    store float [[CONV25]], ptr [[ARRAYIDX37]], align 4
; CHECK-NEXT:    ret void
;
entry:
  %conv25 = fptrunc double %conv to float
  %Lengths = getelementptr float, ptr %dst, i64 0
  store float %conv25, ptr %Lengths, align 4
  %arrayidx32 = getelementptr float, ptr %dst, i64 1
  store float %conv25, ptr %arrayidx32, align 4
  %arrayidx37 = getelementptr float, ptr %dst, i64 2
  store float %conv25, ptr %arrayidx37, align 4
  ret void
}

; Integer add reduction over 3 loaded elements stays scalar in both modes.
define i32 @reduce_add(ptr %src) {
; CHECK-LABEL: @reduce_add(
; CHECK-NEXT:    [[GEP_SRC_0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 0
; CHECK-NEXT:    [[L_SRC_0:%.*]] = load i32, ptr [[GEP_SRC_0]], align 4
; CHECK-NEXT:    [[GEP_SRC_1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 1
; CHECK-NEXT:    [[L_SRC_1:%.*]] = load i32, ptr [[GEP_SRC_1]], align 4
; CHECK-NEXT:    [[GEP_SRC_2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 2
; CHECK-NEXT:    [[L_SRC_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 4
; CHECK-NEXT:    [[ADD_0:%.*]] = add i32 [[L_SRC_0]], [[L_SRC_1]]
; CHECK-NEXT:    [[ADD_1:%.*]] = add i32 [[ADD_0]], [[L_SRC_2]]
; CHECK-NEXT:    ret i32 [[ADD_1]]
;
  %gep.src.0 = getelementptr inbounds i32, ptr %src, i32 0
  %l.src.0 = load i32, ptr %gep.src.0, align 4
  %gep.src.1 = getelementptr inbounds i32, ptr %src, i32 1
  %l.src.1 = load i32, ptr %gep.src.1, align 4
  %gep.src.2 = getelementptr inbounds i32, ptr %src, i32 2
  %l.src.2 = load i32, ptr %gep.src.2, align 4

  %add.0 = add i32 %l.src.0, %l.src.1
  %add.1 = add i32 %add.0, %l.src.2
  ret i32 %add.1
}

; Fast-math fadd reduction vectorizes to a <3 x float> reduce.fadd only with
; non-power-of-2 vectorization.
define float @reduce_fadd(ptr %src) {
; NON-POW2-LABEL: @reduce_fadd(
; NON-POW2-NEXT:    [[GEP_SRC_0:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i32 0
; NON-POW2-NEXT:    [[TMP1:%.*]] = load <3 x float>, ptr [[GEP_SRC_0]], align 4
; NON-POW2-NEXT:    [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP1]])
; NON-POW2-NEXT:    ret float [[TMP2]]
;
; POW2-ONLY-LABEL: @reduce_fadd(
; POW2-ONLY-NEXT:    [[GEP_SRC_0:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i32 0
; POW2-ONLY-NEXT:    [[L_SRC_0:%.*]] = load float, ptr [[GEP_SRC_0]], align 4
; POW2-ONLY-NEXT:    [[GEP_SRC_1:%.*]] = getelementptr inbounds float, ptr [[SRC]], i32 1
; POW2-ONLY-NEXT:    [[L_SRC_1:%.*]] = load float, ptr [[GEP_SRC_1]], align 4
; POW2-ONLY-NEXT:    [[GEP_SRC_2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i32 2
; POW2-ONLY-NEXT:    [[L_SRC_2:%.*]] = load float, ptr [[GEP_SRC_2]], align 4
; POW2-ONLY-NEXT:    [[ADD_0:%.*]] = fadd fast float [[L_SRC_0]], [[L_SRC_1]]
; POW2-ONLY-NEXT:    [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[L_SRC_2]]
; POW2-ONLY-NEXT:    ret float [[ADD_1]]
;
  %gep.src.0 = getelementptr inbounds float, ptr %src, i32 0
  %l.src.0 = load float, ptr %gep.src.0, align 4
  %gep.src.1 = getelementptr inbounds float, ptr %src, i32 1
  %l.src.1 = load float, ptr %gep.src.1, align 4
  %gep.src.2 = getelementptr inbounds float, ptr %src, i32 2
  %l.src.2 = load float, ptr %gep.src.2, align 4

  %add.0 = fadd fast float %l.src.0, %l.src.1
  %add.1 = fadd fast float %add.0, %l.src.2
  ret float %add.1
}

; Add reduction over mul-by-constant results; vector reduce only with NON-POW2.
define i32 @reduce_add_after_mul(ptr %src) {
; NON-POW2-LABEL: @reduce_add_after_mul(
; NON-POW2-NEXT:    [[GEP_SRC_0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 0
; NON-POW2-NEXT:    [[TMP1:%.*]] = load <3 x i32>, ptr [[GEP_SRC_0]], align 4
; NON-POW2-NEXT:    [[TMP2:%.*]] = mul nsw <3 x i32> [[TMP1]], splat (i32 10)
; NON-POW2-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v3i32(<3 x i32> [[TMP2]])
; NON-POW2-NEXT:    ret i32 [[TMP3]]
;
; POW2-ONLY-LABEL: @reduce_add_after_mul(
; POW2-ONLY-NEXT:    [[GEP_SRC_0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 0
; POW2-ONLY-NEXT:    [[L_SRC_0:%.*]] = load i32, ptr [[GEP_SRC_0]], align 4
; POW2-ONLY-NEXT:    [[GEP_SRC_1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 1
; POW2-ONLY-NEXT:    [[L_SRC_1:%.*]] = load i32, ptr [[GEP_SRC_1]], align 4
; POW2-ONLY-NEXT:    [[GEP_SRC_2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 2
; POW2-ONLY-NEXT:    [[L_SRC_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 4
; POW2-ONLY-NEXT:    [[MUL_0:%.*]] = mul nsw i32 [[L_SRC_0]], 10
; POW2-ONLY-NEXT:    [[MUL_1:%.*]] = mul nsw i32 [[L_SRC_1]], 10
; POW2-ONLY-NEXT:    [[MUL_2:%.*]] = mul nsw i32 [[L_SRC_2]], 10
; POW2-ONLY-NEXT:    [[ADD_0:%.*]] = add i32 [[MUL_0]], [[MUL_1]]
; POW2-ONLY-NEXT:    [[ADD_1:%.*]] = add i32 [[ADD_0]], [[MUL_2]]
; POW2-ONLY-NEXT:    ret i32 [[ADD_1]]
;
  %gep.src.0 = getelementptr inbounds i32, ptr %src, i32 0
  %l.src.0 = load i32, ptr %gep.src.0, align 4
  %gep.src.1 = getelementptr inbounds i32, ptr %src, i32 1
  %l.src.1 = load i32, ptr %gep.src.1, align 4
  %gep.src.2 = getelementptr inbounds i32, ptr %src, i32 2
  %l.src.2 = load i32, ptr %gep.src.2, align 4

  %mul.0 = mul nsw i32 %l.src.0, 10
  %mul.1 = mul nsw i32 %l.src.1, 10
  %mul.2 = mul nsw i32 %l.src.2, 10

  %add.0 = add i32 %mul.0, %mul.1
  %add.1 = add i32 %add.0, %mul.2
  ret i32 %add.1
}

; Integer dot product of two 3-element vectors.
define i32 @dot_product_i32(ptr %a, ptr %b) {
; NON-POW2-LABEL: @dot_product_i32(
; NON-POW2-NEXT:    [[GEP_A_0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 0
; NON-POW2-NEXT:    [[GEP_B_0:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 0
; NON-POW2-NEXT:    [[TMP1:%.*]] = load <3 x i32>, ptr [[GEP_A_0]], align 4
; NON-POW2-NEXT:    [[TMP2:%.*]] = load <3 x i32>, ptr [[GEP_B_0]], align 4
; NON-POW2-NEXT:    [[TMP3:%.*]] = mul nsw <3 x i32> [[TMP1]], [[TMP2]]
; NON-POW2-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v3i32(<3 x i32> [[TMP3]])
; NON-POW2-NEXT:    ret i32 [[TMP4]]
;
; POW2-ONLY-LABEL: @dot_product_i32(
; POW2-ONLY-NEXT:    [[GEP_A_0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 0
; POW2-ONLY-NEXT:    [[L_A_0:%.*]] = load i32, ptr [[GEP_A_0]], align 4
; POW2-ONLY-NEXT:    [[GEP_A_1:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 1
; POW2-ONLY-NEXT:    [[L_A_1:%.*]] = load i32, ptr [[GEP_A_1]], align 4
; POW2-ONLY-NEXT:    [[GEP_A_2:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 2
; POW2-ONLY-NEXT:    [[L_A_2:%.*]] = load i32, ptr [[GEP_A_2]], align 4
; POW2-ONLY-NEXT:    [[GEP_B_0:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 0
; POW2-ONLY-NEXT:    [[L_B_0:%.*]] = load i32, ptr [[GEP_B_0]], align 4
; POW2-ONLY-NEXT:    [[GEP_B_1:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 1
; POW2-ONLY-NEXT:    [[L_B_1:%.*]] = load i32, ptr [[GEP_B_1]], align 4
; POW2-ONLY-NEXT:    [[GEP_B_2:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 2
; POW2-ONLY-NEXT:    [[L_B_2:%.*]] = load i32, ptr [[GEP_B_2]], align 4
; POW2-ONLY-NEXT:    [[MUL_0:%.*]] = mul nsw i32 [[L_A_0]], [[L_B_0]]
; POW2-ONLY-NEXT:    [[MUL_1:%.*]] = mul nsw i32 [[L_A_1]], [[L_B_1]]
; POW2-ONLY-NEXT:    [[MUL_2:%.*]] = mul nsw i32 [[L_A_2]], [[L_B_2]]
; POW2-ONLY-NEXT:    [[ADD_0:%.*]] = add i32 [[MUL_0]], [[MUL_1]]
; POW2-ONLY-NEXT:    [[ADD_1:%.*]] = add i32 [[ADD_0]], [[MUL_2]]
; POW2-ONLY-NEXT:    ret i32 [[ADD_1]]
;
  %gep.a.0 = getelementptr inbounds i32, ptr %a, i32 0
  %l.a.0 = load i32, ptr %gep.a.0, align 4
  %gep.a.1 = getelementptr inbounds i32, ptr %a, i32 1
  %l.a.1 = load i32, ptr %gep.a.1, align 4
  %gep.a.2 = getelementptr inbounds i32, ptr %a, i32 2
  %l.a.2 = load i32, ptr %gep.a.2, align 4

  %gep.b.0 = getelementptr inbounds i32, ptr %b, i32 0
  %l.b.0 = load i32, ptr %gep.b.0, align 4
  %gep.b.1 = getelementptr inbounds i32, ptr %b, i32 1
  %l.b.1 = load i32, ptr %gep.b.1, align 4
  %gep.b.2 = getelementptr inbounds i32, ptr %b, i32 2
  %l.b.2 = load i32, ptr %gep.b.2, align 4

  %mul.0 = mul nsw i32 %l.a.0, %l.b.0
  %mul.1 = mul nsw i32 %l.a.1, %l.b.1
  %mul.2 = mul nsw i32 %l.a.2, %l.b.2

  %add.0 = add i32 %mul.0, %mul.1
  %add.1 = add i32 %add.0, %mul.2
  ret i32 %add.1
}

; Same as above, except the reduction order has been perturbed. This
; is checking for our ability to reorder.
define i32 @dot_product_i32_reorder(ptr %a, ptr %b) {
; NON-POW2-LABEL: @dot_product_i32_reorder(
; NON-POW2-NEXT:    [[GEP_A_0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 0
; NON-POW2-NEXT:    [[GEP_B_0:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 0
; NON-POW2-NEXT:    [[TMP1:%.*]] = load <3 x i32>, ptr [[GEP_A_0]], align 4
; NON-POW2-NEXT:    [[TMP2:%.*]] = load <3 x i32>, ptr [[GEP_B_0]], align 4
; NON-POW2-NEXT:    [[TMP3:%.*]] = mul nsw <3 x i32> [[TMP1]], [[TMP2]]
; NON-POW2-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v3i32(<3 x i32> [[TMP3]])
; NON-POW2-NEXT:    ret i32 [[TMP4]]
;
; POW2-ONLY-LABEL: @dot_product_i32_reorder(
; POW2-ONLY-NEXT:    [[GEP_A_0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 0
; POW2-ONLY-NEXT:    [[L_A_0:%.*]] = load i32, ptr [[GEP_A_0]], align 4
; POW2-ONLY-NEXT:    [[GEP_A_1:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 1
; POW2-ONLY-NEXT:    [[L_A_1:%.*]] = load i32, ptr [[GEP_A_1]], align 4
; POW2-ONLY-NEXT:    [[GEP_A_2:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 2
; POW2-ONLY-NEXT:    [[L_A_2:%.*]] = load i32, ptr [[GEP_A_2]], align 4
; POW2-ONLY-NEXT:    [[GEP_B_0:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 0
; POW2-ONLY-NEXT:    [[L_B_0:%.*]] = load i32, ptr [[GEP_B_0]], align 4
; POW2-ONLY-NEXT:    [[GEP_B_1:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 1
; POW2-ONLY-NEXT:    [[L_B_1:%.*]] = load i32, ptr [[GEP_B_1]], align 4
; POW2-ONLY-NEXT:    [[GEP_B_2:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 2
; POW2-ONLY-NEXT:    [[L_B_2:%.*]] = load i32, ptr [[GEP_B_2]], align 4
; POW2-ONLY-NEXT:    [[MUL_0:%.*]] = mul nsw i32 [[L_A_0]], [[L_B_0]]
; POW2-ONLY-NEXT:    [[MUL_1:%.*]] = mul nsw i32 [[L_A_1]], [[L_B_1]]
; POW2-ONLY-NEXT:    [[MUL_2:%.*]] = mul nsw i32 [[L_A_2]], [[L_B_2]]
; POW2-ONLY-NEXT:    [[ADD_0:%.*]] = add i32 [[MUL_1]], [[MUL_0]]
; POW2-ONLY-NEXT:    [[ADD_1:%.*]] = add i32 [[ADD_0]], [[MUL_2]]
; POW2-ONLY-NEXT:    ret i32 [[ADD_1]]
;
  %gep.a.0 = getelementptr inbounds i32, ptr %a, i32 0
  %l.a.0 = load i32, ptr %gep.a.0, align 4
  %gep.a.1 = getelementptr inbounds i32, ptr %a, i32 1
  %l.a.1 = load i32, ptr %gep.a.1, align 4
  %gep.a.2 = getelementptr inbounds i32, ptr %a, i32 2
  %l.a.2 = load i32, ptr %gep.a.2, align 4

  %gep.b.0 = getelementptr inbounds i32, ptr %b, i32 0
  %l.b.0 = load i32, ptr %gep.b.0, align 4
  %gep.b.1 = getelementptr inbounds i32, ptr %b, i32 1
  %l.b.1 = load i32, ptr %gep.b.1, align 4
  %gep.b.2 = getelementptr inbounds i32, ptr %b, i32 2
  %l.b.2 = load i32, ptr %gep.b.2, align 4

  %mul.0 = mul nsw i32 %l.a.0, %l.b.0
  %mul.1 = mul nsw i32 %l.a.1, %l.b.1
  %mul.2 = mul nsw i32 %l.a.2, %l.b.2

  %add.0 = add i32 %mul.1, %mul.0
  %add.1 = add i32 %add.0, %mul.2
  ret i32 %add.1
}

; Floating-point dot product; POW2-ONLY vectorizes a <2 x float> part and
; keeps the third lane scalar.
define float @dot_product_fp32(ptr %a, ptr %b) {
; NON-POW2-LABEL: @dot_product_fp32(
; NON-POW2-NEXT:    [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0
; NON-POW2-NEXT:    [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0
; NON-POW2-NEXT:    [[TMP1:%.*]] = load <3 x float>, ptr [[GEP_A_0]], align 4
; NON-POW2-NEXT:    [[TMP2:%.*]] = load <3 x float>, ptr [[GEP_B_0]], align 4
; NON-POW2-NEXT:    [[TMP3:%.*]] = fmul fast <3 x float> [[TMP1]], [[TMP2]]
; NON-POW2-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP3]])
; NON-POW2-NEXT:    ret float [[TMP4]]
;
; POW2-ONLY-LABEL: @dot_product_fp32(
; POW2-ONLY-NEXT:    [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0
; POW2-ONLY-NEXT:    [[GEP_A_2:%.*]] = getelementptr inbounds float, ptr [[A]], i32 2
; POW2-ONLY-NEXT:    [[L_A_2:%.*]] = load float, ptr [[GEP_A_2]], align 4
; POW2-ONLY-NEXT:    [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0
; POW2-ONLY-NEXT:    [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i32 2
; POW2-ONLY-NEXT:    [[L_B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4
; POW2-ONLY-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_A_0]], align 4
; POW2-ONLY-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[GEP_B_0]], align 4
; POW2-ONLY-NEXT:    [[TMP3:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP2]]
; POW2-ONLY-NEXT:    [[MUL_2:%.*]] = fmul fast float [[L_A_2]], [[L_B_2]]
; POW2-ONLY-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
; POW2-ONLY-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
; POW2-ONLY-NEXT:    [[ADD_0:%.*]] = fadd fast float [[TMP4]], [[TMP5]]
; POW2-ONLY-NEXT:    [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]]
; POW2-ONLY-NEXT:    ret float [[ADD_1]]
;
  %gep.a.0 = getelementptr inbounds float, ptr %a, i32 0
  %l.a.0 = load float, ptr %gep.a.0, align 4
  %gep.a.1 = getelementptr inbounds float, ptr %a, i32 1
  %l.a.1 = load float, ptr %gep.a.1, align 4
  %gep.a.2 = getelementptr inbounds float, ptr %a, i32 2
  %l.a.2 = load float, ptr %gep.a.2, align 4

  %gep.b.0 = getelementptr inbounds float, ptr %b, i32 0
  %l.b.0 = load float, ptr %gep.b.0, align 4
  %gep.b.1 = getelementptr inbounds float, ptr %b, i32 1
  %l.b.1 = load float, ptr %gep.b.1, align 4
  %gep.b.2 = getelementptr inbounds float, ptr %b, i32 2
  %l.b.2 = load float, ptr %gep.b.2, align 4

  %mul.0 = fmul fast float %l.a.0, %l.b.0
  %mul.1 = fmul fast float %l.a.1, %l.b.1
  %mul.2 = fmul fast float %l.a.2, %l.b.2

  %add.0 = fadd fast float %mul.0, %mul.1
  %add.1 = fadd fast float %add.0, %mul.2
  ret float %add.1
}

; Same as above, except the reduction order has been perturbed. This
; is checking for our ability to reorder.
define float @dot_product_fp32_reorder(ptr %a, ptr %b) {
; NON-POW2-LABEL: @dot_product_fp32_reorder(
; NON-POW2-NEXT:    [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0
; NON-POW2-NEXT:    [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0
; NON-POW2-NEXT:    [[TMP1:%.*]] = load <3 x float>, ptr [[GEP_A_0]], align 4
; NON-POW2-NEXT:    [[TMP2:%.*]] = load <3 x float>, ptr [[GEP_B_0]], align 4
; NON-POW2-NEXT:    [[TMP3:%.*]] = fmul fast <3 x float> [[TMP1]], [[TMP2]]
; NON-POW2-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP3]])
; NON-POW2-NEXT:    ret float [[TMP4]]
;
; POW2-ONLY-LABEL: @dot_product_fp32_reorder(
; POW2-ONLY-NEXT:    [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0
; POW2-ONLY-NEXT:    [[GEP_A_2:%.*]] = getelementptr inbounds float, ptr [[A]], i32 2
; POW2-ONLY-NEXT:    [[L_A_2:%.*]] = load float, ptr [[GEP_A_2]], align 4
; POW2-ONLY-NEXT:    [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0
; POW2-ONLY-NEXT:    [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i32 2
; POW2-ONLY-NEXT:    [[L_B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4
; POW2-ONLY-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_A_0]], align 4
; POW2-ONLY-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[GEP_B_0]], align 4
; POW2-ONLY-NEXT:    [[TMP3:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP2]]
; POW2-ONLY-NEXT:    [[MUL_2:%.*]] = fmul fast float [[L_A_2]], [[L_B_2]]
; POW2-ONLY-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
; POW2-ONLY-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
; POW2-ONLY-NEXT:    [[ADD_0:%.*]] = fadd fast float [[TMP5]], [[TMP4]]
; POW2-ONLY-NEXT:    [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]]
; POW2-ONLY-NEXT:    ret float [[ADD_1]]
;
  %gep.a.0 = getelementptr inbounds float, ptr %a, i32 0
  %l.a.0 = load float, ptr %gep.a.0, align 4
  %gep.a.1 = getelementptr inbounds float, ptr %a, i32 1
  %l.a.1 = load float, ptr %gep.a.1, align 4
  %gep.a.2 = getelementptr inbounds float, ptr %a, i32 2
  %l.a.2 = load float, ptr %gep.a.2, align 4

  %gep.b.0 = getelementptr inbounds float, ptr %b, i32 0
  %l.b.0 = load float, ptr %gep.b.0, align 4
  %gep.b.1 = getelementptr inbounds float, ptr %b, i32 1
  %l.b.1 = load float, ptr %gep.b.1, align 4
  %gep.b.2 = getelementptr inbounds float, ptr %b, i32 2
  %l.b.2 = load float, ptr %gep.b.2, align 4

  %mul.0 = fmul fast float %l.a.0, %l.b.0
  %mul.1 = fmul fast float %l.a.1, %l.b.1
  %mul.2 = fmul fast float %l.a.2, %l.b.2

  ; Reduction chain starts with %mul.1 + %mul.0 (swapped vs. the in-order
  ; variant above) to exercise operand reordering.
  %add.0 = fadd fast float %mul.1, %mul.0
  %add.1 = fadd fast float %add.0, %mul.2
  ret float %add.1
}


; Same dot-product pattern as @dot_product_fp32, but with double elements
; (<3 x double> reduction for NON-POW2, <2 x double> + scalar tail for
; POW2-ONLY).
define double @dot_product_fp64(ptr %a, ptr %b) {
; NON-POW2-LABEL: @dot_product_fp64(
; NON-POW2-NEXT:    [[GEP_A_0:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i32 0
; NON-POW2-NEXT:    [[GEP_B_0:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i32 0
; NON-POW2-NEXT:    [[TMP1:%.*]] = load <3 x double>, ptr [[GEP_A_0]], align 4
; NON-POW2-NEXT:    [[TMP2:%.*]] = load <3 x double>, ptr [[GEP_B_0]], align 4
; NON-POW2-NEXT:    [[TMP3:%.*]] = fmul fast <3 x double> [[TMP1]], [[TMP2]]
; NON-POW2-NEXT:    [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v3f64(double 0.000000e+00, <3 x double> [[TMP3]])
; NON-POW2-NEXT:    ret double [[TMP4]]
;
; POW2-ONLY-LABEL: @dot_product_fp64(
; POW2-ONLY-NEXT:    [[GEP_A_0:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i32 0
; POW2-ONLY-NEXT:    [[GEP_A_2:%.*]] = getelementptr inbounds double, ptr [[A]], i32 2
; POW2-ONLY-NEXT:    [[L_A_2:%.*]] = load double, ptr [[GEP_A_2]], align 4
; POW2-ONLY-NEXT:    [[GEP_B_0:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i32 0
; POW2-ONLY-NEXT:    [[GEP_B_2:%.*]] = getelementptr inbounds double, ptr [[B]], i32 2
; POW2-ONLY-NEXT:    [[L_B_2:%.*]] = load double, ptr [[GEP_B_2]], align 4
; POW2-ONLY-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[GEP_A_0]], align 4
; POW2-ONLY-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[GEP_B_0]], align 4
; POW2-ONLY-NEXT:    [[TMP3:%.*]] = fmul fast <2 x double> [[TMP1]], [[TMP2]]
; POW2-ONLY-NEXT:    [[MUL_2:%.*]] = fmul fast double [[L_A_2]], [[L_B_2]]
; POW2-ONLY-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
; POW2-ONLY-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
; POW2-ONLY-NEXT:    [[ADD_0:%.*]] = fadd fast double [[TMP4]], [[TMP5]]
; POW2-ONLY-NEXT:    [[ADD_1:%.*]] = fadd fast double [[ADD_0]], [[MUL_2]]
; POW2-ONLY-NEXT:    ret double [[ADD_1]]
;
  %gep.a.0 = getelementptr inbounds double, ptr %a, i32 0
  %l.a.0 = load double, ptr %gep.a.0, align 4
  %gep.a.1 = getelementptr inbounds double, ptr %a, i32 1
  %l.a.1 = load double, ptr %gep.a.1, align 4
  %gep.a.2 = getelementptr inbounds double, ptr %a, i32 2
  %l.a.2 = load double, ptr %gep.a.2, align 4

  %gep.b.0 = getelementptr inbounds double, ptr %b, i32 0
  %l.b.0 = load double, ptr %gep.b.0, align 4
  %gep.b.1 = getelementptr inbounds double, ptr %b, i32 1
  %l.b.1 = load double, ptr %gep.b.1, align 4
  %gep.b.2 = getelementptr inbounds double, ptr %b, i32 2
  %l.b.2 = load double, ptr %gep.b.2, align 4

  %mul.0 = fmul fast double %l.a.0, %l.b.0
  %mul.1 = fmul fast double %l.a.1, %l.b.1
  %mul.2 = fmul fast double %l.a.2, %l.b.2

  %add.0 = fadd fast double %mul.0, %mul.1
  %add.1 = fadd fast double %add.0, %mul.2
  ret double %add.1
}

;; Covers a case where SLP would previously crash due to a
;; missing bailout in TryToFindDuplicates for the case
;; where a VL=3 list was vectorized directly (without
;; a root instruction such as a store or reduce).
define double @no_root_reshuffle(ptr %ptr) {
; CHECK-LABEL: @no_root_reshuffle(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load double, ptr [[PTR:%.*]], align 8
; CHECK-NEXT:    [[MUL:%.*]] = fmul fast double [[TMP0]], [[TMP0]]
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 8
; CHECK-NEXT:    [[TMP1:%.*]] = load double, ptr [[ARRAYIDX2]], align 8
; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 16
; CHECK-NEXT:    [[TMP2:%.*]] = load double, ptr [[ARRAYIDX3]], align 8
; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast double [[TMP2]], [[TMP2]]
; CHECK-NEXT:    [[MUL6:%.*]] = fmul fast double [[TMP3]], [[TMP1]]
; CHECK-NEXT:    [[ADD:%.*]] = fadd fast double [[MUL6]], [[MUL]]
; CHECK-NEXT:    ret double [[ADD]]
;
entry:
  %0 = load double, ptr %ptr, align 8
  %mul = fmul fast double %0, %0
  %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 8
  %1 = load double, ptr %arrayidx2, align 8
  %arrayidx3 = getelementptr inbounds i8, ptr %ptr, i64 16
  %2 = load double, ptr %arrayidx3, align 8
  %3 = fmul fast double %2, %2
  %mul6 = fmul fast double %3, %1
  %add = fadd fast double %mul6, %mul
  ret double %add
}

; fadd reduction whose leaves are fmuls of the scalar arguments by 10.0
; rather than loads. NON-POW2 builds a <3 x float> from the three arguments
; with insertelement and forms a v3f32 fadd reduction; POW2-ONLY keeps
; everything scalar.
define float @reduce_fadd_after_fmul_of_buildvec(float %a, float %b, float %c) {
; NON-POW2-LABEL: @reduce_fadd_after_fmul_of_buildvec(
; NON-POW2-NEXT:    [[TMP1:%.*]] = insertelement <3 x float> poison, float [[A:%.*]], i32 0
; NON-POW2-NEXT:    [[TMP2:%.*]] = insertelement <3 x float> [[TMP1]], float [[B:%.*]], i32 1
; NON-POW2-NEXT:    [[TMP3:%.*]] = insertelement <3 x float> [[TMP2]], float [[C:%.*]], i32 2
; NON-POW2-NEXT:    [[TMP4:%.*]] = fmul fast <3 x float> [[TMP3]], splat (float 1.000000e+01)
; NON-POW2-NEXT:    [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP4]])
; NON-POW2-NEXT:    ret float [[TMP5]]
;
; POW2-ONLY-LABEL: @reduce_fadd_after_fmul_of_buildvec(
; POW2-ONLY-NEXT:    [[MUL_0:%.*]] = fmul fast float [[A:%.*]], 1.000000e+01
; POW2-ONLY-NEXT:    [[MUL_1:%.*]] = fmul fast float [[B:%.*]], 1.000000e+01
; POW2-ONLY-NEXT:    [[MUL_2:%.*]] = fmul fast float [[C:%.*]], 1.000000e+01
; POW2-ONLY-NEXT:    [[ADD_0:%.*]] = fadd fast float [[MUL_0]], [[MUL_1]]
; POW2-ONLY-NEXT:    [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]]
; POW2-ONLY-NEXT:    ret float [[ADD_1]]
;
  %mul.0 = fmul fast float %a, 10.0
  %mul.1 = fmul fast float %b, 10.0
  %mul.2 = fmul fast float %c, 10.0

  %add.0 = fadd fast float %mul.0, %mul.1
  %add.1 = fadd fast float %add.0, %mul.2
  ret float %add.1
}


declare float @llvm.fmuladd.f32(float, float, float)

declare double @llvm.fmuladd.f64(double, double, double)