; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=instcombine -S | FileCheck %s

declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
declare float @llvm.vector.reduce.fadd.f32.v8f32(float, <8 x float>)
declare float @llvm.vector.reduce.fadd.v4f32(float, <4 x float>)
declare float @llvm.vector.reduce.fmul.nxv4f32(float, <vscale x 4 x float>)
declare float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float>)
declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>)
declare void @use_f32(float)

declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32>)
declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
declare void @use_i32(i32)

declare i32 @llvm.vector.reduce.mul.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.mul.nxv4i32(<vscale x 4 x i32>)

declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.smax.nxv4i32(<vscale x 4 x i32>)
declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32>)

declare <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32>)
declare <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float>)
declare <4 x i32> @llvm.vector.reverse.v4i32(<4 x i32>)
declare <4 x float> @llvm.vector.reverse.v4f32(<4 x float>)

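; The difference of two fadd reductions folds into a single reduction over an
; element-wise difference when the fsub allows reassociation and ignores the
; sign of zeros: sum(a0, v0) - sum(a1, v1) --> sum(a0, v0 - v1) - a1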
define float @diff_of_sums_v4f32(float %a0, <4 x float> %v0, float %a1, <4 x float> %v1) {
; CHECK-LABEL: @diff_of_sums_v4f32(
; CHECK-NEXT:    [[TMP1:%.*]] = fsub reassoc nsz <4 x float> [[V0:%.*]], [[V1:%.*]]
; CHECK-NEXT:    [[TMP2:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v4f32(float [[A0:%.*]], <4 x float> [[TMP1]])
; CHECK-NEXT:    [[R:%.*]] = fsub reassoc nsz float [[TMP2]], [[A1:%.*]]
; CHECK-NEXT:    ret float [[R]]
;
  %r0 = call float @llvm.vector.reduce.fadd.f32.v4f32(float %a0, <4 x float> %v0)
  %r1 = call float @llvm.vector.reduce.fadd.f32.v4f32(float %a1, <4 x float> %v1)
  %r = fsub reassoc nsz float %r0, %r1
  ret float %r
}

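; Reversing a vector does not change the value an order-insensitive reduction
; produces, so reduce(reverse(v)) --> reduce(v). For fadd/fmul this requires
; the reassoc flag; fmin/fmax are insensitive to lane order unconditionally.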
define float @reassoc_sum_of_reverse_v4f32(<4 x float> %v0) {
; CHECK-LABEL: @reassoc_sum_of_reverse_v4f32(
; CHECK-NEXT:    [[RED:%.*]] = call reassoc float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[V0:%.*]])
; CHECK-NEXT:    ret float [[RED]]
;
  %rev = call <4 x float> @llvm.vector.reverse.v4f32(<4 x float> %v0)
  %red = call reassoc float @llvm.vector.reduce.fadd.v4f32(float zeroinitializer, <4 x float> %rev)
  ret float %red
}

define float @reassoc_mul_reduction_of_reverse_nxv4f32(<vscale x 4 x float> %v0) {
; CHECK-LABEL: @reassoc_mul_reduction_of_reverse_nxv4f32(
; CHECK-NEXT:    [[RED:%.*]] = call reassoc float @llvm.vector.reduce.fmul.nxv4f32(float 1.000000e+00, <vscale x 4 x float> [[V0:%.*]])
; CHECK-NEXT:    ret float [[RED]]
;
  %rev = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %v0)
  %red = call reassoc float @llvm.vector.reduce.fmul.nxv4f32(float 1.0, <vscale x 4 x float> %rev)
  ret float %red
}

define float @fmax_of_reverse_v4f32(<4 x float> %v0) {
; CHECK-LABEL: @fmax_of_reverse_v4f32(
; CHECK-NEXT:    [[RED:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[V0:%.*]])
; CHECK-NEXT:    ret float [[RED]]
;
  %rev = call <4 x float> @llvm.vector.reverse.v4f32(<4 x float> %v0)
  %red = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %rev)
  ret float %red
}

define float @fmin_of_reverse_nxv4f32(<vscale x 4 x float> %v0) {
; CHECK-LABEL: @fmin_of_reverse_nxv4f32(
; CHECK-NEXT:    [[RED:%.*]] = call float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float> [[V0:%.*]])
; CHECK-NEXT:    ret float [[RED]]
;
  %rev = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %v0)
  %red = call float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float> %rev)
  ret float %red
}

; negative test - fadd cannot be folded with reverse due to lack of reassoc
define float @sum_of_reverse_v4f32(<4 x float> %v0) {
; CHECK-LABEL: @sum_of_reverse_v4f32(
; CHECK-NEXT:    [[REV:%.*]] = call <4 x float> @llvm.vector.reverse.v4f32(<4 x float> [[V0:%.*]])
; CHECK-NEXT:    [[RED:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[REV]])
; CHECK-NEXT:    ret float [[RED]]
;
  %rev = call <4 x float> @llvm.vector.reverse.v4f32(<4 x float> %v0)
  %red = call float @llvm.vector.reduce.fadd.v4f32(float zeroinitializer, <4 x float> %rev)
  ret float %red
}

; negative test - fmul cannot be folded with reverse due to lack of reassoc
define float @mul_reduction_of_reverse_nxv4f32(<vscale x 4 x float> %v0) {
; CHECK-LABEL: @mul_reduction_of_reverse_nxv4f32(
; CHECK-NEXT:    [[REV:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[V0:%.*]])
; CHECK-NEXT:    [[RED:%.*]] = call float @llvm.vector.reduce.fmul.nxv4f32(float 0.000000e+00, <vscale x 4 x float> [[REV]])
; CHECK-NEXT:    ret float [[RED]]
;
  %rev = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %v0)
  %red = call float @llvm.vector.reduce.fmul.nxv4f32(float zeroinitializer, <vscale x 4 x float> %rev)
  ret float %red
}

; negative test - fsub must allow reassociation

define float @diff_of_sums_v4f32_fmf(float %a0, <4 x float> %v0, float %a1, <4 x float> %v1) {
; CHECK-LABEL: @diff_of_sums_v4f32_fmf(
; CHECK-NEXT:    [[R0:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float [[A0:%.*]], <4 x float> [[V0:%.*]])
; CHECK-NEXT:    [[R1:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float [[A1:%.*]], <4 x float> [[V1:%.*]])
; CHECK-NEXT:    [[R:%.*]] = fsub nnan ninf nsz float [[R0]], [[R1]]
; CHECK-NEXT:    ret float [[R]]
;
  %r0 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float %a0, <4 x float> %v0)
  %r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float %a1, <4 x float> %v1)
  %r = fsub ninf nnan nsz float %r0, %r1
  ret float %r
}

; negative test - extra uses could create extra instructions

define float @diff_of_sums_extra_use1(float %a0, <4 x float> %v0, float %a1, <4 x float> %v1) {
; CHECK-LABEL: @diff_of_sums_extra_use1(
; CHECK-NEXT:    [[R0:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float [[A0:%.*]], <4 x float> [[V0:%.*]])
; CHECK-NEXT:    call void @use_f32(float [[R0]])
; CHECK-NEXT:    [[R1:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float [[A1:%.*]], <4 x float> [[V1:%.*]])
; CHECK-NEXT:    [[R:%.*]] = fsub fast float [[R0]], [[R1]]
; CHECK-NEXT:    ret float [[R]]
;
  %r0 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float %a0, <4 x float> %v0)
  call void @use_f32(float %r0)
  %r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float %a1, <4 x float> %v1)
  %r = fsub fast float %r0, %r1
  ret float %r
}

; negative test - extra uses could create extra instructions

define float @diff_of_sums_extra_use2(float %a0, <4 x float> %v0, float %a1, <4 x float> %v1) {
; CHECK-LABEL: @diff_of_sums_extra_use2(
; CHECK-NEXT:    [[R0:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float [[A0:%.*]], <4 x float> [[V0:%.*]])
; CHECK-NEXT:    [[R1:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float [[A1:%.*]], <4 x float> [[V1:%.*]])
; CHECK-NEXT:    call void @use_f32(float [[R1]])
; CHECK-NEXT:    [[R:%.*]] = fsub fast float [[R0]], [[R1]]
; CHECK-NEXT:    ret float [[R]]
;
  %r0 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float %a0, <4 x float> %v0)
  %r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float %a1, <4 x float> %v1)
  call void @use_f32(float %r1)
  %r = fsub fast float %r0, %r1
  ret float %r
}

; negative test - can't reassociate different vector types

define float @diff_of_sums_type_mismatch(float %a0, <4 x float> %v0, float %a1, <8 x float> %v1) {
; CHECK-LABEL: @diff_of_sums_type_mismatch(
; CHECK-NEXT:    [[R0:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float [[A0:%.*]], <4 x float> [[V0:%.*]])
; CHECK-NEXT:    [[R1:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float [[A1:%.*]], <8 x float> [[V1:%.*]])
; CHECK-NEXT:    [[R:%.*]] = fsub fast float [[R0]], [[R1]]
; CHECK-NEXT:    ret float [[R]]
;
  %r0 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float %a0, <4 x float> %v0)
  %r1 = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float %a1, <8 x float> %v1)
  %r = fsub fast float %r0, %r1
  ret float %r
}

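; Integer add is associative, so the difference-of-sums fold needs no flags:
; sum(v0) - sum(v1) --> sum(v0 - v1)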
define i32 @diff_of_sums_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
; CHECK-LABEL: @diff_of_sums_v4i32(
; CHECK-NEXT:    [[TMP1:%.*]] = sub <4 x i32> [[V0:%.*]], [[V1:%.*]]
; CHECK-NEXT:    [[R:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
; CHECK-NEXT:    ret i32 [[R]]
;
  %r0 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v0)
  %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v1)
  %r = sub i32 %r0, %r1
  ret i32 %r
}

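; reduce(reverse(v)) --> reduce(v) also holds unconditionally for the integer
; reductions: add, mul, smin, smax, umin, and umax.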
define i32 @sum_of_reverse_v4i32(<4 x i32> %v0) {
; CHECK-LABEL: @sum_of_reverse_v4i32(
; CHECK-NEXT:    [[RED:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[V0:%.*]])
; CHECK-NEXT:    ret i32 [[RED]]
;
  %rev = call <4 x i32> @llvm.vector.reverse.v4i32(<4 x i32> %v0)
  %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %rev)
  ret i32 %red
}

define i32 @sum_of_reverse_nxv4i32(<vscale x 4 x i32> %v0) {
; CHECK-LABEL: @sum_of_reverse_nxv4i32(
; CHECK-NEXT:    [[RED:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[V0:%.*]])
; CHECK-NEXT:    ret i32 [[RED]]
;
  %rev = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %v0)
  %red = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %rev)
  ret i32 %red
}

define i32 @mul_reduce_of_reverse_v4i32(<4 x i32> %v0) {
; CHECK-LABEL: @mul_reduce_of_reverse_v4i32(
; CHECK-NEXT:    [[RED:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[V0:%.*]])
; CHECK-NEXT:    ret i32 [[RED]]
;
  %rev = call <4 x i32> @llvm.vector.reverse.v4i32(<4 x i32> %v0)
  %red = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %rev)
  ret i32 %red
}

define i32 @mul_reduce_of_reverse_nxv4i32(<vscale x 4 x i32> %v0) {
; CHECK-LABEL: @mul_reduce_of_reverse_nxv4i32(
; CHECK-NEXT:    [[RED:%.*]] = call i32 @llvm.vector.reduce.mul.nxv4i32(<vscale x 4 x i32> [[V0:%.*]])
; CHECK-NEXT:    ret i32 [[RED]]
;
  %rev = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %v0)
  %red = call i32 @llvm.vector.reduce.mul.nxv4i32(<vscale x 4 x i32> %rev)
  ret i32 %red
}

define i32 @smin_reduce_of_reverse_v4i32(<4 x i32> %v0) {
; CHECK-LABEL: @smin_reduce_of_reverse_v4i32(
; CHECK-NEXT:    [[RED:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[V0:%.*]])
; CHECK-NEXT:    ret i32 [[RED]]
;
  %rev = call <4 x i32> @llvm.vector.reverse.v4i32(<4 x i32> %v0)
  %red = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %rev)
  ret i32 %red
}

define i32 @smax_reduce_of_reverse_nxv4i32(<vscale x 4 x i32> %v0) {
; CHECK-LABEL: @smax_reduce_of_reverse_nxv4i32(
; CHECK-NEXT:    [[RED:%.*]] = call i32 @llvm.vector.reduce.smax.nxv4i32(<vscale x 4 x i32> [[V0:%.*]])
; CHECK-NEXT:    ret i32 [[RED]]
;
  %rev = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %v0)
  %red = call i32 @llvm.vector.reduce.smax.nxv4i32(<vscale x 4 x i32> %rev)
  ret i32 %red
}

define i32 @umin_reduce_of_reverse_v4i32(<4 x i32> %v0) {
; CHECK-LABEL: @umin_reduce_of_reverse_v4i32(
; CHECK-NEXT:    [[RED:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[V0:%.*]])
; CHECK-NEXT:    ret i32 [[RED]]
;
  %rev = call <4 x i32> @llvm.vector.reverse.v4i32(<4 x i32> %v0)
  %red = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %rev)
  ret i32 %red
}

define i32 @umax_reduce_of_reverse_nxv4i32(<vscale x 4 x i32> %v0) {
; CHECK-LABEL: @umax_reduce_of_reverse_nxv4i32(
; CHECK-NEXT:    [[RED:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[V0:%.*]])
; CHECK-NEXT:    ret i32 [[RED]]
;
  %rev = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %v0)
  %red = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> %rev)
  ret i32 %red
}

; negative test - extra uses could create extra instructions

define i32 @diff_of_sums_v4i32_extra_use1(<4 x i32> %v0, <4 x i32> %v1) {
; CHECK-LABEL: @diff_of_sums_v4i32_extra_use1(
; CHECK-NEXT:    [[R0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[V0:%.*]])
; CHECK-NEXT:    call void @use_i32(i32 [[R0]])
; CHECK-NEXT:    [[R1:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[V1:%.*]])
; CHECK-NEXT:    [[R:%.*]] = sub i32 [[R0]], [[R1]]
; CHECK-NEXT:    ret i32 [[R]]
;
  %r0 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v0)
  call void @use_i32(i32 %r0)
  %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v1)
  %r = sub i32 %r0, %r1
  ret i32 %r
}

; negative test - extra uses could create extra instructions

define i32 @diff_of_sums_v4i32_extra_use2(<4 x i32> %v0, <4 x i32> %v1) {
; CHECK-LABEL: @diff_of_sums_v4i32_extra_use2(
; CHECK-NEXT:    [[R0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[V0:%.*]])
; CHECK-NEXT:    [[R1:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[V1:%.*]])
; CHECK-NEXT:    call void @use_i32(i32 [[R1]])
; CHECK-NEXT:    [[R:%.*]] = sub i32 [[R0]], [[R1]]
; CHECK-NEXT:    ret i32 [[R]]
;
  %r0 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v0)
  %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v1)
  call void @use_i32(i32 %r1)
  %r = sub i32 %r0, %r1
  ret i32 %r
}

; negative test - can't reassociate different vector types

define i32 @diff_of_sums_type_mismatch2(<8 x i32> %v0, <4 x i32> %v1) {
; CHECK-LABEL: @diff_of_sums_type_mismatch2(
; CHECK-NEXT:    [[R0:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[V0:%.*]])
; CHECK-NEXT:    [[R1:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[V1:%.*]])
; CHECK-NEXT:    [[R:%.*]] = sub i32 [[R0]], [[R1]]
; CHECK-NEXT:    ret i32 [[R]]
;
  %r0 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %v0)
  %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v1)
  %r = sub i32 %r0, %r1
  ret i32 %r
}