// RUN: %clang_cc1 -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s

// REQUIRES: aarch64-registered-target
// RUN: %clang_cc1 -O1 -triple aarch64 -target-feature +sve %s -emit-llvm -disable-llvm-passes -o - | FileCheck --check-prefixes=SVE %s

typedef float float4 __attribute__((ext_vector_type(4)));
typedef short int si8 __attribute__((ext_vector_type(8)));
typedef unsigned int u4 __attribute__((ext_vector_type(4)));

__attribute__((address_space(1))) float4 vf1_as_one;

void test_builtin_reduce_max(float4 vf1, si8 vi1, u4 vu1) {
  // CHECK-LABEL: define void @test_builtin_reduce_max(
  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[VF1]])
  float r1 = __builtin_reduce_max(vf1);

  // CHECK: [[VI1:%.+]] = load <8 x i16>, ptr %vi1.addr, align 16
  // CHECK-NEXT: call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> [[VI1]])
  short r2 = __builtin_reduce_max(vi1);

  // CHECK: [[VU1:%.+]] = load <4 x i32>, ptr %vu1.addr, align 16
  // CHECK-NEXT: call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[VU1]])
  unsigned r3 = __builtin_reduce_max(vu1);

  // CHECK: [[VF1_AS1:%.+]] = load <4 x float>, ptr addrspace(1) @vf1_as_one, align 16
  // CHECK-NEXT: [[RDX1:%.+]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[VF1_AS1]])
  // CHECK-NEXT: fpext float [[RDX1]] to double
  const double r4 = __builtin_reduce_max(vf1_as_one);

  // CHECK: [[CVI1:%.+]] = load <8 x i16>, ptr %cvi1, align 16
  // CHECK-NEXT: [[RDX2:%.+]] = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> [[CVI1]])
  // CHECK-NEXT: sext i16 [[RDX2]] to i64
  const si8 cvi1 = vi1;
  unsigned long long r5 = __builtin_reduce_max(cvi1);
}

void test_builtin_reduce_min(float4 vf1, si8 vi1, u4 vu1) {
  // CHECK-LABEL: define void @test_builtin_reduce_min(
  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[VF1]])
  float r1 = __builtin_reduce_min(vf1);

  // CHECK: [[VI1:%.+]] = load <8 x i16>, ptr %vi1.addr, align 16
  // CHECK-NEXT: call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> [[VI1]])
  short r2 = __builtin_reduce_min(vi1);

  // CHECK: [[VU1:%.+]] = load <4 x i32>, ptr %vu1.addr, align 16
  // CHECK-NEXT: call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[VU1]])
  unsigned r3 = __builtin_reduce_min(vu1);

  // CHECK: [[VF1_AS1:%.+]] = load <4 x float>, ptr addrspace(1) @vf1_as_one, align 16
  // CHECK-NEXT: [[RDX1:%.+]] = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[VF1_AS1]])
  // CHECK-NEXT: fpext float [[RDX1]] to double
  const double r4 = __builtin_reduce_min(vf1_as_one);

  // CHECK: [[CVI1:%.+]] = load <8 x i16>, ptr %cvi1, align 16
  // CHECK-NEXT: [[RDX2:%.+]] = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> [[CVI1]])
  // CHECK-NEXT: sext i16 [[RDX2]] to i64
  const si8 cvi1 = vi1;
  unsigned long long r5 = __builtin_reduce_min(cvi1);
}

void test_builtin_reduce_add(si8 vi1, u4 vu1) {
  // CHECK: [[VI1:%.+]] = load <8 x i16>, ptr %vi1.addr, align 16
  // CHECK-NEXT: call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[VI1]])
  short r2 = __builtin_reduce_add(vi1);

  // CHECK: [[VU1:%.+]] = load <4 x i32>, ptr %vu1.addr, align 16
  // CHECK-NEXT: call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[VU1]])
  unsigned r3 = __builtin_reduce_add(vu1);

  // CHECK: [[CVI1:%.+]] = load <8 x i16>, ptr %cvi1, align 16
  // CHECK-NEXT: [[RDX1:%.+]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[CVI1]])
  // CHECK-NEXT: sext i16 [[RDX1]] to i32
  const si8 cvi1 = vi1;
  int r4 = __builtin_reduce_add(cvi1);

  // CHECK: [[CVU1:%.+]] = load <4 x i32>, ptr %cvu1, align 16
  // CHECK-NEXT: [[RDX2:%.+]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[CVU1]])
  // CHECK-NEXT: zext i32 [[RDX2]] to i64
  const u4 cvu1 = vu1;
  unsigned long long r5 = __builtin_reduce_add(cvu1);
}

void test_builtin_reduce_mul(si8 vi1, u4 vu1) {
  // CHECK: [[VI1:%.+]] = load <8 x i16>, ptr %vi1.addr, align 16
  // CHECK-NEXT: call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> [[VI1]])
  short r2 = __builtin_reduce_mul(vi1);

  // CHECK: [[VU1:%.+]] = load <4 x i32>, ptr %vu1.addr, align 16
  // CHECK-NEXT: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[VU1]])
  unsigned r3 = __builtin_reduce_mul(vu1);

  // CHECK: [[CVI1:%.+]] = load <8 x i16>, ptr %cvi1, align 16
  // CHECK-NEXT: [[RDX1:%.+]] = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> [[CVI1]])
  // CHECK-NEXT: sext i16 [[RDX1]] to i32
  const si8 cvi1 = vi1;
  int r4 = __builtin_reduce_mul(cvi1);

  // CHECK: [[CVU1:%.+]] = load <4 x i32>, ptr %cvu1, align 16
  // CHECK-NEXT: [[RDX2:%.+]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[CVU1]])
  // CHECK-NEXT: zext i32 [[RDX2]] to i64
  const u4 cvu1 = vu1;
  unsigned long long r5 = __builtin_reduce_mul(cvu1);
}

void test_builtin_reduce_xor(si8 vi1, u4 vu1) {

  // CHECK: [[VI1:%.+]] = load <8 x i16>, ptr %vi1.addr, align 16
  // CHECK-NEXT: call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> [[VI1]])
  short r2 = __builtin_reduce_xor(vi1);

  // CHECK: [[VU1:%.+]] = load <4 x i32>, ptr %vu1.addr, align 16
  // CHECK-NEXT: call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> [[VU1]])
  unsigned r3 = __builtin_reduce_xor(vu1);
}

void test_builtin_reduce_or(si8 vi1, u4 vu1) {

  // CHECK: [[VI1:%.+]] = load <8 x i16>, ptr %vi1.addr, align 16
  // CHECK-NEXT: call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> [[VI1]])
  short r2 = __builtin_reduce_or(vi1);

  // CHECK: [[VU1:%.+]] = load <4 x i32>, ptr %vu1.addr, align 16
  // CHECK-NEXT: call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[VU1]])
  unsigned r3 = __builtin_reduce_or(vu1);
}

void test_builtin_reduce_and(si8 vi1, u4 vu1) {

  // CHECK: [[VI1:%.+]] = load <8 x i16>, ptr %vi1.addr, align 16
  // CHECK-NEXT: call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> [[VI1]])
  short r2 = __builtin_reduce_and(vi1);

  // CHECK: [[VU1:%.+]] = load <4 x i32>, ptr %vu1.addr, align 16
  // CHECK-NEXT: call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[VU1]])
  unsigned r3 = __builtin_reduce_and(vu1);
}

void test_builtin_reduce_maximum(float4 vf1) {
  // CHECK-LABEL: define void @test_builtin_reduce_maximum(
  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> [[VF1]])
  float r1 = __builtin_reduce_maximum(vf1);

  // CHECK: [[VF1_AS1:%.+]] = load <4 x float>, ptr addrspace(1) @vf1_as_one, align 16
  // CHECK-NEXT: [[RDX1:%.+]] = call float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> [[VF1_AS1]])
  // CHECK-NEXT: fpext float [[RDX1]] to double
  const double r4 = __builtin_reduce_maximum(vf1_as_one);
}

void test_builtin_reduce_minimum(float4 vf1) {
  // CHECK-LABEL: define void @test_builtin_reduce_minimum(
  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call float @llvm.vector.reduce.fminimum.v4f32(<4 x float> [[VF1]])
  float r1 = __builtin_reduce_minimum(vf1);

  // CHECK: [[VF1_AS1:%.+]] = load <4 x float>, ptr addrspace(1) @vf1_as_one, align 16
  // CHECK-NEXT: [[RDX1:%.+]] = call float @llvm.vector.reduce.fminimum.v4f32(<4 x float> [[VF1_AS1]])
  // CHECK-NEXT: fpext float [[RDX1]] to double
  const double r4 = __builtin_reduce_minimum(vf1_as_one);
}

#if defined(__ARM_FEATURE_SVE)
#include <arm_sve.h>

void test_builtin_reduce_SVE(int a, unsigned long long b, short c, float d) {
  // SVE-LABEL: void @test_builtin_reduce_SVE(

  svint32_t vec_a = svdup_s32(a);
  svuint64_t vec_b = svdup_u64(b);
  svint16_t vec_c1 = svdup_s16(c);
  svuint16_t vec_c2 = svdup_u16(c);
  svfloat32_t vec_d = svdup_f32(d);

  // SVE: [[VF1:%.+]] = load <vscale x 4 x i32>, ptr %vec_a
  // SVE-NEXT: call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[VF1]])
  int r1 = __builtin_reduce_add(vec_a);

  // SVE: [[VF2:%.+]] = load <vscale x 4 x i32>, ptr %vec_a
  // SVE-NEXT: call i32 @llvm.vector.reduce.mul.nxv4i32(<vscale x 4 x i32> [[VF2]])
  int r2 = __builtin_reduce_mul(vec_a);

  // SVE: [[VF3:%.+]] = load <vscale x 2 x i64>, ptr %vec_b
  // SVE-NEXT: call i64 @llvm.vector.reduce.xor.nxv2i64(<vscale x 2 x i64> [[VF3]])
  long long r3 = __builtin_reduce_xor(vec_b);

  // SVE: [[VF4:%.+]] = load <vscale x 2 x i64>, ptr %vec_b
  // SVE-NEXT: call i64 @llvm.vector.reduce.or.nxv2i64(<vscale x 2 x i64> [[VF4]])
  long long r4 = __builtin_reduce_or(vec_b);

  // SVE: [[VF5:%.+]] = load <vscale x 2 x i64>, ptr %vec_b
  // SVE-NEXT: call i64 @llvm.vector.reduce.and.nxv2i64(<vscale x 2 x i64> [[VF5]])
  long long r5 = __builtin_reduce_and(vec_b);

  // SVE: [[VF6:%.+]] = load <vscale x 8 x i16>, ptr %vec_c1
  // SVE-NEXT: call i16 @llvm.vector.reduce.smax.nxv8i16(<vscale x 8 x i16> [[VF6]])
  short r6 = __builtin_reduce_max(vec_c1);

  // SVE: [[VF7:%.+]] = load <vscale x 8 x i16>, ptr %vec_c2
  // SVE-NEXT: call i16 @llvm.vector.reduce.umin.nxv8i16(<vscale x 8 x i16> [[VF7]])
  unsigned short r7 = __builtin_reduce_min(vec_c2);

  // SVE: [[VF8:%.+]] = load <vscale x 4 x float>, ptr %vec_d
  // SVE-NEXT: call float @llvm.vector.reduce.fmax.nxv4f32(<vscale x 4 x float> [[VF8]])
  float r8 = __builtin_reduce_max(vec_d);

  // SVE: [[VF9:%.+]] = load <vscale x 4 x float>, ptr %vec_d
  // SVE-NEXT: call float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float> [[VF9]])
  float r9 = __builtin_reduce_min(vec_d);
}
#endif