; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=slp-vectorizer -mtriple=riscv64 \
; RUN:   -mattr=+v,+zvfhmin,+zvfbfmin -riscv-v-slp-max-vf=0 -S \
; RUN:   | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
; RUN: opt < %s -passes=slp-vectorizer -mtriple=riscv64 \
; RUN:   -mattr=+v,+zvfh,+zvfbfmin -riscv-v-slp-max-vf=0 -S \
; RUN:   | FileCheck %s --check-prefixes=CHECK,ZVFH,ZVL128
; RUN: opt < %s -passes=slp-vectorizer -mtriple=riscv64 \
; RUN:   -mattr=+v,+zvl256b,+zvfh,+zvfbfmin -riscv-v-slp-max-vf=0 -S \
; RUN:   | FileCheck %s --check-prefixes=CHECK,ZVFH,ZVL256
; RUN: opt < %s -passes=slp-vectorizer -mtriple=riscv64 \
; RUN:   -mattr=+v,+zvl512b,+zvfh,+zvfbfmin -riscv-v-slp-max-vf=0 -S \
; RUN:   | FileCheck %s --check-prefixes=CHECK,ZVFH,ZVL512

target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n64-S128"
target triple = "riscv64"

; First batch of tests are simple reductions of various widths

define i64 @red_ld_2xi64(ptr %ptr) {
; CHECK-LABEL: @red_ld_2xi64(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[LD0:%.*]] = load i64, ptr [[PTR:%.*]], align 8
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i64, ptr [[PTR]], i64 1
; CHECK-NEXT:    [[LD1:%.*]] = load i64, ptr [[GEP]], align 8
; CHECK-NEXT:    [[ADD_1:%.*]] = add nuw nsw i64 [[LD0]], [[LD1]]
; CHECK-NEXT:    ret i64 [[ADD_1]]
;
entry:
  %ld0 = load i64, ptr %ptr
  %gep = getelementptr inbounds i64, ptr %ptr, i64 1
  %ld1 = load i64, ptr %gep
  %add.1 = add nuw nsw i64 %ld0, %ld1
  ret i64 %add.1
}

define i64 @red_ld_4xi64(ptr %ptr) {
; CHECK-LABEL: @red_ld_4xi64(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i64>, ptr [[PTR:%.*]], align 8
; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP0]])
; CHECK-NEXT:    ret i64 [[TMP1]]
;
entry:
  %ld0 = load i64, ptr %ptr
  %gep = getelementptr inbounds i64, ptr %ptr, i64 1
  %ld1 = load i64, ptr %gep
  %add.1 = add nuw nsw i64 %ld0, %ld1
  %gep.1 = getelementptr inbounds i64, ptr %ptr, i64 2
  %ld2 = load i64, ptr %gep.1
  %add.2 = add nuw nsw i64 %add.1, %ld2
  %gep.2 = getelementptr inbounds i64, ptr %ptr, i64 3
  %ld3 = load i64, ptr %gep.2
  %add.3 = add nuw nsw i64 %add.2, %ld3
  ret i64 %add.3
}

define i64 @red_ld_8xi64(ptr %ptr) {
; CHECK-LABEL: @red_ld_8xi64(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i64>, ptr [[PTR:%.*]], align 8
; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP0]])
; CHECK-NEXT:    ret i64 [[TMP1]]
;
entry:
  %ld0 = load i64, ptr %ptr
  %gep = getelementptr inbounds i64, ptr %ptr, i64 1
  %ld1 = load i64, ptr %gep
  %add.1 = add nuw nsw i64 %ld0, %ld1
  %gep.1 = getelementptr inbounds i64, ptr %ptr, i64 2
  %ld2 = load i64, ptr %gep.1
  %add.2 = add nuw nsw i64 %add.1, %ld2
  %gep.2 = getelementptr inbounds i64, ptr %ptr, i64 3
  %ld3 = load i64, ptr %gep.2
  %add.3 = add nuw nsw i64 %add.2, %ld3
  %gep.3 = getelementptr inbounds i64, ptr %ptr, i64 4
  %ld4 = load i64, ptr %gep.3
  %add.4 = add nuw nsw i64 %add.3, %ld4
  %gep.4 = getelementptr inbounds i64, ptr %ptr, i64 5
  %ld5 = load i64, ptr %gep.4
  %add.5 = add nuw nsw i64 %add.4, %ld5
  %gep.5 = getelementptr inbounds i64, ptr %ptr, i64 6
  %ld6 = load i64, ptr %gep.5
  %add.6 = add nuw nsw i64 %add.5, %ld6
  %gep.6 = getelementptr inbounds i64, ptr %ptr, i64 7
  %ld7 = load i64, ptr %gep.6
  %add.7 = add nuw nsw i64 %add.6, %ld7
  ret i64 %add.7
}

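; The 2-element case above stays scalar; from 4 elements up the whole chain
; folds into one vector load plus @llvm.vector.reduce.add. The 16-element
; case below presumably only forms a single <16 x i64> reduction because
; -riscv-v-slp-max-vf=0 lifts the default SLP VF cap (LMUL 8 at VLEN=128).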
define i64 @red_ld_16xi64(ptr %ptr) {
; CHECK-LABEL: @red_ld_16xi64(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i64>, ptr [[PTR:%.*]], align 8
; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> [[TMP0]])
; CHECK-NEXT:    ret i64 [[TMP1]]
;
entry:
  %ld0 = load i64, ptr %ptr
  %gep = getelementptr inbounds i64, ptr %ptr, i64 1
  %ld1 = load i64, ptr %gep
  %add.1 = add nuw nsw i64 %ld0, %ld1
  %gep.1 = getelementptr inbounds i64, ptr %ptr, i64 2
  %ld2 = load i64, ptr %gep.1
  %add.2 = add nuw nsw i64 %add.1, %ld2
  %gep.2 = getelementptr inbounds i64, ptr %ptr, i64 3
  %ld3 = load i64, ptr %gep.2
  %add.3 = add nuw nsw i64 %add.2, %ld3
  %gep.3 = getelementptr inbounds i64, ptr %ptr, i64 4
  %ld4 = load i64, ptr %gep.3
  %add.4 = add nuw nsw i64 %add.3, %ld4
  %gep.4 = getelementptr inbounds i64, ptr %ptr, i64 5
  %ld5 = load i64, ptr %gep.4
  %add.5 = add nuw nsw i64 %add.4, %ld5
  %gep.5 = getelementptr inbounds i64, ptr %ptr, i64 6
  %ld6 = load i64, ptr %gep.5
  %add.6 = add nuw nsw i64 %add.5, %ld6
  %gep.6 = getelementptr inbounds i64, ptr %ptr, i64 7
  %ld7 = load i64, ptr %gep.6
  %add.7 = add nuw nsw i64 %add.6, %ld7
  %gep.7 = getelementptr inbounds i64, ptr %ptr, i64 8
  %ld8 = load i64, ptr %gep.7
  %add.8 = add nuw nsw i64 %add.7, %ld8
  %gep.8 = getelementptr inbounds i64, ptr %ptr, i64 9
  %ld9 = load i64, ptr %gep.8
  %add.9 = add nuw nsw i64 %add.8, %ld9
  %gep.9 = getelementptr inbounds i64, ptr %ptr, i64 10
  %ld10 = load i64, ptr %gep.9
  %add.10 = add nuw nsw i64 %add.9, %ld10
  %gep.10 = getelementptr inbounds i64, ptr %ptr, i64 11
  %ld11 = load i64, ptr %gep.10
  %add.11 = add nuw nsw i64 %add.10, %ld11
  %gep.11 = getelementptr inbounds i64, ptr %ptr, i64 12
  %ld12 = load i64, ptr %gep.11
  %add.12 = add nuw nsw i64 %add.11, %ld12
  %gep.12 = getelementptr inbounds i64, ptr %ptr, i64 13
  %ld13 = load i64, ptr %gep.12
  %add.13 = add nuw nsw i64 %add.12, %ld13
  %gep.13 = getelementptr inbounds i64, ptr %ptr, i64 14
  %ld14 = load i64, ptr %gep.13
  %add.14 = add nuw nsw i64 %add.13, %ld14
  %gep.14 = getelementptr inbounds i64, ptr %ptr, i64 15
  %ld15 = load i64, ptr %gep.14
  %add.15 = add nuw nsw i64 %add.14, %ld15
  ret i64 %add.15
}

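; The loads below are spaced two i64s apart, so the expected lowering is one
; strided load: the `i64 16` operand of the vp.strided.load call is the
; 16-byte (2 x i64) stride and the trailing `i32 16` is the element count.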
define i64 @red_strided_ld_16xi64(ptr %ptr) {
; CHECK-LABEL: @red_strided_ld_16xi64(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = call <16 x i64> @llvm.experimental.vp.strided.load.v16i64.p0.i64(ptr align 8 [[PTR:%.*]], i64 16, <16 x i1> splat (i1 true), i32 16)
; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> [[TMP0]])
; CHECK-NEXT:    ret i64 [[TMP1]]
;
entry:
  %ld0 = load i64, ptr %ptr
  %gep = getelementptr inbounds i64, ptr %ptr, i64 2
  %ld1 = load i64, ptr %gep
  %add.1 = add nuw nsw i64 %ld0, %ld1
  %gep.1 = getelementptr inbounds i64, ptr %ptr, i64 4
  %ld2 = load i64, ptr %gep.1
  %add.2 = add nuw nsw i64 %add.1, %ld2
  %gep.2 = getelementptr inbounds i64, ptr %ptr, i64 6
  %ld3 = load i64, ptr %gep.2
  %add.3 = add nuw nsw i64 %add.2, %ld3
  %gep.3 = getelementptr inbounds i64, ptr %ptr, i64 8
  %ld4 = load i64, ptr %gep.3
  %add.4 = add nuw nsw i64 %add.3, %ld4
  %gep.4 = getelementptr inbounds i64, ptr %ptr, i64 10
  %ld5 = load i64, ptr %gep.4
  %add.5 = add nuw nsw i64 %add.4, %ld5
  %gep.5 = getelementptr inbounds i64, ptr %ptr, i64 12
  %ld6 = load i64, ptr %gep.5
  %add.6 = add nuw nsw i64 %add.5, %ld6
  %gep.6 = getelementptr inbounds i64, ptr %ptr, i64 14
  %ld7 = load i64, ptr %gep.6
  %add.7 = add nuw nsw i64 %add.6, %ld7
  %gep.7 = getelementptr inbounds i64, ptr %ptr, i64 16
  %ld8 = load i64, ptr %gep.7
  %add.8 = add nuw nsw i64 %add.7, %ld8
  %gep.8 = getelementptr inbounds i64, ptr %ptr, i64 18
  %ld9 = load i64, ptr %gep.8
  %add.9 = add nuw nsw i64 %add.8, %ld9
  %gep.9 = getelementptr inbounds i64, ptr %ptr, i64 20
  %ld10 = load i64, ptr %gep.9
  %add.10 = add nuw nsw i64 %add.9, %ld10
  %gep.10 = getelementptr inbounds i64, ptr %ptr, i64 22
  %ld11 = load i64, ptr %gep.10
  %add.11 = add nuw nsw i64 %add.10, %ld11
  %gep.11 = getelementptr inbounds i64, ptr %ptr, i64 24
  %ld12 = load i64, ptr %gep.11
  %add.12 = add nuw nsw i64 %add.11, %ld12
  %gep.12 = getelementptr inbounds i64, ptr %ptr, i64 26
  %ld13 = load i64, ptr %gep.12
  %add.13 = add nuw nsw i64 %add.12, %ld13
  %gep.13 = getelementptr inbounds i64, ptr %ptr, i64 28
  %ld14 = load i64, ptr %gep.13
  %add.14 = add nuw nsw i64 %add.13, %ld14
  %gep.14 = getelementptr inbounds i64, ptr %ptr, i64 30
  %ld15 = load i64, ptr %gep.14
  %add.15 = add nuw nsw i64 %add.14, %ld15
  ret i64 %add.15
}

; Next batch tests different reduction kinds

%struct.buf = type { [8 x i8] }

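; Each lane xors a byte of %a with a byte of %b and the results are
; and-reduced; the scalar seed `and i8 %xor12, 1` is expected to survive as
; the [[OP_RDX]] applied after the vector reduction.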
define i8 @reduce_and(ptr %a, ptr %b) {
; CHECK-LABEL: @reduce_and(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_BUF:%.*]], ptr [[A:%.*]], i64 0, i32 0, i64 0
; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds [[STRUCT_BUF]], ptr [[B:%.*]], i64 0, i32 0, i64 0
; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 1
; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX3]], align 1
; CHECK-NEXT:    [[TMP2:%.*]] = xor <8 x i8> [[TMP1]], [[TMP0]]
; CHECK-NEXT:    [[TMP3:%.*]] = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> [[TMP2]])
; CHECK-NEXT:    [[OP_RDX:%.*]] = and i8 [[TMP3]], 1
; CHECK-NEXT:    ret i8 [[OP_RDX]]
;
entry:
  %arrayidx = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 0
  %0 = load i8, ptr %arrayidx, align 1
  %arrayidx3 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 0
  %1 = load i8, ptr %arrayidx3, align 1
  %xor12 = xor i8 %1, %0
  %and13 = and i8 %xor12, 1
  %arrayidx.1 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 1
  %2 = load i8, ptr %arrayidx.1, align 1
  %arrayidx3.1 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 1
  %3 = load i8, ptr %arrayidx3.1, align 1
  %xor12.1 = xor i8 %3, %2
  %and13.1 = and i8 %xor12.1, %and13
  %arrayidx.2 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 2
  %4 = load i8, ptr %arrayidx.2, align 1
  %arrayidx3.2 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 2
  %5 = load i8, ptr %arrayidx3.2, align 1
  %xor12.2 = xor i8 %5, %4
  %and13.2 = and i8 %xor12.2, %and13.1
  %arrayidx.3 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 3
  %6 = load i8, ptr %arrayidx.3, align 1
  %arrayidx3.3 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 3
  %7 = load i8, ptr %arrayidx3.3, align 1
  %xor12.3 = xor i8 %7, %6
  %and13.3 = and i8 %xor12.3, %and13.2
  %arrayidx.4 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 4
  %8 = load i8, ptr %arrayidx.4, align 1
  %arrayidx3.4 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 4
  %9 = load i8, ptr %arrayidx3.4, align 1
  %xor12.4 = xor i8 %9, %8
  %and13.4 = and i8 %xor12.4, %and13.3
  %arrayidx.5 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 5
  %10 = load i8, ptr %arrayidx.5, align 1
  %arrayidx3.5 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 5
  %11 = load i8, ptr %arrayidx3.5, align 1
  %xor12.5 = xor i8 %11, %10
  %and13.5 = and i8 %xor12.5, %and13.4
  %arrayidx.6 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 6
  %12 = load i8, ptr %arrayidx.6, align 1
  %arrayidx3.6 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 6
  %13 = load i8, ptr %arrayidx3.6, align 1
  %xor12.6 = xor i8 %13, %12
  %and13.6 = and i8 %xor12.6, %and13.5
  %arrayidx.7 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 7
  %14 = load i8, ptr %arrayidx.7, align 1
  %arrayidx3.7 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 7
  %15 = load i8, ptr %arrayidx3.7, align 1
  %xor12.7 = xor i8 %15, %14
  %and13.7 = and i8 %xor12.7, %and13.6
  ret i8 %and13.7
}

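; Same shape as reduce_and but or-reduced, and with no constant operand in
; the chain the reduce call should feed the return directly.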
define i8 @reduce_or_1(ptr %a, ptr %b) {
; CHECK-LABEL: @reduce_or_1(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_BUF:%.*]], ptr [[A:%.*]], i64 0, i32 0, i64 0
; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds [[STRUCT_BUF]], ptr [[B:%.*]], i64 0, i32 0, i64 0
; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 1
; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX3]], align 1
; CHECK-NEXT:    [[TMP2:%.*]] = xor <8 x i8> [[TMP1]], [[TMP0]]
; CHECK-NEXT:    [[TMP3:%.*]] = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> [[TMP2]])
; CHECK-NEXT:    ret i8 [[TMP3]]
;

entry:
  %arrayidx = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 0
  %0 = load i8, ptr %arrayidx, align 1
  %arrayidx3 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 0
  %1 = load i8, ptr %arrayidx3, align 1
  %xor12 = xor i8 %1, %0
  %arrayidx.1 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 1
  %2 = load i8, ptr %arrayidx.1, align 1
  %arrayidx3.1 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 1
  %3 = load i8, ptr %arrayidx3.1, align 1
  %xor12.1 = xor i8 %3, %2
  %or13.1 = or i8 %xor12.1, %xor12
  %arrayidx.2 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 2
  %4 = load i8, ptr %arrayidx.2, align 1
  %arrayidx3.2 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 2
  %5 = load i8, ptr %arrayidx3.2, align 1
  %xor12.2 = xor i8 %5, %4
  %or13.2 = or i8 %xor12.2, %or13.1
  %arrayidx.3 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 3
  %6 = load i8, ptr %arrayidx.3, align 1
  %arrayidx3.3 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 3
  %7 = load i8, ptr %arrayidx3.3, align 1
  %xor12.3 = xor i8 %7, %6
  %or13.3 = or i8 %xor12.3, %or13.2
  %arrayidx.4 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 4
  %8 = load i8, ptr %arrayidx.4, align 1
  %arrayidx3.4 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 4
  %9 = load i8, ptr %arrayidx3.4, align 1
  %xor12.4 = xor i8 %9, %8
  %or13.4 = or i8 %xor12.4, %or13.3
  %arrayidx.5 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 5
  %10 = load i8, ptr %arrayidx.5, align 1
  %arrayidx3.5 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 5
  %11 = load i8, ptr %arrayidx3.5, align 1
  %xor12.5 = xor i8 %11, %10
  %or13.5 = or i8 %xor12.5, %or13.4
  %arrayidx.6 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 6
  %12 = load i8, ptr %arrayidx.6, align 1
  %arrayidx3.6 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 6
  %13 = load i8, ptr %arrayidx3.6, align 1
  %xor12.6 = xor i8 %13, %12
  %or13.6 = or i8 %xor12.6, %or13.5
  %arrayidx.7 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 7
  %14 = load i8, ptr %arrayidx.7, align 1
  %arrayidx3.7 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 7
  %15 = load i8, ptr %arrayidx3.7, align 1
  %xor12.7 = xor i8 %15, %14
  %or13.7 = or i8 %xor12.7, %or13.6
  ret i8 %or13.7
}

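; A 32-way or-reduction of i1 compares. At VF 16 this splits into two v16i1
; reduces combined by a scalar or ([[OP_RDX]]); zvl512b is wide enough for
; SLP to pick VF 32 and fold everything into a single v32i1 reduce.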
define void @reduce_or_2() {
; ZVFHMIN-LABEL: @reduce_or_2(
; ZVFHMIN-NEXT:    [[TMP1:%.*]] = shl i64 0, 0
; ZVFHMIN-NEXT:    [[TMP2:%.*]] = insertelement <16 x i64> <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 poison>, i64 [[TMP1]], i32 15
; ZVFHMIN-NEXT:    [[TMP3:%.*]] = icmp ult <16 x i64> [[TMP2]], zeroinitializer
; ZVFHMIN-NEXT:    [[TMP4:%.*]] = insertelement <16 x i64> <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 poison, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>, i64 [[TMP1]], i32 6
; ZVFHMIN-NEXT:    [[TMP5:%.*]] = icmp ult <16 x i64> [[TMP4]], zeroinitializer
; ZVFHMIN-NEXT:    [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP3]])
; ZVFHMIN-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP5]])
; ZVFHMIN-NEXT:    [[OP_RDX:%.*]] = or i1 [[TMP6]], [[TMP7]]
; ZVFHMIN-NEXT:    br i1 [[OP_RDX]], label [[TMP9:%.*]], label [[TMP8:%.*]]
; ZVFHMIN:       8:
; ZVFHMIN-NEXT:    ret void
; ZVFHMIN:       9:
; ZVFHMIN-NEXT:    ret void
;
; ZVL128-LABEL: @reduce_or_2(
; ZVL128-NEXT:    [[TMP1:%.*]] = shl i64 0, 0
; ZVL128-NEXT:    [[TMP2:%.*]] = insertelement <16 x i64> <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 poison>, i64 [[TMP1]], i32 15
; ZVL128-NEXT:    [[TMP3:%.*]] = icmp ult <16 x i64> [[TMP2]], zeroinitializer
; ZVL128-NEXT:    [[TMP4:%.*]] = insertelement <16 x i64> <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 poison, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>, i64 [[TMP1]], i32 6
; ZVL128-NEXT:    [[TMP5:%.*]] = icmp ult <16 x i64> [[TMP4]], zeroinitializer
; ZVL128-NEXT:    [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP3]])
; ZVL128-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP5]])
; ZVL128-NEXT:    [[OP_RDX:%.*]] = or i1 [[TMP6]], [[TMP7]]
; ZVL128-NEXT:    br i1 [[OP_RDX]], label [[TMP9:%.*]], label [[TMP8:%.*]]
; ZVL128:       8:
; ZVL128-NEXT:    ret void
; ZVL128:       9:
; ZVL128-NEXT:    ret void
;
; ZVL256-LABEL: @reduce_or_2(
; ZVL256-NEXT:    [[TMP1:%.*]] = shl i64 0, 0
; ZVL256-NEXT:    [[TMP2:%.*]] = insertelement <16 x i64> <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 poison>, i64 [[TMP1]], i32 15
; ZVL256-NEXT:    [[TMP3:%.*]] = icmp ult <16 x i64> [[TMP2]], zeroinitializer
; ZVL256-NEXT:    [[TMP4:%.*]] = insertelement <16 x i64> <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 poison, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>, i64 [[TMP1]], i32 6
; ZVL256-NEXT:    [[TMP5:%.*]] = icmp ult <16 x i64> [[TMP4]], zeroinitializer
; ZVL256-NEXT:    [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP3]])
; ZVL256-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP5]])
; ZVL256-NEXT:    [[OP_RDX:%.*]] = or i1 [[TMP6]], [[TMP7]]
; ZVL256-NEXT:    br i1 [[OP_RDX]], label [[TMP9:%.*]], label [[TMP8:%.*]]
; ZVL256:       8:
; ZVL256-NEXT:    ret void
; ZVL256:       9:
; ZVL256-NEXT:    ret void
;
; ZVL512-LABEL: @reduce_or_2(
; ZVL512-NEXT:    [[TMP1:%.*]] = shl i64 0, 0
; ZVL512-NEXT:    [[TMP2:%.*]] = insertelement <32 x i64> <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 poison, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 poison, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>, i64 [[TMP1]], i32 15
; ZVL512-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i64> [[TMP2]], <32 x i64> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 15, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
; ZVL512-NEXT:    [[TMP4:%.*]] = icmp ult <32 x i64> [[TMP3]], zeroinitializer
; ZVL512-NEXT:    [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> [[TMP4]])
; ZVL512-NEXT:    br i1 [[TMP5]], label [[TMP7:%.*]], label [[TMP6:%.*]]
; ZVL512:       6:
; ZVL512-NEXT:    ret void
; ZVL512:       7:
; ZVL512-NEXT:    ret void
;
  %1 = shl i64 0, 0
  %2 = icmp ult i64 0, 0
  %3 = icmp ult i64 0, 0
  %4 = or i1 %2, %3
  %5 = icmp ult i64 0, 0
  %6 = or i1 %4, %5
  %7 = icmp ult i64 0, 0
  %8 = or i1 %6, %7
  %9 = icmp ult i64 0, 0
  %10 = or i1 %8, %9
  %11 = icmp ult i64 0, 0
  %12 = or i1 %10, %11
  %13 = icmp ult i64 0, 0
  %14 = or i1 %12, %13
  %15 = icmp ult i64 0, 0
  %16 = or i1 %14, %15
  %17 = icmp ult i64 0, 0
  %18 = or i1 %16, %17
  %19 = icmp ult i64 0, 0
  %20 = or i1 %18, %19
  %21 = icmp ult i64 0, 0
  %22 = or i1 %20, %21
  %23 = icmp ult i64 0, 0
  %24 = or i1 %22, %23
  %25 = icmp ult i64 0, 0
  %26 = or i1 %24, %25
  %27 = icmp ult i64 0, 0
  %28 = or i1 %26, %27
  %29 = icmp ult i64 0, 0
  %30 = or i1 %28, %29
  %31 = icmp ult i64 %1, 0
  %32 = or i1 %30, %31
  %33 = icmp ult i64 0, 0
  %34 = or i1 %32, %33
  %35 = icmp ult i64 0, 0
  %36 = or i1 %34, %35
  %37 = icmp ult i64 0, 0
  %38 = or i1 %36, %37
  %39 = icmp ult i64 0, 0
  %40 = or i1 %38, %39
  %41 = icmp ult i64 0, 0
  %42 = or i1 %40, %41
  %43 = icmp ult i64 0, 0
  %44 = or i1 %42, %43
  %45 = icmp ult i64 %1, 0
  %46 = or i1 %44, %45
  %47 = icmp ult i64 0, 0
  %48 = or i1 %46, %47
  %49 = icmp ult i64 0, 0
  %50 = or i1 %48, %49
  %51 = icmp ult i64 0, 0
  %52 = or i1 %50, %51
  %53 = icmp ult i64 0, 0
  %54 = or i1 %52, %53
  %55 = icmp ult i64 0, 0
  %56 = or i1 %54, %55
  %57 = icmp ult i64 0, 0
  %58 = or i1 %56, %57
  %59 = icmp ult i64 0, 0
  %60 = or i1 %58, %59
  %61 = icmp ult i64 0, 0
  %62 = or i1 %60, %61
  %63 = icmp ult i64 0, 0
  %64 = or i1 %62, %63
  br i1 %64, label %66, label %65

65:                                               ; preds = %0
  ret void

66:                                               ; preds = %0
  ret void
}

define i8 @reduce_xor(ptr %a, ptr %b) {
; CHECK-LABEL: @reduce_xor(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_BUF:%.*]], ptr [[A:%.*]], i64 0, i32 0, i64 0
; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds [[STRUCT_BUF]], ptr [[B:%.*]], i64 0, i32 0, i64 0
; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 1
; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX3]], align 1
; CHECK-NEXT:    [[TMP2:%.*]] = and <8 x i8> [[TMP1]], [[TMP0]]
; CHECK-NEXT:    [[TMP3:%.*]] = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> [[TMP2]])
; CHECK-NEXT:    [[OP_RDX:%.*]] = xor i8 [[TMP3]], 1
; CHECK-NEXT:    ret i8 [[OP_RDX]]
;
entry:
  %arrayidx = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 0
  %0 = load i8, ptr %arrayidx, align 1
  %arrayidx3 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 0
  %1 = load i8, ptr %arrayidx3, align 1
  %and12 = and i8 %1, %0
  %arrayidx.1 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 1
  %2 = load i8, ptr %arrayidx.1, align 1
  %arrayidx3.1 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 1
  %3 = load i8, ptr %arrayidx3.1, align 1
  %and12.1 = and i8 %3, %2
  %4 = xor i8 %and12, %and12.1
  %arrayidx.2 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 2
  %5 = load i8, ptr %arrayidx.2, align 1
  %arrayidx3.2 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 2
  %6 = load i8, ptr %arrayidx3.2, align 1
  %and12.2 = and i8 %6, %5
  %7 = xor i8 %4, %and12.2
  %arrayidx.3 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 3
  %8 = load i8, ptr %arrayidx.3, align 1
  %arrayidx3.3 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 3
  %9 = load i8, ptr %arrayidx3.3, align 1
  %and12.3 = and i8 %9, %8
  %10 = xor i8 %7, %and12.3
  %arrayidx.4 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 4
  %11 = load i8, ptr %arrayidx.4, align 1
  %arrayidx3.4 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 4
  %12 = load i8, ptr %arrayidx3.4, align 1
  %and12.4 = and i8 %12, %11
  %13 = xor i8 %10, %and12.4
  %arrayidx.5 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 5
  %14 = load i8, ptr %arrayidx.5, align 1
  %arrayidx3.5 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 5
  %15 = load i8, ptr %arrayidx3.5, align 1
  %and12.5 = and i8 %15, %14
  %16 = xor i8 %13, %and12.5
  %arrayidx.6 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 6
  %17 = load i8, ptr %arrayidx.6, align 1
  %arrayidx3.6 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 6
  %18 = load i8, ptr %arrayidx3.6, align 1
  %and12.6 = and i8 %18, %17
  %19 = xor i8 %16, %and12.6
  %arrayidx.7 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 7
  %20 = load i8, ptr %arrayidx.7, align 1
  %arrayidx3.7 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 7
  %21 = load i8, ptr %arrayidx3.7, align 1
  %and12.7 = and i8 %21, %20
  %22 = xor i8 %19, %and12.7
  %xor13.7 = xor i8 %22, 1
  ret i8 %xor13.7
}

define i8 @reduce_add(ptr %a, ptr %b) {
; CHECK-LABEL: @reduce_add(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_BUF:%.*]], ptr [[A:%.*]], i64 0, i32 0, i64 0
; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds [[STRUCT_BUF]], ptr [[B:%.*]], i64 0, i32 0, i64 0
; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 1
; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX3]], align 1
; CHECK-NEXT:    [[TMP2:%.*]] = and <8 x i8> [[TMP1]], [[TMP0]]
; CHECK-NEXT:    [[TMP3:%.*]] = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> [[TMP2]])
; CHECK-NEXT:    [[OP_RDX:%.*]] = add i8 [[TMP3]], 1
; CHECK-NEXT:    ret i8 [[OP_RDX]]
;
entry:
  %arrayidx = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 0
  %0 = load i8, ptr %arrayidx, align 1
  %arrayidx3 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 0
  %1 = load i8, ptr %arrayidx3, align 1
  %and12 = and i8 %1, %0
  %arrayidx.1 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 1
  %2 = load i8, ptr %arrayidx.1, align 1
  %arrayidx3.1 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 1
  %3 = load i8, ptr %arrayidx3.1, align 1
  %and12.1 = and i8 %3, %2
  %4 = add i8 %and12, %and12.1
  %arrayidx.2 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 2
  %5 = load i8, ptr %arrayidx.2, align 1
  %arrayidx3.2 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 2
  %6 = load i8, ptr %arrayidx3.2, align 1
  %and12.2 = and i8 %6, %5
  %7 = add i8 %4, %and12.2
  %arrayidx.3 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 3
  %8 = load i8, ptr %arrayidx.3, align 1
  %arrayidx3.3 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 3
  %9 = load i8, ptr %arrayidx3.3, align 1
  %and12.3 = and i8 %9, %8
  %10 = add i8 %7, %and12.3
  %arrayidx.4 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 4
  %11 = load i8, ptr %arrayidx.4, align 1
  %arrayidx3.4 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 4
  %12 = load i8, ptr %arrayidx3.4, align 1
  %and12.4 = and i8 %12, %11
  %13 = add i8 %10, %and12.4
  %arrayidx.5 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 5
  %14 = load i8, ptr %arrayidx.5, align 1
  %arrayidx3.5 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 5
  %15 = load i8, ptr %arrayidx3.5, align 1
  %and12.5 = and i8 %15, %14
  %16 = add i8 %13, %and12.5
  %arrayidx.6 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 6
  %17 = load i8, ptr %arrayidx.6, align 1
  %arrayidx3.6 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 6
  %18 = load i8, ptr %arrayidx3.6, align 1
  %and12.6 = and i8 %18, %17
  %19 = add i8 %16, %and12.6
  %arrayidx.7 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 7
  %20 = load i8, ptr %arrayidx.7, align 1
  %arrayidx3.7 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 7
  %21 = load i8, ptr %arrayidx3.7, align 1
  %and12.7 = and i8 %21, %20
  %22 = add i8 %19, %and12.7
  %add13.7 = add i8 %22, 1
  ret i8 %add13.7
}

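; The four min/max tests below follow the same and-then-reduce shape as the
; tests above; each scalar intrinsic chain should map onto the matching
; llvm.vector.reduce.{smin,smax,umax,umin} call.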
declare i8 @llvm.smin.i8(i8, i8)

define i8 @reduce_smin(ptr %a, ptr %b) {
; CHECK-LABEL: @reduce_smin(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_BUF:%.*]], ptr [[A:%.*]], i64 0, i32 0, i64 0
; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds [[STRUCT_BUF]], ptr [[B:%.*]], i64 0, i32 0, i64 0
; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 1
; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX3]], align 1
; CHECK-NEXT:    [[TMP2:%.*]] = and <8 x i8> [[TMP1]], [[TMP0]]
; CHECK-NEXT:    [[TMP3:%.*]] = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> [[TMP2]])
; CHECK-NEXT:    ret i8 [[TMP3]]
;
entry:
  %arrayidx = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 0
  %0 = load i8, ptr %arrayidx, align 1
  %arrayidx3 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 0
  %1 = load i8, ptr %arrayidx3, align 1
  %and12 = and i8 %1, %0
  %arrayidx.1 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 1
  %2 = load i8, ptr %arrayidx.1, align 1
  %arrayidx3.1 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 1
  %3 = load i8, ptr %arrayidx3.1, align 1
  %and12.1 = and i8 %3, %2
  %4 = tail call i8 @llvm.smin.i8(i8 %and12, i8 %and12.1)
  %arrayidx.2 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 2
  %5 = load i8, ptr %arrayidx.2, align 1
  %arrayidx3.2 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 2
  %6 = load i8, ptr %arrayidx3.2, align 1
  %and12.2 = and i8 %6, %5
  %7 = tail call i8 @llvm.smin.i8(i8 %4, i8 %and12.2)
  %arrayidx.3 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 3
  %8 = load i8, ptr %arrayidx.3, align 1
  %arrayidx3.3 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 3
  %9 = load i8, ptr %arrayidx3.3, align 1
  %and12.3 = and i8 %9, %8
  %10 = tail call i8 @llvm.smin.i8(i8 %7, i8 %and12.3)
  %arrayidx.4 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 4
  %11 = load i8, ptr %arrayidx.4, align 1
  %arrayidx3.4 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 4
  %12 = load i8, ptr %arrayidx3.4, align 1
  %and12.4 = and i8 %12, %11
  %13 = tail call i8 @llvm.smin.i8(i8 %10, i8 %and12.4)
  %arrayidx.5 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 5
  %14 = load i8, ptr %arrayidx.5, align 1
  %arrayidx3.5 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 5
  %15 = load i8, ptr %arrayidx3.5, align 1
  %and12.5 = and i8 %15, %14
  %16 = tail call i8 @llvm.smin.i8(i8 %13, i8 %and12.5)
  %arrayidx.6 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 6
  %17 = load i8, ptr %arrayidx.6, align 1
  %arrayidx3.6 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 6
  %18 = load i8, ptr %arrayidx3.6, align 1
  %and12.6 = and i8 %18, %17
  %19 = tail call i8 @llvm.smin.i8(i8 %16, i8 %and12.6)
  %arrayidx.7 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 7
  %20 = load i8, ptr %arrayidx.7, align 1
  %arrayidx3.7 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 7
  %21 = load i8, ptr %arrayidx3.7, align 1
  %and12.7 = and i8 %21, %20
  %22 = tail call i8 @llvm.smin.i8(i8 %19, i8 %and12.7)
  ret i8 %22
}

declare i8 @llvm.smax.i8(i8, i8)

define i8 @reduce_smax(ptr %a, ptr %b) {
; CHECK-LABEL: @reduce_smax(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_BUF:%.*]], ptr [[A:%.*]], i64 0, i32 0, i64 0
; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds [[STRUCT_BUF]], ptr [[B:%.*]], i64 0, i32 0, i64 0
; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 1
; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX3]], align 1
; CHECK-NEXT:    [[TMP2:%.*]] = and <8 x i8> [[TMP1]], [[TMP0]]
; CHECK-NEXT:    [[TMP3:%.*]] = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> [[TMP2]])
; CHECK-NEXT:    ret i8 [[TMP3]]
;
entry:
  %arrayidx = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 0
  %0 = load i8, ptr %arrayidx, align 1
  %arrayidx3 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 0
  %1 = load i8, ptr %arrayidx3, align 1
  %and12 = and i8 %1, %0
  %arrayidx.1 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 1
  %2 = load i8, ptr %arrayidx.1, align 1
  %arrayidx3.1 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 1
  %3 = load i8, ptr %arrayidx3.1, align 1
  %and12.1 = and i8 %3, %2
  %4 = tail call i8 @llvm.smax.i8(i8 %and12, i8 %and12.1)
  %arrayidx.2 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 2
  %5 = load i8, ptr %arrayidx.2, align 1
  %arrayidx3.2 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 2
  %6 = load i8, ptr %arrayidx3.2, align 1
  %and12.2 = and i8 %6, %5
  %7 = tail call i8 @llvm.smax.i8(i8 %4, i8 %and12.2)
  %arrayidx.3 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 3
  %8 = load i8, ptr %arrayidx.3, align 1
  %arrayidx3.3 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 3
  %9 = load i8, ptr %arrayidx3.3, align 1
  %and12.3 = and i8 %9, %8
  %10 = tail call i8 @llvm.smax.i8(i8 %7, i8 %and12.3)
  %arrayidx.4 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 4
  %11 = load i8, ptr %arrayidx.4, align 1
  %arrayidx3.4 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 4
  %12 = load i8, ptr %arrayidx3.4, align 1
  %and12.4 = and i8 %12, %11
  %13 = tail call i8 @llvm.smax.i8(i8 %10, i8 %and12.4)
  %arrayidx.5 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 5
  %14 = load i8, ptr %arrayidx.5, align 1
  %arrayidx3.5 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 5
  %15 = load i8, ptr %arrayidx3.5, align 1
  %and12.5 = and i8 %15, %14
  %16 = tail call i8 @llvm.smax.i8(i8 %13, i8 %and12.5)
  %arrayidx.6 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 6
  %17 = load i8, ptr %arrayidx.6, align 1
  %arrayidx3.6 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 6
  %18 = load i8, ptr %arrayidx3.6, align 1
  %and12.6 = and i8 %18, %17
  %19 = tail call i8 @llvm.smax.i8(i8 %16, i8 %and12.6)
  %arrayidx.7 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 7
  %20 = load i8, ptr %arrayidx.7, align 1
  %arrayidx3.7 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 7
  %21 = load i8, ptr %arrayidx3.7, align 1
  %and12.7 = and i8 %21, %20
  %22 = tail call i8 @llvm.smax.i8(i8 %19, i8 %and12.7)
  ret i8 %22
}

declare i8 @llvm.umax.i8(i8, i8)

define i8 @reduce_umax(ptr %a, ptr %b) {
; CHECK-LABEL: @reduce_umax(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_BUF:%.*]], ptr [[A:%.*]], i64 0, i32 0, i64 0
; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds [[STRUCT_BUF]], ptr [[B:%.*]], i64 0, i32 0, i64 0
; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 1
; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX3]], align 1
; CHECK-NEXT:    [[TMP2:%.*]] = and <8 x i8> [[TMP1]], [[TMP0]]
; CHECK-NEXT:    [[TMP3:%.*]] = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> [[TMP2]])
; CHECK-NEXT:    ret i8 [[TMP3]]
;
entry:
  %arrayidx = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 0
  %0 = load i8, ptr %arrayidx, align 1
  %arrayidx3 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 0
  %1 = load i8, ptr %arrayidx3, align 1
  %and12 = and i8 %1, %0
  %arrayidx.1 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 1
  %2 = load i8, ptr %arrayidx.1, align 1
  %arrayidx3.1 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 1
  %3 = load i8, ptr %arrayidx3.1, align 1
  %and12.1 = and i8 %3, %2
  %4 = tail call i8 @llvm.umax.i8(i8 %and12, i8 %and12.1)
  %arrayidx.2 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 2
  %5 = load i8, ptr %arrayidx.2, align 1
  %arrayidx3.2 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 2
  %6 = load i8, ptr %arrayidx3.2, align 1
  %and12.2 = and i8 %6, %5
  %7 = tail call i8 @llvm.umax.i8(i8 %4, i8 %and12.2)
  %arrayidx.3 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 3
  %8 = load i8, ptr %arrayidx.3, align 1
  %arrayidx3.3 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 3
  %9 = load i8, ptr %arrayidx3.3, align 1
  %and12.3 = and i8 %9, %8
  %10 = tail call i8 @llvm.umax.i8(i8 %7, i8 %and12.3)
  %arrayidx.4 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 4
  %11 = load i8, ptr %arrayidx.4, align 1
  %arrayidx3.4 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 4
  %12 = load i8, ptr %arrayidx3.4, align 1
  %and12.4 = and i8 %12, %11
  %13 = tail call i8 @llvm.umax.i8(i8 %10, i8 %and12.4)
  %arrayidx.5 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 5
  %14 = load i8, ptr %arrayidx.5, align 1
  %arrayidx3.5 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 5
  %15 = load i8, ptr %arrayidx3.5, align 1
  %and12.5 = and i8 %15, %14
  %16 = tail call i8 @llvm.umax.i8(i8 %13, i8 %and12.5)
  %arrayidx.6 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 6
  %17 = load i8, ptr %arrayidx.6, align 1
  %arrayidx3.6 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 6
  %18 = load i8, ptr %arrayidx3.6, align 1
  %and12.6 = and i8 %18, %17
  %19 = tail call i8 @llvm.umax.i8(i8 %16, i8 %and12.6)
  %arrayidx.7 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 7
  %20 = load i8, ptr %arrayidx.7, align 1
  %arrayidx3.7 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 7
  %21 = load i8, ptr %arrayidx3.7, align 1
  %and12.7 = and i8 %21, %20
  %22 = tail call i8 @llvm.umax.i8(i8 %19, i8 %and12.7)
  ret i8 %22
}

declare i8 @llvm.umin.i8(i8, i8)

define i8 @reduce_umin(ptr %a, ptr %b) {
; CHECK-LABEL: @reduce_umin(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_BUF:%.*]], ptr [[A:%.*]], i64 0, i32 0, i64 0
; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds [[STRUCT_BUF]], ptr [[B:%.*]], i64 0, i32 0, i64 0
; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 1
; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX3]], align 1
; CHECK-NEXT:    [[TMP2:%.*]] = and <8 x i8> [[TMP1]], [[TMP0]]
; CHECK-NEXT:    [[TMP3:%.*]] = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> [[TMP2]])
; CHECK-NEXT:    ret i8 [[TMP3]]
;
entry:
  %arrayidx = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 0
  %0 = load i8, ptr %arrayidx, align 1
  %arrayidx3 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 0
  %1 = load i8, ptr %arrayidx3, align 1
  %and12 = and i8 %1, %0
  %arrayidx.1 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 1
  %2 = load i8, ptr %arrayidx.1, align 1
  %arrayidx3.1 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 1
  %3 = load i8, ptr %arrayidx3.1, align 1
  %and12.1 = and i8 %3, %2
  %4 = tail call i8 @llvm.umin.i8(i8 %and12, i8 %and12.1)
  %arrayidx.2 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 2
  %5 = load i8, ptr %arrayidx.2, align 1
  %arrayidx3.2 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 2
  %6 = load i8, ptr %arrayidx3.2, align 1
  %and12.2 = and i8 %6, %5
  %7 = tail call i8 @llvm.umin.i8(i8 %4, i8 %and12.2)
  %arrayidx.3 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 3
  %8 = load i8, ptr %arrayidx.3, align 1
  %arrayidx3.3 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 3
  %9 = load i8, ptr %arrayidx3.3, align 1
  %and12.3 = and i8 %9, %8
  %10 = tail call i8 @llvm.umin.i8(i8 %7, i8 %and12.3)
  %arrayidx.4 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 4
  %11 = load i8, ptr %arrayidx.4, align 1
  %arrayidx3.4 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 4
  %12 = load i8, ptr %arrayidx3.4, align 1
  %and12.4 = and i8 %12, %11
  %13 = tail call i8 @llvm.umin.i8(i8 %10, i8 %and12.4)
  %arrayidx.5 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 5
  %14 = load i8, ptr %arrayidx.5, align 1
  %arrayidx3.5 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 5
  %15 = load i8, ptr %arrayidx3.5, align 1
  %and12.5 = and i8 %15, %14
  %16 = tail call i8 @llvm.umin.i8(i8 %13, i8 %and12.5)
  %arrayidx.6 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 6
  %17 = load i8, ptr %arrayidx.6, align 1
  %arrayidx3.6 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 6
  %18 = load i8, ptr %arrayidx3.6, align 1
  %and12.6 = and i8 %18, %17
  %19 = tail call i8 @llvm.umin.i8(i8 %16, i8 %and12.6)
  %arrayidx.7 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 7
  %20 = load i8, ptr %arrayidx.7, align 1
  %arrayidx3.7 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 7
  %21 = load i8, ptr %arrayidx3.7, align 1
  %and12.7 = and i8 %21, %20
  %22 = tail call i8 @llvm.umin.i8(i8 %19, i8 %and12.7)
  ret i8 %22
}

; Next batch exercises reductions involving zext of narrower loads

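; The 2-element case again stays scalar. The 4-element case is narrowed:
; four zero-extended bytes cannot overflow i16, so the checks expect a v4i16
; add reduction plus a final zext, while the wider cases keep i64 lanes.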
define i64 @red_zext_ld_2xi64(ptr %ptr) {
; CHECK-LABEL: @red_zext_ld_2xi64(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[LD0:%.*]] = load i8, ptr [[PTR:%.*]], align 1
; CHECK-NEXT:    [[ZEXT:%.*]] = zext i8 [[LD0]] to i64
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 1
; CHECK-NEXT:    [[LD1:%.*]] = load i8, ptr [[GEP]], align 1
; CHECK-NEXT:    [[ZEXT_1:%.*]] = zext i8 [[LD1]] to i64
; CHECK-NEXT:    [[ADD_1:%.*]] = add nuw nsw i64 [[ZEXT]], [[ZEXT_1]]
; CHECK-NEXT:    ret i64 [[ADD_1]]
;
entry:
  %ld0 = load i8, ptr %ptr
  %zext = zext i8 %ld0 to i64
  %gep = getelementptr inbounds i8, ptr %ptr, i64 1
  %ld1 = load i8, ptr %gep
  %zext.1 = zext i8 %ld1 to i64
  %add.1 = add nuw nsw i64 %zext, %zext.1
  ret i64 %add.1
}

define i64 @red_zext_ld_4xi64(ptr %ptr) {
; CHECK-LABEL: @red_zext_ld_4xi64(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i8>, ptr [[PTR:%.*]], align 1
; CHECK-NEXT:    [[TMP1:%.*]] = zext <4 x i8> [[TMP0]] to <4 x i16>
; CHECK-NEXT:    [[TMP2:%.*]] = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> [[TMP1]])
; CHECK-NEXT:    [[ADD_3:%.*]] = zext i16 [[TMP2]] to i64
; CHECK-NEXT:    ret i64 [[ADD_3]]
;
entry:
  %ld0 = load i8, ptr %ptr
  %zext = zext i8 %ld0 to i64
  %gep = getelementptr inbounds i8, ptr %ptr, i64 1
  %ld1 = load i8, ptr %gep
  %zext.1 = zext i8 %ld1 to i64
  %add.1 = add nuw nsw i64 %zext, %zext.1
  %gep.1 = getelementptr inbounds i8, ptr %ptr, i64 2
  %ld2 = load i8, ptr %gep.1
  %zext.2 = zext i8 %ld2 to i64
  %add.2 = add nuw nsw i64 %add.1, %zext.2
  %gep.2 = getelementptr inbounds i8, ptr %ptr, i64 3
  %ld3 = load i8, ptr %gep.2
  %zext.3 = zext i8 %ld3 to i64
  %add.3 = add nuw nsw i64 %add.2, %zext.3
  ret i64 %add.3
}

define i64 @red_zext_ld_8xi64(ptr %ptr) {
; CHECK-LABEL: @red_zext_ld_8xi64(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[PTR:%.*]], align 1
; CHECK-NEXT:    [[TMP1:%.*]] = zext <8 x i8> [[TMP0]] to <8 x i64>
; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP1]])
; CHECK-NEXT:    ret i64 [[TMP2]]
;
entry:
  %ld0 = load i8, ptr %ptr
  %zext = zext i8 %ld0 to i64
  %gep = getelementptr inbounds i8, ptr %ptr, i64 1
  %ld1 = load i8, ptr %gep
  %zext.1 = zext i8 %ld1 to i64
  %add.1 = add nuw nsw i64 %zext, %zext.1
  %gep.1 = getelementptr inbounds i8, ptr %ptr, i64 2
  %ld2 = load i8, ptr %gep.1
  %zext.2 = zext i8 %ld2 to i64
  %add.2 = add nuw nsw i64 %add.1, %zext.2
  %gep.2 = getelementptr inbounds i8, ptr %ptr, i64 3
  %ld3 = load i8, ptr %gep.2
  %zext.3 = zext i8 %ld3 to i64
  %add.3 = add nuw nsw i64 %add.2, %zext.3
  %gep.3 = getelementptr inbounds i8, ptr %ptr, i64 4
  %ld4 = load i8, ptr %gep.3
  %zext.4 = zext i8 %ld4 to i64
  %add.4 = add nuw nsw i64 %add.3, %zext.4
  %gep.4 = getelementptr inbounds i8, ptr %ptr, i64 5
  %ld5 = load i8, ptr %gep.4
  %zext.5 = zext i8 %ld5 to i64
  %add.5 = add nuw nsw i64 %add.4, %zext.5
  %gep.5 = getelementptr inbounds i8, ptr %ptr, i64 6
  %ld6 = load i8, ptr %gep.5
  %zext.6 = zext i8 %ld6 to i64
  %add.6 = add nuw nsw i64 %add.5, %zext.6
  %gep.6 = getelementptr inbounds i8, ptr %ptr, i64 7
  %ld7 = load i8, ptr %gep.6
  %zext.7 = zext i8 %ld7 to i64
  %add.7 = add nuw nsw i64 %add.6, %zext.7
  ret i64 %add.7
}

define i64 @red_zext_ld_16xi64(ptr %ptr) {
; CHECK-LABEL: @red_zext_ld_16xi64(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i8>, ptr [[PTR:%.*]], align 1
; CHECK-NEXT:    [[TMP1:%.*]] = zext <16 x i8> [[TMP0]] to <16 x i64>
; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> [[TMP1]])
; CHECK-NEXT:    ret i64 [[TMP2]]
;
entry:
  %ld0 = load i8, ptr %ptr
  %zext = zext i8 %ld0 to i64
  %gep = getelementptr inbounds i8, ptr %ptr, i64 1
  %ld1 = load i8, ptr %gep
  %zext.1 = zext i8 %ld1 to i64
  %add.1 = add nuw nsw i64 %zext, %zext.1
  %gep.1 = getelementptr inbounds i8, ptr %ptr, i64 2
  %ld2 = load i8, ptr %gep.1
  %zext.2 = zext i8 %ld2 to i64
  %add.2 = add nuw nsw i64 %add.1, %zext.2
  %gep.2 = getelementptr inbounds i8, ptr %ptr, i64 3
  %ld3 = load i8, ptr %gep.2
  %zext.3 = zext i8 %ld3 to i64
  %add.3 = add nuw nsw i64 %add.2, %zext.3
  %gep.3 = getelementptr inbounds i8, ptr %ptr, i64 4
  %ld4 = load i8, ptr %gep.3
  %zext.4 = zext i8 %ld4 to i64
  %add.4 = add nuw nsw i64 %add.3, %zext.4
  %gep.4 = getelementptr inbounds i8, ptr %ptr, i64 5
  %ld5 = load i8, ptr %gep.4
  %zext.5 = zext i8 %ld5 to i64
  %add.5 = add nuw nsw i64 %add.4, %zext.5
  %gep.5 = getelementptr inbounds i8, ptr %ptr, i64 6
  %ld6 = load i8, ptr %gep.5
  %zext.6 = zext i8 %ld6 to i64
  %add.6 = add nuw nsw i64 %add.5, %zext.6
  %gep.6 = getelementptr inbounds i8, ptr %ptr, i64 7
  %ld7 = load i8, ptr %gep.6
  %zext.7 = zext i8 %ld7 to i64
  %add.7 = add nuw nsw i64 %add.6, %zext.7
  %gep.7 = getelementptr inbounds i8, ptr %ptr, i64 8
  %ld8 = load i8, ptr %gep.7
  %zext.8 = zext i8 %ld8 to i64
  %add.8 = add nuw nsw i64 %add.7, %zext.8
  %gep.8 = getelementptr inbounds i8, ptr %ptr, i64 9
  %ld9 = load i8, ptr %gep.8
  %zext.9 = zext i8 %ld9 to i64
  %add.9 = add nuw nsw i64 %add.8, %zext.9
  %gep.9 = getelementptr inbounds i8, ptr %ptr, i64 10
  %ld10 = load i8, ptr %gep.9
  %zext.10 = zext i8 %ld10 to i64
  %add.10 = add nuw nsw i64 %add.9, %zext.10
  %gep.10 = getelementptr inbounds i8, ptr %ptr, i64 11
  %ld11 = load i8, ptr %gep.10
  %zext.11 = zext i8 %ld11 to i64
  %add.11 = add nuw nsw i64 %add.10, %zext.11
  %gep.11 = getelementptr inbounds i8, ptr %ptr, i64 12
  %ld12 = load i8, ptr %gep.11
  %zext.12 = zext i8 %ld12 to i64
  %add.12 = add nuw nsw i64 %add.11, %zext.12
  %gep.12 = getelementptr inbounds i8, ptr %ptr, i64 13
  %ld13 = load i8, ptr %gep.12
  %zext.13 = zext i8 %ld13 to i64
  %add.13 = add nuw nsw i64 %add.12, %zext.13
  %gep.13 = getelementptr inbounds i8, ptr %ptr, i64 14
  %ld14 = load i8, ptr %gep.13
  %zext.14 = zext i8 %ld14 to i64
  %add.14 = add nuw nsw i64 %add.13, %zext.14
  %gep.14 = getelementptr inbounds i8, ptr %ptr, i64 15
  %ld15 = load i8, ptr %gep.14
  %zext.15 = zext i8 %ld15 to i64
  %add.15 = add nuw nsw i64 %add.14, %zext.15
  ret i64 %add.15
}

declare i32 @llvm.abs.i32(i32, i1)

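; Sum of absolute differences over two 2-element groups separated by a
; runtime stride: the 2-wide loads should be concatenated into a 4-wide
; vector via llvm.vector.insert before a single vector abs + add reduction.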
define i32 @stride_sum_abs_diff(ptr %p, ptr %q, i64 %stride) {
; CHECK-LABEL: @stride_sum_abs_diff(
; CHECK-NEXT:    [[P_2:%.*]] = getelementptr inbounds i32, ptr [[P:%.*]], i64 [[STRIDE:%.*]]
; CHECK-NEXT:    [[Q_2:%.*]] = getelementptr inbounds i32, ptr [[Q:%.*]], i64 [[STRIDE]]
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[P]], align 4
; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[Q]], align 4
; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, ptr [[P_2]], align 4
; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i32>, ptr [[Q_2]], align 4
; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> poison, <2 x i32> [[TMP1]], i64 0)
; CHECK-NEXT:    [[TMP6:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP5]], <2 x i32> [[TMP3]], i64 2)
; CHECK-NEXT:    [[TMP7:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> poison, <2 x i32> [[TMP2]], i64 0)
; CHECK-NEXT:    [[TMP8:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP7]], <2 x i32> [[TMP4]], i64 2)
; CHECK-NEXT:    [[TMP9:%.*]] = sub <4 x i32> [[TMP6]], [[TMP8]]
; CHECK-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP9]], i1 true)
; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP10]])
; CHECK-NEXT:    ret i32 [[TMP11]]
;
  %x.0 = load i32, ptr %p
  %y.0 = load i32, ptr %q
  %sub.0 = sub i32 %x.0, %y.0
  %abs.0 = tail call i32 @llvm.abs.i32(i32 %sub.0, i1 true)

  %p.1 = getelementptr inbounds i32, ptr %p, i64 1
  %x.1 = load i32, ptr %p.1
  %q.1 = getelementptr inbounds i32, ptr %q, i64 1
  %y.1 = load i32, ptr %q.1
  %sub.1 = sub i32 %x.1, %y.1
  %abs.1 = tail call i32 @llvm.abs.i32(i32 %sub.1, i1 true)
  %sum.0 = add i32 %abs.0, %abs.1

  %p.2 = getelementptr inbounds i32, ptr %p, i64 %stride
  %q.2 = getelementptr inbounds i32, ptr %q, i64 %stride

  %x.2 = load i32, ptr %p.2
  %y.2 = load i32, ptr %q.2
  %sub.2 = sub i32 %x.2, %y.2
  %abs.2 = tail call i32 @llvm.abs.i32(i32 %sub.2, i1 true)
  %sum.1 = add i32 %sum.0, %abs.2

  %p.3 = getelementptr inbounds i32, ptr %p.2, i64 1
  %x.3 = load i32, ptr %p.3
  %q.3 = getelementptr inbounds i32, ptr %q.2, i64 1
  %y.3 = load i32, ptr %q.3
  %sub.3 = sub i32 %x.3, %y.3
  %abs.3 = tail call i32 @llvm.abs.i32(i32 %sub.3, i1 true)
  %sum.2 = add i32 %sum.1, %abs.3

  ret i32 %sum.2
}

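; Accumulates byte sums from two noalias arrays in one interleaved chain;
; the two 4-byte blocks should likewise be concatenated with
; llvm.vector.insert, zero-extended, and reduced with one v8i32 add.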
define i32 @reduce_sum_2arrays_a(ptr noalias %p, ptr noalias %q) {
; CHECK-LABEL: @reduce_sum_2arrays_a(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i8>, ptr [[P:%.*]], align 1
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[Q:%.*]], align 1
; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i8> @llvm.vector.insert.v8i8.v4i8(<8 x i8> poison, <4 x i8> [[TMP0]], i64 0)
; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i8> @llvm.vector.insert.v8i8.v4i8(<8 x i8> [[TMP2]], <4 x i8> [[TMP1]], i64 4)
; CHECK-NEXT:    [[TMP4:%.*]] = zext <8 x i8> [[TMP3]] to <8 x i32>
; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP4]])
; CHECK-NEXT:    ret i32 [[TMP5]]
;
entry:
  %x.0 = load i8, ptr %p, align 1
  %conv = zext i8 %x.0 to i32
  %y.0 = load i8, ptr %q, align 1
  %conv3 = zext i8 %y.0 to i32
  %add4 = add nuw nsw i32 %conv, %conv3

  %arrayidx.1 = getelementptr inbounds i8, ptr %p, i64 1
  %x.1 = load i8, ptr %arrayidx.1, align 1
  %conv.1 = zext i8 %x.1 to i32
  %arrayidx2.1 = getelementptr inbounds i8, ptr %q, i64 1
  %y.1 = load i8, ptr %arrayidx2.1, align 1
  %conv3.1 = zext i8 %y.1 to i32
  %add.1 = add nuw nsw i32 %add4, %conv.1
  %add4.1 = add nuw nsw i32 %add.1, %conv3.1

  %arrayidx.2 = getelementptr inbounds i8, ptr %p, i64 2
  %x.2 = load i8, ptr %arrayidx.2, align 1
  %conv.2 = zext i8 %x.2 to i32
  %arrayidx2.2 = getelementptr inbounds i8, ptr %q, i64 2
  %y.2 = load i8, ptr %arrayidx2.2, align 1
  %conv3.2 = zext i8 %y.2 to i32
  %add.2 = add nuw nsw i32 %add4.1, %conv.2
  %add4.2 = add nuw nsw i32 %add.2, %conv3.2

  %arrayidx.3 = getelementptr inbounds i8, ptr %p, i64 3
  %x.3 = load i8, ptr %arrayidx.3, align 1
  %conv.3 = zext i8 %x.3 to i32
  %arrayidx2.3 = getelementptr inbounds i8, ptr %q, i64 3
  %y.3 = load i8, ptr %arrayidx2.3, align 1
  %conv3.3 = zext i8 %y.3 to i32
  %add.3 = add nuw nsw i32 %add4.2, %conv.3
  %add4.3 = add nuw nsw i32 %add.3, %conv3.3

  ret i32 %add4.3
}

define i32 @reduce_sum_2arrays_b(ptr noalias noundef %x, ptr noalias %y) {
; CHECK-LABEL: @reduce_sum_2arrays_b(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i8>, ptr [[X:%.*]], align 1
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[Y:%.*]], align 1
; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i8> @llvm.vector.insert.v8i8.v4i8(<8 x i8> poison, <4 x i8> [[TMP0]], i64 0)
; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i8> @llvm.vector.insert.v8i8.v4i8(<8 x i8> [[TMP2]], <4 x i8> [[TMP1]], i64 4)
; CHECK-NEXT:    [[TMP4:%.*]] = zext <8 x i8> [[TMP3]] to <8 x i32>
; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP4]])
; CHECK-NEXT:    ret i32 [[TMP5]]
;
entry:
  %0 = load i8, ptr %x, align 1
  %conv = zext i8 %0 to i32
  %arrayidx.1 = getelementptr inbounds i8, ptr %x, i64 1
  %1 = load i8, ptr %arrayidx.1, align 1
  %conv.1 = zext i8 %1 to i32
  %add.1 = add nuw nsw i32 %conv, %conv.1
  %arrayidx.2 = getelementptr inbounds i8, ptr %x, i64 2
  %2 = load i8, ptr %arrayidx.2, align 1
  %conv.2 = zext i8 %2 to i32
  %add.2 = add nuw nsw i32 %add.1, %conv.2
  %arrayidx.3 = getelementptr inbounds i8, ptr %x, i64 3
  %3 = load i8, ptr %arrayidx.3, align 1
  %conv.3 = zext i8 %3 to i32
  %add.3 = add nuw nsw i32 %add.2, %conv.3
  %4 = load i8, ptr %y, align 1
  %conv9 = zext i8 %4 to i32
  %add10 = add nuw nsw i32 %add.3, %conv9
  %arrayidx8.1 = getelementptr inbounds i8, ptr %y, i64 1
  %5 = load i8, ptr %arrayidx8.1, align 1
  %conv9.1 = zext i8 %5 to i32
  %add10.1 = add nuw nsw i32 %add10, %conv9.1
  %arrayidx8.2 = getelementptr inbounds i8, ptr %y, i64 2
  %6 = load i8, ptr %arrayidx8.2, align 1
  %conv9.2 = zext i8 %6 to i32
  %add10.2 = add nuw nsw i32 %add10.1, %conv9.2
  %arrayidx8.3 = getelementptr inbounds i8, ptr %y, i64 3
  %7 = load i8, ptr %arrayidx8.3, align 1
  %conv9.3 = zext i8 %7 to i32
  %add10.3 = add nuw nsw i32 %add10.2, %conv9.3
  ret i32 %add10.3
}

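; The remaining tests cover bfloat/half floating-point reductions; whether
; they vectorize depends on the zvfh/zvfhmin/zvfbfmin attributes in the RUN
; lines above.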
; Shouldn't vectorize to a reduction because we can't promote it
define bfloat @fadd_4xbf16(ptr %p) {
; CHECK-LABEL: @fadd_4xbf16(
; CHECK-NEXT:    [[X0:%.*]] = load bfloat, ptr [[P:%.*]], align 2
; CHECK-NEXT:    [[P1:%.*]] = getelementptr bfloat, ptr [[P]], i32 1
; CHECK-NEXT:    [[X1:%.*]] = load bfloat, ptr [[P1]], align 2
; CHECK-NEXT:    [[P2:%.*]] = getelementptr bfloat, ptr [[P]], i32 2
; CHECK-NEXT:    [[X2:%.*]] = load bfloat, ptr [[P2]], align 2
; CHECK-NEXT:    [[P3:%.*]] = getelementptr bfloat, ptr [[P]], i32 3
; CHECK-NEXT:    [[X3:%.*]] = load bfloat, ptr [[P3]], align 2
; CHECK-NEXT:    [[R0:%.*]] = fadd fast bfloat [[X0]], [[X1]]
; CHECK-NEXT:    [[R1:%.*]] = fadd fast bfloat [[R0]], [[X2]]
; CHECK-NEXT:    [[R2:%.*]] = fadd fast bfloat [[R1]], [[X3]]
; CHECK-NEXT:    ret bfloat [[R2]]
;
  %x0 = load bfloat, ptr %p
  %p1 = getelementptr bfloat, ptr %p, i32 1
  %x1 = load bfloat, ptr %p1
  %p2 = getelementptr bfloat, ptr %p, i32 2
  %x2 = load bfloat, ptr %p2
  %p3 = getelementptr bfloat, ptr %p, i32 3
  %x3 = load bfloat, ptr %p3

  %r0 = fadd fast bfloat %x0, %x1
  %r1 = fadd fast bfloat %r0, %x2
  %r2 = fadd fast bfloat %r1, %x3

  ret bfloat %r2
}

; Shouldn't vectorize to a reduction because there's no vfred{u,o}mul.vs
define bfloat @fmul_4xbf16(ptr %p) {
; CHECK-LABEL: @fmul_4xbf16(
; CHECK-NEXT:    [[X0:%.*]] = load bfloat, ptr [[P:%.*]], align 2
; CHECK-NEXT:    [[P1:%.*]] = getelementptr bfloat, ptr [[P]], i32 1
; CHECK-NEXT:    [[X1:%.*]] = load bfloat, ptr [[P1]], align 2
; CHECK-NEXT:    [[P2:%.*]] = getelementptr bfloat, ptr [[P]], i32 2
; CHECK-NEXT:    [[X2:%.*]] = load bfloat, ptr [[P2]], align 2
; CHECK-NEXT:    [[P3:%.*]] = getelementptr bfloat, ptr [[P]], i32 3
; CHECK-NEXT:    [[X3:%.*]] = load bfloat, ptr [[P3]], align 2
; CHECK-NEXT:    [[R0:%.*]] = fmul fast bfloat [[X0]], [[X1]]
; CHECK-NEXT:    [[R1:%.*]] = fmul fast bfloat [[R0]], [[X2]]
; CHECK-NEXT:    [[R2:%.*]] = fmul fast bfloat [[R1]], [[X3]]
; CHECK-NEXT:    ret bfloat [[R2]]
;
  %x0 = load bfloat, ptr %p
  %p1 = getelementptr bfloat, ptr %p, i32 1
  %x1 = load bfloat, ptr %p1
  %p2 = getelementptr bfloat, ptr %p, i32 2
  %x2 = load bfloat, ptr %p2
  %p3 = getelementptr bfloat, ptr %p, i32 3
  %x3 = load bfloat, ptr %p3

  %r0 = fmul fast bfloat %x0, %x1
  %r1 = fmul fast bfloat %r0, %x2
  %r2 = fmul fast bfloat %r1, %x3

  ret bfloat %r2
}

; Shouldn't vectorize to a reduction on zvfhmin because we can't promote it
define half @fadd_4xf16(ptr %p) {
; ZVFHMIN-LABEL: @fadd_4xf16(
; ZVFHMIN-NEXT:    [[X0:%.*]] = load half, ptr [[P:%.*]], align 2
; ZVFHMIN-NEXT:    [[P1:%.*]] = getelementptr half, ptr [[P]], i32 1
; ZVFHMIN-NEXT:    [[X1:%.*]] = load half, ptr [[P1]], align 2
; ZVFHMIN-NEXT:    [[P2:%.*]] = getelementptr half, ptr [[P]], i32 2
; ZVFHMIN-NEXT:    [[X2:%.*]] = load half, ptr [[P2]], align 2
; ZVFHMIN-NEXT:    [[P3:%.*]] = getelementptr half, ptr [[P]], i32 3
; ZVFHMIN-NEXT:    [[X3:%.*]] = load half, ptr [[P3]], align 2
; ZVFHMIN-NEXT:    [[R0:%.*]] = fadd fast half [[X0]], [[X1]]
; ZVFHMIN-NEXT:    [[R1:%.*]] = fadd fast half [[R0]], [[X2]]
; ZVFHMIN-NEXT:    [[R2:%.*]] = fadd fast half [[R1]], [[X3]]
; ZVFHMIN-NEXT:    ret half [[R2]]
;
; ZVFH-LABEL: @fadd_4xf16(
; ZVFH-NEXT:    [[TMP1:%.*]] = load <4 x half>, ptr [[P:%.*]], align 2
; ZVFH-NEXT:    [[TMP2:%.*]] = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> [[TMP1]])
; ZVFH-NEXT:    ret half [[TMP2]]
;
  %x0 = load half, ptr %p
  %p1 = getelementptr half, ptr %p, i32 1
  %x1 = load half, ptr %p1
  %p2 = getelementptr half, ptr %p, i32 2
  %x2 = load half, ptr %p2
  %p3 = getelementptr half, ptr %p, i32 3
  %x3 = load half, ptr %p3

  %r0 = fadd fast half %x0, %x1
  %r1 = fadd fast half %r0, %x2
  %r2 = fadd fast half %r1, %x3

  ret half %r2
}

; Shouldn't vectorize to a reduction because there's no vfred{u,o}mul.vs
define half @fmul_4xf16(ptr %p) {
; CHECK-LABEL: @fmul_4xf16(
; CHECK-NEXT:    [[X0:%.*]] = load half, ptr [[P:%.*]], align 2
; CHECK-NEXT:    [[P1:%.*]] = getelementptr half, ptr [[P]], i32 1
; CHECK-NEXT:    [[X1:%.*]] = load half, ptr [[P1]], align 2
; CHECK-NEXT:    [[P2:%.*]] = getelementptr half, ptr [[P]], i32 2
; CHECK-NEXT:    [[X2:%.*]] = load half, ptr [[P2]], align 2
; CHECK-NEXT:    [[P3:%.*]] = getelementptr half, ptr [[P]], i32 3
; CHECK-NEXT:    [[X3:%.*]] = load half, ptr [[P3]], align 2
; CHECK-NEXT:    [[R0:%.*]] = fmul fast half [[X0]], [[X1]]
; CHECK-NEXT:    [[R1:%.*]] = fmul fast half [[R0]], [[X2]]
; CHECK-NEXT:    [[R2:%.*]] = fmul fast half [[R1]], [[X3]]
; CHECK-NEXT:    ret half [[R2]]
;
  %x0 = load half, ptr %p
  %p1 = getelementptr half, ptr %p, i32 1
  %x1 = load half, ptr %p1
  %p2 = getelementptr half, ptr %p, i32 2
  %x2 = load half, ptr %p2
  %p3 = getelementptr half, ptr %p, i32 3
  %x3 = load half, ptr %p3

  %r0 = fmul fast half %x0, %x1
  %r1 = fmul fast half %r0, %x2
  %r2 = fmul fast half %r1, %x3

  ret half %r2
}