; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv32 -mattr=+v,+m,+zbb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
; RUN: llc -mtriple=riscv64 -mattr=+v,+m,+zbb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64

define i32 @reduce_sum_2xi32(<2 x i32> %v) {
; CHECK-LABEL: reduce_sum_2xi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v9
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %e0 = extractelement <2 x i32> %v, i32 0
  %e1 = extractelement <2 x i32> %v, i32 1
  %add0 = add i32 %e0, %e1
  ret i32 %add0
}

define i32 @reduce_sum_4xi32(<4 x i32> %v) {
; CHECK-LABEL: reduce_sum_4xi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v9
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %e0 = extractelement <4 x i32> %v, i32 0
  %e1 = extractelement <4 x i32> %v, i32 1
  %e2 = extractelement <4 x i32> %v, i32 2
  %e3 = extractelement <4 x i32> %v, i32 3
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  %add2 = add i32 %add1, %e3
  ret i32 %add2
}

define i32 @reduce_sum_8xi32(<8 x i32> %v) {
; CHECK-LABEL: reduce_sum_8xi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT:    vmv.s.x v10, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %e0 = extractelement <8 x i32> %v, i32 0
  %e1 = extractelement <8 x i32> %v, i32 1
  %e2 = extractelement <8 x i32> %v, i32 2
  %e3 = extractelement <8 x i32> %v, i32 3
  %e4 = extractelement <8 x i32> %v, i32 4
  %e5 = extractelement <8 x i32> %v, i32 5
  %e6 = extractelement <8 x i32> %v, i32 6
  %e7 = extractelement <8 x i32> %v, i32 7
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  %add2 = add i32 %add1, %e3
  %add3 = add i32 %add2, %e4
  %add4 = add i32 %add3, %e5
  %add5 = add i32 %add4, %e6
  %add6 = add i32 %add5, %e7
  ret i32 %add6
}

define i32 @reduce_sum_16xi32(<16 x i32> %v) {
; CHECK-LABEL: reduce_sum_16xi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
; CHECK-NEXT:    vmv.s.x v12, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v12
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %e5 = extractelement <16 x i32> %v, i32 5
  %e6 = extractelement <16 x i32> %v, i32 6
  %e7 = extractelement <16 x i32> %v, i32 7
  %e8 = extractelement <16 x i32> %v, i32 8
  %e9 = extractelement <16 x i32> %v, i32 9
  %e10 = extractelement <16 x i32> %v, i32 10
  %e11 = extractelement <16 x i32> %v, i32 11
  %e12 = extractelement <16 x i32> %v, i32 12
  %e13 = extractelement <16 x i32> %v, i32 13
  %e14 = extractelement <16 x i32> %v, i32 14
  %e15 = extractelement <16 x i32> %v, i32 15
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  %add2 = add i32 %add1, %e3
  %add3 = add i32 %add2, %e4
  %add4 = add i32 %add3, %e5
  %add5 = add i32 %add4, %e6
  %add6 = add i32 %add5, %e7
  %add7 = add i32 %add6, %e8
  %add8 = add i32 %add7, %e9
  %add9 = add i32 %add8, %e10
  %add10 = add i32 %add9, %e11
  %add11 = add i32 %add10, %e12
  %add12 = add i32 %add11, %e13
  %add13 = add i32 %add12, %e14
  %add14 = add i32 %add13, %e15
  ret i32 %add14
}
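
; The tests below sum only a leading prefix of a <16 x i32> load.  The
; expected lowering sets VL to the prefix length and lets LMUL grow with it
; (mf2 for 2 elements, m1 up to 4, m2 up to 8, m4 up to 16 at e32, given the
; 128-bit minimum VLEN implied by +v).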
define i32 @reduce_sum_16xi32_prefix2(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v9
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %add0 = add i32 %e0, %e1
  ret i32 %add0
}

define i32 @reduce_sum_16xi32_prefix3(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 3, e32, m1, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v9
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  ret i32 %add1
}

define i32 @reduce_sum_16xi32_prefix4(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v9
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  %add2 = add i32 %add1, %e3
  ret i32 %add2
}

define i32 @reduce_sum_16xi32_prefix5(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v10, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  %add2 = add i32 %add1, %e3
  %add3 = add i32 %add2, %e4
  ret i32 %add3
}

define i32 @reduce_sum_16xi32_prefix6(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix6:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 6, e32, m2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v10, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %e5 = extractelement <16 x i32> %v, i32 5
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  %add2 = add i32 %add1, %e3
  %add3 = add i32 %add2, %e4
  %add4 = add i32 %add3, %e5
  ret i32 %add4
}

define i32 @reduce_sum_16xi32_prefix7(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix7:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 7, e32, m2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v10, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %e5 = extractelement <16 x i32> %v, i32 5
  %e6 = extractelement <16 x i32> %v, i32 6
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  %add2 = add i32 %add1, %e3
  %add3 = add i32 %add2, %e4
  %add4 = add i32 %add3, %e5
  %add5 = add i32 %add4, %e6
  ret i32 %add5
}

define i32 @reduce_sum_16xi32_prefix8(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v10, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %e5 = extractelement <16 x i32> %v, i32 5
  %e6 = extractelement <16 x i32> %v, i32 6
  %e7 = extractelement <16 x i32> %v, i32 7
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  %add2 = add i32 %add1, %e3
  %add3 = add i32 %add2, %e4
  %add4 = add i32 %add3, %e5
  %add5 = add i32 %add4, %e6
  %add6 = add i32 %add5, %e7
  ret i32 %add6
}
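
; From nine elements up, the prefix no longer fits at LMUL=2 with e32, so
; the expected sequences below step up to LMUL=4.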
define i32 @reduce_sum_16xi32_prefix9(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix9:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 9, e32, m4, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v12, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v12
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %e5 = extractelement <16 x i32> %v, i32 5
  %e6 = extractelement <16 x i32> %v, i32 6
  %e7 = extractelement <16 x i32> %v, i32 7
  %e8 = extractelement <16 x i32> %v, i32 8
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  %add2 = add i32 %add1, %e3
  %add3 = add i32 %add2, %e4
  %add4 = add i32 %add3, %e5
  %add5 = add i32 %add4, %e6
  %add6 = add i32 %add5, %e7
  %add7 = add i32 %add6, %e8
  ret i32 %add7
}

define i32 @reduce_sum_16xi32_prefix13(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix13:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 13, e32, m4, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v12, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v12
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %e5 = extractelement <16 x i32> %v, i32 5
  %e6 = extractelement <16 x i32> %v, i32 6
  %e7 = extractelement <16 x i32> %v, i32 7
  %e8 = extractelement <16 x i32> %v, i32 8
  %e9 = extractelement <16 x i32> %v, i32 9
  %e10 = extractelement <16 x i32> %v, i32 10
  %e11 = extractelement <16 x i32> %v, i32 11
  %e12 = extractelement <16 x i32> %v, i32 12
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  %add2 = add i32 %add1, %e3
  %add3 = add i32 %add2, %e4
  %add4 = add i32 %add3, %e5
  %add5 = add i32 %add4, %e6
  %add6 = add i32 %add5, %e7
  %add7 = add i32 %add6, %e8
  %add8 = add i32 %add7, %e9
  %add9 = add i32 %add8, %e10
  %add10 = add i32 %add9, %e11
  %add11 = add i32 %add10, %e12
  ret i32 %add11
}

define i32 @reduce_sum_16xi32_prefix14(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix14:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 14, e32, m4, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v12, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v12
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %e5 = extractelement <16 x i32> %v, i32 5
  %e6 = extractelement <16 x i32> %v, i32 6
  %e7 = extractelement <16 x i32> %v, i32 7
  %e8 = extractelement <16 x i32> %v, i32 8
  %e9 = extractelement <16 x i32> %v, i32 9
  %e10 = extractelement <16 x i32> %v, i32 10
  %e11 = extractelement <16 x i32> %v, i32 11
  %e12 = extractelement <16 x i32> %v, i32 12
  %e13 = extractelement <16 x i32> %v, i32 13
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  %add2 = add i32 %add1, %e3
  %add3 = add i32 %add2, %e4
  %add4 = add i32 %add3, %e5
  %add5 = add i32 %add4, %e6
  %add6 = add i32 %add5, %e7
  %add7 = add i32 %add6, %e8
  %add8 = add i32 %add7, %e9
  %add9 = add i32 %add8, %e10
  %add10 = add i32 %add9, %e11
  %add11 = add i32 %add10, %e12
  %add12 = add i32 %add11, %e13
  ret i32 %add12
}

define i32 @reduce_sum_16xi32_prefix15(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix15:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 15, e32, m4, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v12, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v12
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %e5 = extractelement <16 x i32> %v, i32 5
  %e6 = extractelement <16 x i32> %v, i32 6
  %e7 = extractelement <16 x i32> %v, i32 7
  %e8 = extractelement <16 x i32> %v, i32 8
  %e9 = extractelement <16 x i32> %v, i32 9
  %e10 = extractelement <16 x i32> %v, i32 10
  %e11 = extractelement <16 x i32> %v, i32 11
  %e12 = extractelement <16 x i32> %v, i32 12
  %e13 = extractelement <16 x i32> %v, i32 13
  %e14 = extractelement <16 x i32> %v, i32 14
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  %add2 = add i32 %add1, %e3
  %add3 = add i32 %add2, %e4
  %add4 = add i32 %add3, %e5
  %add5 = add i32 %add4, %e6
  %add6 = add i32 %add5, %e7
  %add7 = add i32 %add6, %e8
  %add8 = add i32 %add7, %e9
  %add9 = add i32 %add8, %e10
  %add10 = add i32 %add9, %e11
  %add11 = add i32 %add10, %e12
  %add12 = add i32 %add11, %e13
  %add13 = add i32 %add12, %e14
  ret i32 %add13
}

; Check that we can match with the operands reversed, but the
; reduction order unchanged.
define i32 @reduce_sum_4xi32_op_order(<4 x i32> %v) {
; CHECK-LABEL: reduce_sum_4xi32_op_order:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v9
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %e0 = extractelement <4 x i32> %v, i32 0
  %e1 = extractelement <4 x i32> %v, i32 1
  %e2 = extractelement <4 x i32> %v, i32 2
  %e3 = extractelement <4 x i32> %v, i32 3
  %add0 = add i32 %e1, %e0
  %add1 = add i32 %e2, %add0
  %add2 = add i32 %add1, %e3
  ret i32 %add2
}

; Negative test - Reduction order isn't compatible with the current
; incremental matching scheme.
define i32 @reduce_sum_4xi32_reduce_order(<4 x i32> %v) {
; RV32-LABEL: reduce_sum_4xi32_reduce_order:
; RV32:       # %bb.0:
; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
; RV32-NEXT:    vmv.x.s a0, v8
; RV32-NEXT:    vslidedown.vi v9, v8, 1
; RV32-NEXT:    vmv.x.s a1, v9
; RV32-NEXT:    vslidedown.vi v9, v8, 2
; RV32-NEXT:    vslidedown.vi v8, v8, 3
; RV32-NEXT:    vmv.x.s a2, v9
; RV32-NEXT:    vmv.x.s a3, v8
; RV32-NEXT:    add a1, a1, a2
; RV32-NEXT:    add a0, a0, a3
; RV32-NEXT:    add a0, a0, a1
; RV32-NEXT:    ret
;
; RV64-LABEL: reduce_sum_4xi32_reduce_order:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
; RV64-NEXT:    vmv.x.s a0, v8
; RV64-NEXT:    vslidedown.vi v9, v8, 1
; RV64-NEXT:    vmv.x.s a1, v9
; RV64-NEXT:    vslidedown.vi v9, v8, 2
; RV64-NEXT:    vslidedown.vi v8, v8, 3
; RV64-NEXT:    vmv.x.s a2, v9
; RV64-NEXT:    vmv.x.s a3, v8
; RV64-NEXT:    add a1, a1, a2
; RV64-NEXT:    add a0, a0, a3
; RV64-NEXT:    addw a0, a0, a1
; RV64-NEXT:    ret
  %e0 = extractelement <4 x i32> %v, i32 0
  %e1 = extractelement <4 x i32> %v, i32 1
  %e2 = extractelement <4 x i32> %v, i32 2
  %e3 = extractelement <4 x i32> %v, i32 3
  %add0 = add i32 %e1, %e2
  %add1 = add i32 %e0, %add0
  %add2 = add i32 %add1, %e3
  ret i32 %add2
}

;; Most of the corner cases are exercised above; the following just
;; makes sure that other opcodes work as expected.
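
; The scalar start value for each reduction below is the identity for its
; opcode: zero for xor/or/umax, all-ones for and/umin, INT32_MIN (lui a0,
; 524288, i.e. 0x80000000) for smax, and INT32_MAX (0x80000000 - 1) for
; smin.  For the idempotent two-element and/or/min/max cases no identity is
; materialized at all; element 0 of the source vector itself is reused as
; the start value (e.g. vredand.vs v8, v8, v8).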
define i32 @reduce_xor_16xi32_prefix2(ptr %p) {
; CHECK-LABEL: reduce_xor_16xi32_prefix2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vredxor.vs v8, v8, v9
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %xor0 = xor i32 %e0, %e1
  ret i32 %xor0
}

define i32 @reduce_xor_16xi32_prefix5(ptr %p) {
; CHECK-LABEL: reduce_xor_16xi32_prefix5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v10, zero
; CHECK-NEXT:    vredxor.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %xor0 = xor i32 %e0, %e1
  %xor1 = xor i32 %xor0, %e2
  %xor2 = xor i32 %xor1, %e3
  %xor3 = xor i32 %xor2, %e4
  ret i32 %xor3
}

define i32 @reduce_and_16xi32_prefix2(ptr %p) {
; CHECK-LABEL: reduce_and_16xi32_prefix2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vredand.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %and0 = and i32 %e0, %e1
  ret i32 %and0
}

define i32 @reduce_and_16xi32_prefix5(ptr %p) {
; CHECK-LABEL: reduce_and_16xi32_prefix5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
; CHECK-NEXT:    vmv.v.i v10, -1
; CHECK-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
; CHECK-NEXT:    vredand.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %and0 = and i32 %e0, %e1
  %and1 = and i32 %and0, %e2
  %and2 = and i32 %and1, %e3
  %and3 = and i32 %and2, %e4
  ret i32 %and3
}

define i32 @reduce_or_16xi32_prefix2(ptr %p) {
; CHECK-LABEL: reduce_or_16xi32_prefix2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vredor.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %or0 = or i32 %e0, %e1
  ret i32 %or0
}

define i32 @reduce_or_16xi32_prefix5(ptr %p) {
; CHECK-LABEL: reduce_or_16xi32_prefix5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v10, zero
; CHECK-NEXT:    vredor.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %or0 = or i32 %e0, %e1
  %or1 = or i32 %or0, %e2
  %or2 = or i32 %or1, %e3
  %or3 = or i32 %or2, %e4
  ret i32 %or3
}
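
; The min/max reductions below are built from chains of the scalar
; llvm.{s,u}{max,min} intrinsics rather than icmp/select idioms.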
declare i32 @llvm.smax.i32(i32 %a, i32 %b)
declare i32 @llvm.smin.i32(i32 %a, i32 %b)
declare i32 @llvm.umax.i32(i32 %a, i32 %b)
declare i32 @llvm.umin.i32(i32 %a, i32 %b)

define i32 @reduce_smax_16xi32_prefix2(ptr %p) {
; CHECK-LABEL: reduce_smax_16xi32_prefix2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vredmax.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %smax0 = call i32 @llvm.smax.i32(i32 %e0, i32 %e1)
  ret i32 %smax0
}

define i32 @reduce_smax_16xi32_prefix5(ptr %p) {
; CHECK-LABEL: reduce_smax_16xi32_prefix5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    lui a0, 524288
; CHECK-NEXT:    vmv.s.x v10, a0
; CHECK-NEXT:    vredmax.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %smax0 = call i32 @llvm.smax.i32(i32 %e0, i32 %e1)
  %smax1 = call i32 @llvm.smax.i32(i32 %smax0, i32 %e2)
  %smax2 = call i32 @llvm.smax.i32(i32 %smax1, i32 %e3)
  %smax3 = call i32 @llvm.smax.i32(i32 %smax2, i32 %e4)
  ret i32 %smax3
}

define i32 @reduce_smin_16xi32_prefix2(ptr %p) {
; CHECK-LABEL: reduce_smin_16xi32_prefix2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vredmin.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %smin0 = call i32 @llvm.smin.i32(i32 %e0, i32 %e1)
  ret i32 %smin0
}

define i32 @reduce_smin_16xi32_prefix5(ptr %p) {
; CHECK-LABEL: reduce_smin_16xi32_prefix5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    lui a0, 524288
; CHECK-NEXT:    addi a0, a0, -1
; CHECK-NEXT:    vmv.s.x v10, a0
; CHECK-NEXT:    vredmin.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %smin0 = call i32 @llvm.smin.i32(i32 %e0, i32 %e1)
  %smin1 = call i32 @llvm.smin.i32(i32 %smin0, i32 %e2)
  %smin2 = call i32 @llvm.smin.i32(i32 %smin1, i32 %e3)
  %smin3 = call i32 @llvm.smin.i32(i32 %smin2, i32 %e4)
  ret i32 %smin3
}
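
; umax can reuse zero as its start value; umin needs all-ones, which RV32
; and RV64 materialize differently below (vmv.v.i vs. li + vmv.s.x), hence
; the split check prefixes.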
define i32 @reduce_umax_16xi32_prefix2(ptr %p) {
; CHECK-LABEL: reduce_umax_16xi32_prefix2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vredmaxu.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %umax0 = call i32 @llvm.umax.i32(i32 %e0, i32 %e1)
  ret i32 %umax0
}

define i32 @reduce_umax_16xi32_prefix5(ptr %p) {
; CHECK-LABEL: reduce_umax_16xi32_prefix5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v10, zero
; CHECK-NEXT:    vredmaxu.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %umax0 = call i32 @llvm.umax.i32(i32 %e0, i32 %e1)
  %umax1 = call i32 @llvm.umax.i32(i32 %umax0, i32 %e2)
  %umax2 = call i32 @llvm.umax.i32(i32 %umax1, i32 %e3)
  %umax3 = call i32 @llvm.umax.i32(i32 %umax2, i32 %e4)
  ret i32 %umax3
}

define i32 @reduce_umin_16xi32_prefix2(ptr %p) {
; CHECK-LABEL: reduce_umin_16xi32_prefix2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vredminu.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %umin0 = call i32 @llvm.umin.i32(i32 %e0, i32 %e1)
  ret i32 %umin0
}

define i32 @reduce_umin_16xi32_prefix5(ptr %p) {
; RV32-LABEL: reduce_umin_16xi32_prefix5:
; RV32:       # %bb.0:
; RV32-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
; RV32-NEXT:    vle32.v v8, (a0)
; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
; RV32-NEXT:    vmv.v.i v10, -1
; RV32-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
; RV32-NEXT:    vredminu.vs v8, v8, v10
; RV32-NEXT:    vmv.x.s a0, v8
; RV32-NEXT:    ret
;
; RV64-LABEL: reduce_umin_16xi32_prefix5:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
; RV64-NEXT:    vle32.v v8, (a0)
; RV64-NEXT:    li a0, -1
; RV64-NEXT:    vmv.s.x v10, a0
; RV64-NEXT:    vredminu.vs v8, v8, v10
; RV64-NEXT:    vmv.x.s a0, v8
; RV64-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %umin0 = call i32 @llvm.umin.i32(i32 %e0, i32 %e1)
  %umin1 = call i32 @llvm.umin.i32(i32 %umin0, i32 %e2)
  %umin2 = call i32 @llvm.umin.i32(i32 %umin1, i32 %e3)
  %umin3 = call i32 @llvm.umin.i32(i32 %umin2, i32 %e4)
  ret i32 %umin3
}
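
; For the fadd reductions the start value is -0.0 (lui a0, 524288, the bit
; pattern 0x80000000 as a float), the true identity for fadd; with 'fast'
; (which implies nsz) a +0.0 start as in the two-element case also works.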
define float @reduce_fadd_16xf32_prefix2(ptr %p) {
; CHECK-LABEL: reduce_fadd_16xf32_prefix2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vfredusum.vs v8, v8, v9
; CHECK-NEXT:    vfmv.f.s fa0, v8
; CHECK-NEXT:    ret
  %v = load <16 x float>, ptr %p, align 256
  %e0 = extractelement <16 x float> %v, i32 0
  %e1 = extractelement <16 x float> %v, i32 1
  %fadd0 = fadd fast float %e0, %e1
  ret float %fadd0
}

define float @reduce_fadd_16xf32_prefix5(ptr %p) {
; CHECK-LABEL: reduce_fadd_16xf32_prefix5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    lui a0, 524288
; CHECK-NEXT:    vmv.s.x v10, a0
; CHECK-NEXT:    vfredusum.vs v8, v8, v10
; CHECK-NEXT:    vfmv.f.s fa0, v8
; CHECK-NEXT:    ret
  %v = load <16 x float>, ptr %p, align 256
  %e0 = extractelement <16 x float> %v, i32 0
  %e1 = extractelement <16 x float> %v, i32 1
  %e2 = extractelement <16 x float> %v, i32 2
  %e3 = extractelement <16 x float> %v, i32 3
  %e4 = extractelement <16 x float> %v, i32 4
  %fadd0 = fadd fast float %e0, %e1
  %fadd1 = fadd fast float %fadd0, %e2
  %fadd2 = fadd fast float %fadd1, %e3
  %fadd3 = fadd fast float %fadd2, %e4
  ret float %fadd3
}

;; Corner case tests for fadd associativity

; Negative test - not associative. Would need a strict opcode.
define float @reduce_fadd_2xf32_non_associative(ptr %p) {
; CHECK-LABEL: reduce_fadd_2xf32_non_associative:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vfmv.f.s fa5, v8
; CHECK-NEXT:    vslidedown.vi v8, v8, 1
; CHECK-NEXT:    vfmv.f.s fa4, v8
; CHECK-NEXT:    fadd.s fa0, fa5, fa4
; CHECK-NEXT:    ret
  %v = load <2 x float>, ptr %p, align 256
  %e0 = extractelement <2 x float> %v, i32 0
  %e1 = extractelement <2 x float> %v, i32 1
  %fadd0 = fadd float %e0, %e1
  ret float %fadd0
}

; Positive test - minimal set of fast math flags
define float @reduce_fadd_2xf32_reassoc_only(ptr %p) {
; CHECK-LABEL: reduce_fadd_2xf32_reassoc_only:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    lui a0, 524288
; CHECK-NEXT:    vmv.s.x v9, a0
; CHECK-NEXT:    vfredusum.vs v8, v8, v9
; CHECK-NEXT:    vfmv.f.s fa0, v8
; CHECK-NEXT:    ret
  %v = load <2 x float>, ptr %p, align 256
  %e0 = extractelement <2 x float> %v, i32 0
  %e1 = extractelement <2 x float> %v, i32 1
  %fadd0 = fadd reassoc float %e0, %e1
  ret float %fadd0
}

; Negative test - wrong fast math flag.
define float @reduce_fadd_2xf32_ninf_only(ptr %p) {
; CHECK-LABEL: reduce_fadd_2xf32_ninf_only:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vfmv.f.s fa5, v8
; CHECK-NEXT:    vslidedown.vi v8, v8, 1
; CHECK-NEXT:    vfmv.f.s fa4, v8
; CHECK-NEXT:    fadd.s fa0, fa5, fa4
; CHECK-NEXT:    ret
  %v = load <2 x float>, ptr %p, align 256
  %e0 = extractelement <2 x float> %v, i32 0
  %e1 = extractelement <2 x float> %v, i32 1
  %fadd0 = fadd ninf float %e0, %e1
  ret float %fadd0
}

; Negative test - last fadd is not associative
define float @reduce_fadd_4xf32_non_associative(ptr %p) {
; CHECK-LABEL: reduce_fadd_4xf32_non_associative:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    lui a0, 524288
; CHECK-NEXT:    vmv.s.x v9, a0
; CHECK-NEXT:    vsetivli zero, 3, e32, m1, ta, ma
; CHECK-NEXT:    vfredusum.vs v9, v8, v9
; CHECK-NEXT:    vslidedown.vi v8, v8, 3
; CHECK-NEXT:    vfmv.f.s fa5, v8
; CHECK-NEXT:    vfmv.f.s fa4, v9
; CHECK-NEXT:    fadd.s fa0, fa4, fa5
; CHECK-NEXT:    ret
  %v = load <4 x float>, ptr %p, align 256
  %e0 = extractelement <4 x float> %v, i32 0
  %e1 = extractelement <4 x float> %v, i32 1
  %e2 = extractelement <4 x float> %v, i32 2
  %e3 = extractelement <4 x float> %v, i32 3
  %fadd0 = fadd fast float %e0, %e1
  %fadd1 = fadd fast float %fadd0, %e2
  %fadd2 = fadd float %fadd1, %e3
  ret float %fadd2
}

; Negative test - first fadd is not associative
; We could form a reduce for elements 2 and 3.
define float @reduce_fadd_4xf32_non_associative2(ptr %p) {
; CHECK-LABEL: reduce_fadd_4xf32_non_associative2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vfmv.f.s fa5, v8
; CHECK-NEXT:    vslidedown.vi v9, v8, 1
; CHECK-NEXT:    vfmv.f.s fa4, v9
; CHECK-NEXT:    vslidedown.vi v9, v8, 2
; CHECK-NEXT:    vslidedown.vi v8, v8, 3
; CHECK-NEXT:    vfmv.f.s fa3, v9
; CHECK-NEXT:    vfmv.f.s fa2, v8
; CHECK-NEXT:    fadd.s fa5, fa5, fa4
; CHECK-NEXT:    fadd.s fa4, fa3, fa2
; CHECK-NEXT:    fadd.s fa0, fa5, fa4
; CHECK-NEXT:    ret
  %v = load <4 x float>, ptr %p, align 256
  %e0 = extractelement <4 x float> %v, i32 0
  %e1 = extractelement <4 x float> %v, i32 1
  %e2 = extractelement <4 x float> %v, i32 2
  %e3 = extractelement <4 x float> %v, i32 3
  %fadd0 = fadd float %e0, %e1
  %fadd1 = fadd fast float %fadd0, %e2
  %fadd2 = fadd fast float %fadd1, %e3
  ret float %fadd2
}