; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc < %s -mtriple=riscv32 -mattr=+v | FileCheck %s
; RUN: llc < %s -mtriple=riscv64 -mattr=+v | FileCheck %s
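
; These tests cover sum-of-absolute-differences (SAD) style patterns: each
; function computes reduce.add(abs(zext(a) - zext(b))) over i8 elements.
; The CHECK lines verify that the absolute difference is lowered without a
; compare, as vmaxu - vminu (vminu.vv/vmaxu.vv/vsub.vv), and that the final
; sum uses a plain (vredsum.vs) or widening (vwredsumu.vs) reduction
; depending on the accumulator width.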

define signext i16 @sad_4x8_as_i16(<4 x i8> %a, <4 x i8> %b) {
; CHECK-LABEL: sad_4x8_as_i16:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
; CHECK-NEXT:    vminu.vv v10, v8, v9
; CHECK-NEXT:    vmaxu.vv v8, v8, v9
; CHECK-NEXT:    vsub.vv v8, v8, v10
; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
; CHECK-NEXT:    vwredsumu.vs v8, v8, v9
; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
entry:
  %1 = zext <4 x i8> %a to <4 x i16>
  %3 = zext <4 x i8> %b to <4 x i16>
  %4 = sub nsw <4 x i16> %1, %3
  %5 = tail call <4 x i16> @llvm.abs.v4i16(<4 x i16> %4, i1 true)
  %6 = tail call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %5)
  ret i16 %6
}

define signext i32 @sad_4x8_as_i32(<4 x i8> %a, <4 x i8> %b) {
; CHECK-LABEL: sad_4x8_as_i32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
; CHECK-NEXT:    vminu.vv v10, v8, v9
; CHECK-NEXT:    vmaxu.vv v8, v8, v9
; CHECK-NEXT:    vsub.vv v8, v8, v10
; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
; CHECK-NEXT:    vzext.vf4 v9, v8
; CHECK-NEXT:    vmv.s.x v8, zero
; CHECK-NEXT:    vredsum.vs v8, v9, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
entry:
  %1 = zext <4 x i8> %a to <4 x i32>
  %3 = zext <4 x i8> %b to <4 x i32>
  %4 = sub nsw <4 x i32> %1, %3
  %5 = tail call <4 x i32> @llvm.abs.v4i32(<4 x i32> %4, i1 true)
  %6 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %5)
  ret i32 %6
}

define signext i16 @sad_16x8_as_i16(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: sad_16x8_as_i16:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
; CHECK-NEXT:    vminu.vv v10, v8, v9
; CHECK-NEXT:    vmaxu.vv v8, v8, v9
; CHECK-NEXT:    vsub.vv v8, v8, v10
; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
; CHECK-NEXT:    vwredsumu.vs v8, v8, v9
; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
entry:
  %1 = zext <16 x i8> %a to <16 x i16>
  %3 = zext <16 x i8> %b to <16 x i16>
  %4 = sub nsw <16 x i16> %1, %3
  %5 = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> %4, i1 true)
  %6 = tail call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %5)
  ret i16 %6
}

define signext i32 @sad_16x8_as_i32(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: sad_16x8_as_i32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
; CHECK-NEXT:    vminu.vv v10, v8, v9
; CHECK-NEXT:    vmaxu.vv v8, v8, v9
; CHECK-NEXT:    vsub.vv v8, v8, v10
; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
; CHECK-NEXT:    vzext.vf4 v12, v8
; CHECK-NEXT:    vmv.s.x v8, zero
; CHECK-NEXT:    vredsum.vs v8, v12, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
entry:
  %1 = zext <16 x i8> %a to <16 x i32>
  %3 = zext <16 x i8> %b to <16 x i32>
  %4 = sub nsw <16 x i32> %1, %3
  %5 = tail call <16 x i32> @llvm.abs.v16i32(<16 x i32> %4, i1 true)
  %6 = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %5)
  ret i32 %6
}
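
; Four 16-byte rows per input, each %stridea/%strideb bytes apart, with the
; per-row SADs summed into one i32. The CHECK lines verify that the four
; absolute-difference vectors are combined with widening adds (vwaddu) so
; that only a single vredsum.vs is needed at the end.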
define signext i32 @sad_2block_16xi8_as_i32(ptr %a, ptr %b, i32 signext %stridea, i32 signext %strideb) {
; CHECK-LABEL: sad_2block_16xi8_as_i32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
; CHECK-NEXT:    vle8.v v8, (a0)
; CHECK-NEXT:    vle8.v v9, (a1)
; CHECK-NEXT:    add a0, a0, a2
; CHECK-NEXT:    add a1, a1, a3
; CHECK-NEXT:    vle8.v v10, (a0)
; CHECK-NEXT:    vle8.v v11, (a1)
; CHECK-NEXT:    add a0, a0, a2
; CHECK-NEXT:    add a1, a1, a3
; CHECK-NEXT:    vle8.v v12, (a0)
; CHECK-NEXT:    vle8.v v13, (a1)
; CHECK-NEXT:    add a0, a0, a2
; CHECK-NEXT:    add a1, a1, a3
; CHECK-NEXT:    vminu.vv v14, v8, v9
; CHECK-NEXT:    vmaxu.vv v8, v8, v9
; CHECK-NEXT:    vle8.v v9, (a0)
; CHECK-NEXT:    vsub.vv v8, v8, v14
; CHECK-NEXT:    vminu.vv v14, v10, v11
; CHECK-NEXT:    vmaxu.vv v10, v10, v11
; CHECK-NEXT:    vle8.v v11, (a1)
; CHECK-NEXT:    vsub.vv v10, v10, v14
; CHECK-NEXT:    vminu.vv v14, v12, v13
; CHECK-NEXT:    vmaxu.vv v12, v12, v13
; CHECK-NEXT:    vwaddu.vv v16, v10, v8
; CHECK-NEXT:    vsub.vv v8, v12, v14
; CHECK-NEXT:    vminu.vv v10, v9, v11
; CHECK-NEXT:    vmaxu.vv v9, v9, v11
; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
; CHECK-NEXT:    vzext.vf2 v12, v8
; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
; CHECK-NEXT:    vsub.vv v8, v9, v10
; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
; CHECK-NEXT:    vwaddu.vv v20, v12, v16
; CHECK-NEXT:    vzext.vf2 v10, v8
; CHECK-NEXT:    vwaddu.wv v20, v20, v10
; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
; CHECK-NEXT:    vmv.s.x v8, zero
; CHECK-NEXT:    vredsum.vs v8, v20, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
entry:
  %idx.ext8 = sext i32 %strideb to i64
  %idx.ext = sext i32 %stridea to i64
  %0 = load <16 x i8>, ptr %a, align 1
  %1 = zext <16 x i8> %0 to <16 x i32>
  %2 = load <16 x i8>, ptr %b, align 1
  %3 = zext <16 x i8> %2 to <16 x i32>
  %4 = sub nsw <16 x i32> %1, %3
  %5 = tail call <16 x i32> @llvm.abs.v16i32(<16 x i32> %4, i1 true)
  %6 = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %5)
  %add.ptr = getelementptr inbounds i8, ptr %a, i64 %idx.ext
  %add.ptr9 = getelementptr inbounds i8, ptr %b, i64 %idx.ext8
  %7 = load <16 x i8>, ptr %add.ptr, align 1
  %8 = zext <16 x i8> %7 to <16 x i32>
  %9 = load <16 x i8>, ptr %add.ptr9, align 1
  %10 = zext <16 x i8> %9 to <16 x i32>
  %11 = sub nsw <16 x i32> %8, %10
  %12 = tail call <16 x i32> @llvm.abs.v16i32(<16 x i32> %11, i1 true)
  %13 = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %12)
  %op.rdx.1 = add i32 %13, %6
  %add.ptr.1 = getelementptr inbounds i8, ptr %add.ptr, i64 %idx.ext
  %add.ptr9.1 = getelementptr inbounds i8, ptr %add.ptr9, i64 %idx.ext8
  %14 = load <16 x i8>, ptr %add.ptr.1, align 1
  %15 = zext <16 x i8> %14 to <16 x i32>
  %16 = load <16 x i8>, ptr %add.ptr9.1, align 1
  %17 = zext <16 x i8> %16 to <16 x i32>
  %18 = sub nsw <16 x i32> %15, %17
  %19 = tail call <16 x i32> @llvm.abs.v16i32(<16 x i32> %18, i1 true)
  %20 = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %19)
  %op.rdx.2 = add i32 %20, %op.rdx.1
  %add.ptr.2 = getelementptr inbounds i8, ptr %add.ptr.1, i64 %idx.ext
  %add.ptr9.2 = getelementptr inbounds i8, ptr %add.ptr9.1, i64 %idx.ext8
  %21 = load <16 x i8>, ptr %add.ptr.2, align 1
  %22 = zext <16 x i8> %21 to <16 x i32>
  %23 = load <16 x i8>, ptr %add.ptr9.2, align 1
  %24 = zext <16 x i8> %23 to <16 x i32>
  %25 = sub nsw <16 x i32> %22, %24
  %26 = tail call <16 x i32> @llvm.abs.v16i32(<16 x i32> %25, i1 true)
  %27 = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %26)
  %op.rdx.3 = add i32 %27, %op.rdx.2
  ret i32 %op.rdx.3
}

declare <4 x i32> @llvm.abs.v4i32(<4 x i32>, i1)
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare <4 x i16> @llvm.abs.v4i16(<4 x i16>, i1)
declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)

declare <16 x i32> @llvm.abs.v16i32(<16 x i32>, i1)
declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
declare <16 x i16> @llvm.abs.v16i16(<16 x i16>, i1)
declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)