; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc < %s -mtriple=riscv32 -mattr=+v | FileCheck %s
; RUN: llc < %s -mtriple=riscv64 -mattr=+v | FileCheck %s

define signext i16 @sad_4x8_as_i16(<4 x i8> %a, <4 x i8> %b) {
; CHECK-LABEL: sad_4x8_as_i16:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
; CHECK-NEXT:    vminu.vv v10, v8, v9
; CHECK-NEXT:    vmaxu.vv v8, v8, v9
; CHECK-NEXT:    vsub.vv v8, v8, v10
; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
; CHECK-NEXT:    vwredsumu.vs v8, v8, v9
; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
entry:
  %1 = zext <4 x i8> %a to <4 x i16>
  %3 = zext <4 x i8> %b to <4 x i16>
  %4 = sub nsw <4 x i16> %1, %3
  %5 = tail call <4 x i16> @llvm.abs.v4i16(<4 x i16> %4, i1 true)
  %6 = tail call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %5)
  ret i16 %6
}

define signext i32 @sad_4x8_as_i32(<4 x i8> %a, <4 x i8> %b) {
; CHECK-LABEL: sad_4x8_as_i32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
; CHECK-NEXT:    vminu.vv v10, v8, v9
; CHECK-NEXT:    vmaxu.vv v8, v8, v9
; CHECK-NEXT:    vsub.vv v8, v8, v10
; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
; CHECK-NEXT:    vzext.vf4 v9, v8
; CHECK-NEXT:    vmv.s.x v8, zero
; CHECK-NEXT:    vredsum.vs v8, v9, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
entry:
  %1 = zext <4 x i8> %a to <4 x i32>
  %3 = zext <4 x i8> %b to <4 x i32>
  %4 = sub nsw <4 x i32> %1, %3
  %5 = tail call <4 x i32> @llvm.abs.v4i32(<4 x i32> %4, i1 true)
  %6 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %5)
  ret i32 %6
}
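
; The same abs-difference pattern at 16 x i8. Per the checks above and below,
; the i16 reduction still maps onto a single widening vwredsumu.vs, while the
; i32 reduction zero-extends with vzext.vf4 before an ordinary vredsum.vs.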
define signext i16 @sad_16x8_as_i16(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: sad_16x8_as_i16:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
; CHECK-NEXT:    vminu.vv v10, v8, v9
; CHECK-NEXT:    vmaxu.vv v8, v8, v9
; CHECK-NEXT:    vsub.vv v8, v8, v10
; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
; CHECK-NEXT:    vwredsumu.vs v8, v8, v9
; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
entry:
  %1 = zext <16 x i8> %a to <16 x i16>
  %3 = zext <16 x i8> %b to <16 x i16>
  %4 = sub nsw <16 x i16> %1, %3
  %5 = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> %4, i1 true)
  %6 = tail call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %5)
  ret i16 %6
}

define signext i32 @sad_16x8_as_i32(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: sad_16x8_as_i32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
; CHECK-NEXT:    vminu.vv v10, v8, v9
; CHECK-NEXT:    vmaxu.vv v8, v8, v9
; CHECK-NEXT:    vsub.vv v8, v8, v10
; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
; CHECK-NEXT:    vzext.vf4 v12, v8
; CHECK-NEXT:    vmv.s.x v8, zero
; CHECK-NEXT:    vredsum.vs v8, v12, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
entry:
  %1 = zext <16 x i8> %a to <16 x i32>
  %3 = zext <16 x i8> %b to <16 x i32>
  %4 = sub nsw <16 x i32> %1, %3
  %5 = tail call <16 x i32> @llvm.abs.v16i32(<16 x i32> %4, i1 true)
  %6 = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %5)
  ret i32 %6
}
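
; SAD over four rows of two 16-byte blocks, stepping each base pointer by its
; stride between rows; the four per-row reductions are accumulated into a
; single i32 result.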
define signext i32 @sad_2block_16xi8_as_i32(ptr %a, ptr %b, i32 signext %stridea, i32 signext %strideb) {
; CHECK-LABEL: sad_2block_16xi8_as_i32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
; CHECK-NEXT:    vle8.v v8, (a0)
; CHECK-NEXT:    vle8.v v9, (a1)
; CHECK-NEXT:    add a0, a0, a2
; CHECK-NEXT:    add a1, a1, a3
; CHECK-NEXT:    vle8.v v10, (a0)
; CHECK-NEXT:    vle8.v v11, (a1)
; CHECK-NEXT:    add a0, a0, a2
; CHECK-NEXT:    add a1, a1, a3
; CHECK-NEXT:    vle8.v v12, (a0)
; CHECK-NEXT:    vle8.v v13, (a1)
; CHECK-NEXT:    add a0, a0, a2
; CHECK-NEXT:    add a1, a1, a3
; CHECK-NEXT:    vminu.vv v14, v8, v9
; CHECK-NEXT:    vmaxu.vv v8, v8, v9
; CHECK-NEXT:    vle8.v v9, (a0)
; CHECK-NEXT:    vsub.vv v8, v8, v14
; CHECK-NEXT:    vminu.vv v14, v10, v11
; CHECK-NEXT:    vmaxu.vv v10, v10, v11
; CHECK-NEXT:    vle8.v v11, (a1)
; CHECK-NEXT:    vsub.vv v10, v10, v14
; CHECK-NEXT:    vminu.vv v14, v12, v13
; CHECK-NEXT:    vmaxu.vv v12, v12, v13
; CHECK-NEXT:    vwaddu.vv v16, v10, v8
; CHECK-NEXT:    vsub.vv v8, v12, v14
; CHECK-NEXT:    vminu.vv v10, v9, v11
; CHECK-NEXT:    vmaxu.vv v9, v9, v11
; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
; CHECK-NEXT:    vzext.vf2 v12, v8
; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
; CHECK-NEXT:    vsub.vv v8, v9, v10
; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
; CHECK-NEXT:    vwaddu.vv v20, v12, v16
; CHECK-NEXT:    vzext.vf2 v10, v8
; CHECK-NEXT:    vwaddu.wv v20, v20, v10
; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
; CHECK-NEXT:    vmv.s.x v8, zero
; CHECK-NEXT:    vredsum.vs v8, v20, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
entry:
  %idx.ext8 = sext i32 %strideb to i64
  %idx.ext = sext i32 %stridea to i64
  %0 = load <16 x i8>, ptr %a, align 1
  %1 = zext <16 x i8> %0 to <16 x i32>
  %2 = load <16 x i8>, ptr %b, align 1
  %3 = zext <16 x i8> %2 to <16 x i32>
  %4 = sub nsw <16 x i32> %1, %3
  %5 = tail call <16 x i32> @llvm.abs.v16i32(<16 x i32> %4, i1 true)
  %6 = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %5)
  %add.ptr = getelementptr inbounds i8, ptr %a, i64 %idx.ext
  %add.ptr9 = getelementptr inbounds i8, ptr %b, i64 %idx.ext8
  %7 = load <16 x i8>, ptr %add.ptr, align 1
  %8 = zext <16 x i8> %7 to <16 x i32>
  %9 = load <16 x i8>, ptr %add.ptr9, align 1
  %10 = zext <16 x i8> %9 to <16 x i32>
  %11 = sub nsw <16 x i32> %8, %10
  %12 = tail call <16 x i32> @llvm.abs.v16i32(<16 x i32> %11, i1 true)
  %13 = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %12)
  %op.rdx.1 = add i32 %13, %6
  %add.ptr.1 = getelementptr inbounds i8, ptr %add.ptr, i64 %idx.ext
  %add.ptr9.1 = getelementptr inbounds i8, ptr %add.ptr9, i64 %idx.ext8
  %14 = load <16 x i8>, ptr %add.ptr.1, align 1
  %15 = zext <16 x i8> %14 to <16 x i32>
  %16 = load <16 x i8>, ptr %add.ptr9.1, align 1
  %17 = zext <16 x i8> %16 to <16 x i32>
  %18 = sub nsw <16 x i32> %15, %17
  %19 = tail call <16 x i32> @llvm.abs.v16i32(<16 x i32> %18, i1 true)
  %20 = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %19)
  %op.rdx.2 = add i32 %20, %op.rdx.1
  %add.ptr.2 = getelementptr inbounds i8, ptr %add.ptr.1, i64 %idx.ext
  %add.ptr9.2 = getelementptr inbounds i8, ptr %add.ptr9.1, i64 %idx.ext8
  %21 = load <16 x i8>, ptr %add.ptr.2, align 1
  %22 = zext <16 x i8> %21 to <16 x i32>
  %23 = load <16 x i8>, ptr %add.ptr9.2, align 1
  %24 = zext <16 x i8> %23 to <16 x i32>
  %25 = sub nsw <16 x i32> %22, %24
  %26 = tail call <16 x i32> @llvm.abs.v16i32(<16 x i32> %25, i1 true)
  %27 = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %26)
  %op.rdx.3 = add i32 %27, %op.rdx.2
  ret i32 %op.rdx.3
}

declare <4 x i32> @llvm.abs.v4i32(<4 x i32>, i1)
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare <4 x i16> @llvm.abs.v4i16(<4 x i16>, i1)
declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)

declare <16 x i32> @llvm.abs.v16i32(<16 x i32>, i1)
declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
declare <16 x i16> @llvm.abs.v16i16(<16 x i16>, i1)
declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)