; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc < %s -mtriple=riscv32 -mattr=+v | FileCheck %s
; RUN: llc < %s -mtriple=riscv64 -mattr=+v | FileCheck %s

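; These tests exercise sum-of-absolute-differences (SAD) style patterns: both
; inputs are zero-extended, subtracted, passed through llvm.abs, and summed
; with llvm.vector.reduce.add. The CHECK lines verify that the RVV lowering
; uses an unsigned min/max/subtract absolute-difference idiom followed by a
; (widening) reduction.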
define signext i16 @sad_4x8_as_i16(<4 x i8> %a, <4 x i8> %b) {
; CHECK-LABEL: sad_4x8_as_i16:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
; CHECK-NEXT:    vminu.vv v10, v8, v9
; CHECK-NEXT:    vmaxu.vv v8, v8, v9
; CHECK-NEXT:    vsub.vv v8, v8, v10
; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
; CHECK-NEXT:    vwredsumu.vs v8, v8, v9
; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
entry:
  %1 = zext <4 x i8> %a to <4 x i16>
  %3 = zext <4 x i8> %b to <4 x i16>
  %4 = sub nsw <4 x i16> %1, %3
  %5 = tail call <4 x i16> @llvm.abs.v4i16(<4 x i16> %4, i1 true)
  %6 = tail call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %5)
  ret i16 %6
}

define signext i32 @sad_4x8_as_i32(<4 x i8> %a, <4 x i8> %b) {
; CHECK-LABEL: sad_4x8_as_i32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
; CHECK-NEXT:    vminu.vv v10, v8, v9
; CHECK-NEXT:    vmaxu.vv v8, v8, v9
; CHECK-NEXT:    vsub.vv v8, v8, v10
; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
; CHECK-NEXT:    vzext.vf4 v9, v8
; CHECK-NEXT:    vmv.s.x v8, zero
; CHECK-NEXT:    vredsum.vs v8, v9, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
entry:
  %1 = zext <4 x i8> %a to <4 x i32>
  %3 = zext <4 x i8> %b to <4 x i32>
  %4 = sub nsw <4 x i32> %1, %3
  %5 = tail call <4 x i32> @llvm.abs.v4i32(<4 x i32> %4, i1 true)
  %6 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %5)
  ret i32 %6
}

define signext i16 @sad_16x8_as_i16(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: sad_16x8_as_i16:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
; CHECK-NEXT:    vminu.vv v10, v8, v9
; CHECK-NEXT:    vmaxu.vv v8, v8, v9
; CHECK-NEXT:    vsub.vv v8, v8, v10
; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
; CHECK-NEXT:    vwredsumu.vs v8, v8, v9
; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
entry:
  %1 = zext <16 x i8> %a to <16 x i16>
  %3 = zext <16 x i8> %b to <16 x i16>
  %4 = sub nsw <16 x i16> %1, %3
  %5 = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> %4, i1 true)
  %6 = tail call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %5)
  ret i16 %6
}

define signext i32 @sad_16x8_as_i32(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: sad_16x8_as_i32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
; CHECK-NEXT:    vminu.vv v10, v8, v9
; CHECK-NEXT:    vmaxu.vv v8, v8, v9
; CHECK-NEXT:    vsub.vv v8, v8, v10
; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
; CHECK-NEXT:    vzext.vf4 v12, v8
; CHECK-NEXT:    vmv.s.x v8, zero
; CHECK-NEXT:    vredsum.vs v8, v12, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
entry:
  %1 = zext <16 x i8> %a to <16 x i32>
  %3 = zext <16 x i8> %b to <16 x i32>
  %4 = sub nsw <16 x i32> %1, %3
  %5 = tail call <16 x i32> @llvm.abs.v16i32(<16 x i32> %4, i1 true)
  %6 = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %5)
  ret i32 %6
}

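; A larger case: four strided rows of 16 bytes are loaded from %a and %b, the
; SAD of each row pair is computed, and the per-row results are accumulated
; into a single i32.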
define signext i32 @sad_2block_16xi8_as_i32(ptr %a, ptr %b, i32 signext %stridea, i32 signext %strideb) {
; CHECK-LABEL: sad_2block_16xi8_as_i32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
; CHECK-NEXT:    vle8.v v8, (a0)
; CHECK-NEXT:    vle8.v v9, (a1)
; CHECK-NEXT:    add a0, a0, a2
; CHECK-NEXT:    add a1, a1, a3
; CHECK-NEXT:    vle8.v v10, (a0)
; CHECK-NEXT:    vle8.v v11, (a1)
; CHECK-NEXT:    add a0, a0, a2
; CHECK-NEXT:    add a1, a1, a3
; CHECK-NEXT:    vle8.v v12, (a0)
; CHECK-NEXT:    vle8.v v13, (a1)
; CHECK-NEXT:    add a0, a0, a2
; CHECK-NEXT:    add a1, a1, a3
; CHECK-NEXT:    vminu.vv v14, v8, v9
; CHECK-NEXT:    vmaxu.vv v8, v8, v9
; CHECK-NEXT:    vle8.v v9, (a0)
; CHECK-NEXT:    vsub.vv v8, v8, v14
; CHECK-NEXT:    vminu.vv v14, v10, v11
; CHECK-NEXT:    vmaxu.vv v10, v10, v11
; CHECK-NEXT:    vle8.v v11, (a1)
; CHECK-NEXT:    vsub.vv v10, v10, v14
; CHECK-NEXT:    vminu.vv v14, v12, v13
; CHECK-NEXT:    vmaxu.vv v12, v12, v13
; CHECK-NEXT:    vwaddu.vv v16, v10, v8
; CHECK-NEXT:    vsub.vv v8, v12, v14
; CHECK-NEXT:    vminu.vv v10, v9, v11
; CHECK-NEXT:    vmaxu.vv v9, v9, v11
; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
; CHECK-NEXT:    vzext.vf2 v12, v8
; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
; CHECK-NEXT:    vsub.vv v8, v9, v10
; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
; CHECK-NEXT:    vwaddu.vv v20, v12, v16
; CHECK-NEXT:    vzext.vf2 v10, v8
; CHECK-NEXT:    vwaddu.wv v20, v20, v10
; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
; CHECK-NEXT:    vmv.s.x v8, zero
; CHECK-NEXT:    vredsum.vs v8, v20, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
entry:
  %idx.ext8 = sext i32 %strideb to i64
  %idx.ext = sext i32 %stridea to i64
  %0 = load <16 x i8>, ptr %a, align 1
  %1 = zext <16 x i8> %0 to <16 x i32>
  %2 = load <16 x i8>, ptr %b, align 1
  %3 = zext <16 x i8> %2 to <16 x i32>
  %4 = sub nsw <16 x i32> %1, %3
  %5 = tail call <16 x i32> @llvm.abs.v16i32(<16 x i32> %4, i1 true)
  %6 = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %5)
  %add.ptr = getelementptr inbounds i8, ptr %a, i64 %idx.ext
  %add.ptr9 = getelementptr inbounds i8, ptr %b, i64 %idx.ext8
  %7 = load <16 x i8>, ptr %add.ptr, align 1
  %8 = zext <16 x i8> %7 to <16 x i32>
  %9 = load <16 x i8>, ptr %add.ptr9, align 1
  %10 = zext <16 x i8> %9 to <16 x i32>
  %11 = sub nsw <16 x i32> %8, %10
  %12 = tail call <16 x i32> @llvm.abs.v16i32(<16 x i32> %11, i1 true)
  %13 = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %12)
  %op.rdx.1 = add i32 %13, %6
  %add.ptr.1 = getelementptr inbounds i8, ptr %add.ptr, i64 %idx.ext
  %add.ptr9.1 = getelementptr inbounds i8, ptr %add.ptr9, i64 %idx.ext8
  %14 = load <16 x i8>, ptr %add.ptr.1, align 1
  %15 = zext <16 x i8> %14 to <16 x i32>
  %16 = load <16 x i8>, ptr %add.ptr9.1, align 1
  %17 = zext <16 x i8> %16 to <16 x i32>
  %18 = sub nsw <16 x i32> %15, %17
  %19 = tail call <16 x i32> @llvm.abs.v16i32(<16 x i32> %18, i1 true)
  %20 = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %19)
  %op.rdx.2 = add i32 %20, %op.rdx.1
  %add.ptr.2 = getelementptr inbounds i8, ptr %add.ptr.1, i64 %idx.ext
  %add.ptr9.2 = getelementptr inbounds i8, ptr %add.ptr9.1, i64 %idx.ext8
  %21 = load <16 x i8>, ptr %add.ptr.2, align 1
  %22 = zext <16 x i8> %21 to <16 x i32>
  %23 = load <16 x i8>, ptr %add.ptr9.2, align 1
  %24 = zext <16 x i8> %23 to <16 x i32>
  %25 = sub nsw <16 x i32> %22, %24
  %26 = tail call <16 x i32> @llvm.abs.v16i32(<16 x i32> %25, i1 true)
  %27 = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %26)
  %op.rdx.3 = add i32 %27, %op.rdx.2
  ret i32 %op.rdx.3
}

declare <4 x i32> @llvm.abs.v4i32(<4 x i32>, i1)
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare <4 x i16> @llvm.abs.v4i16(<4 x i16>, i1)
declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)

declare <16 x i32> @llvm.abs.v16i32(<16 x i32>, i1)
declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
declare <16 x i16> @llvm.abs.v16i16(<16 x i16>, i1)
declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
