########################################################################
#  Copyright(c) 2019 Arm Corporation All rights reserved.
#
#  Redistribution and use in source and binary forms, with or without
#  modification, are permitted provided that the following conditions
#  are met:
#    * Redistributions of source code must retain the above copyright
#      notice, this list of conditions and the following disclaimer.
#    * Redistributions in binary form must reproduce the above copyright
#      notice, this list of conditions and the following disclaimer in
#      the documentation and/or other materials provided with the
#      distribution.
#    * Neither the name of Arm Corporation nor the names of its
#      contributors may be used to endorse or promote products derived
#      from this software without specific prior written permission.
#
#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
#  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#########################################################################

.text

.global pq_check_neon
.type pq_check_neon, %function

/*
 * int pq_check_neon(int vects, int len, void **src)
 *
 * Target: AArch64 with Advanced SIMD, GNU assembler syntax.
 *
 * src is a table of `vects` buffer pointers.  src[0] .. src[vects-3] are
 * the data buffers, src[vects-2] is the stored P buffer and src[vects-1]
 * is the stored Q buffer.  The routine recomputes, 128 bytes at a time and
 * then 16 bytes at a time for the remainder:
 *   p = XOR of all data buffers
 *   q = starting from src[vects-3] and walking down to src[0]:
 *       q = (q * 2) ^ src[i], where "* 2" is a per-byte left shift by one
 *       with 0x1D XORed in whenever the shifted-out bit was set
 * and compares both against the stored P and Q bytes.
 *
 * In:    w0 = vects, w1 = len (expected to be a multiple of 16), x2 = src
 * Out:   w0 = 0 when every compared byte matches,
 *        w0 = 1 on the first mismatching chunk, or when vects < 3
 * Clobb: x3-x7, x9-x12, v0-v7, v16-v29, condition flags
 *        d8-d15 are used by the 128-byte path and are saved/restored there
 * Stack: 64 bytes, only while the 128-byte path is running
 */

/* arguments */
w_vects		.req	w0	/* values < 3 are rejected with return 1 */
x_vects		.req	x0	/* only read after a 32-bit write has zeroed its top half */
w_len		.req	w1	/* MUST be 16x bytes; always used as a 32-bit value */
x_src		.req	x2

/* returns */
w_ret		.req	w0

/* local variables */
x_dst_p		.req	x3
x_dst_q		.req	x4
x_dst_q_end	.req	x5
w_col		.req	w6
x_col		.req	x6
x_src_ptr	.req	x7
x_src_ptr_end	.req	x9
x_src_last	.req	x10
x_srcn		.req	x11
w_min		.req	w12
/* vectors */
/* v0  ~ v7 : temporary p */
/* v8  ~ v15: temporary q */
/* v16 ~ v23: next 128 bytes */
v_mask0		.req	v24
v_mask1		.req	v25
v_mask2		.req	v26
v_mask3		.req	v27
v_gf8poly	.req	v28
v_0x80		.req	v29

/*
 *                 src_ptr_end -->
 *       -------+----------+
 *            . |  src[0]  |
 *            . +----------+            +------------------+
 *   src_ptr -->|  src[1]  | - srcn ->  |      buffer      |
 *            . +----------+            +------------------+
 *            . |  ......  |
 *            . +----------+
 *            . | src[v-4] |
 *       -------+----------+  src_last  +------------------+
 *       src -->| src[v-3] | ---------> |      buffer      |
 *              +----------+            +------------------+
 *              | src[v-2] | - dst_p -> |      buffer      |
 *              +----------+            +------------------+
 *              | src[v-1] | - dst_q -> |      buffer      | dst_q_end
 *              +----------+            +------------------+
 */

pq_check_neon:
	/*
	 * With fewer than 3 vectors (vects - 3) would wrap and index far
	 * outside the pointer table; report failure instead.
	 */
	cmp	w_vects, #3
	blt	.Lerror

	/* sentinel one slot below src[0]: the downward walk stops here */
	sub	x_src_ptr_end, x_src, #8

	/* x_src = &src[vects - 3]; the 32-bit sub zero-extends into x_vects */
	sub	w_vects, w_vects, #3
	add	x_src, x_src, x_vects, lsl #3

	ldr	x_src_last, [x_src]
	ldp	x_dst_p, x_dst_q, [x_src, #8]

	/*
	 * len arrives as a 32-bit int and AAPCS64 leaves the upper half of
	 * x1 unspecified, so extend it explicitly rather than adding x1.
	 */
	add	x_dst_q_end, x_dst_q, w_len, uxtw

	mov	w_min, #-1
	mov	w_col, #0
	movi	v_gf8poly.16b, #0x1D
	movi	v_0x80.16b, #0x80

.Lloop128_init:
	/* less than 128 bytes? */
	cmp	w_len, #128
	blo	.Lloop16_init

	/* save d8 ~ d15 to stack */
	sub	sp, sp, #64
	stp	d8, d9, [sp]
	stp	d10, d11, [sp, #16]
	stp	d12, d13, [sp, #32]
	stp	d14, d15, [sp, #48]

	/* x_dst_q_end = last address at which a full 128-byte chunk may start */
	sub	x_dst_q_end, x_dst_q_end, #128

	/* batch process (vects-2)*128 bytes */
	/* v0~v7: p; v8~v15: q; v16~v23: in */
.Lloop128:
	ldr	q0, [x_src_last, #16*0]
	ldr	q1, [x_src_last, #16*1]
	ldr	q2, [x_src_last, #16*2]
	ldr	q3, [x_src_last, #16*3]
	ldr	q4, [x_src_last, #16*4]
	ldr	q5, [x_src_last, #16*5]
	ldr	q6, [x_src_last, #16*6]
	ldr	q7, [x_src_last, #16*7]
	add	x_src_last, x_src_last, #128

	/* p and q both start out as the last data buffer */
	mov	v8.16b, v0.16b
	mov	v9.16b, v1.16b
	mov	v10.16b, v2.16b
	mov	v11.16b, v3.16b
	mov	v12.16b, v4.16b
	mov	v13.16b, v5.16b
	mov	v14.16b, v6.16b
	mov	v15.16b, v7.16b

	/* only one data buffer: nothing more to fold in */
	cbz	w_vects, .Lloop128_vects_end

	sub	x_src_ptr, x_src, #8
.Lloop128_vects:
	ldr	x_srcn, [x_src_ptr], #-8
	add	x_srcn, x_srcn, x_col
	/* flags from this cmp feed the bne at the bottom of the loop;
	 * no instruction in between writes the condition flags */
	cmp	x_src_ptr, x_src_ptr_end

	ldr	q16, [x_srcn, #16*0]
	ldr	q17, [x_srcn, #16*1]
	ldr	q18, [x_srcn, #16*2]
	ldr	q19, [x_srcn, #16*3]
	ldr	q20, [x_srcn, #16*4]
	ldr	q21, [x_srcn, #16*5]
	ldr	q22, [x_srcn, #16*6]
	ldr	q23, [x_srcn, #16*7]

	/* p ^= in */
	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v17.16b
	eor	v2.16b, v2.16b, v18.16b
	eor	v3.16b, v3.16b, v19.16b
	eor	v4.16b, v4.16b, v20.16b
	eor	v5.16b, v5.16b, v21.16b
	eor	v6.16b, v6.16b, v22.16b
	eor	v7.16b, v7.16b, v23.16b

	/* q = (q << 1) ^ (q >= 0x80 ? 0x1D : 0) ^ in, lower four registers */
	cmhs	v_mask0.16b, v8.16b, v_0x80.16b
	cmhs	v_mask1.16b, v9.16b, v_0x80.16b
	cmhs	v_mask2.16b, v10.16b, v_0x80.16b
	cmhs	v_mask3.16b, v11.16b, v_0x80.16b
	and	v_mask0.16b, v_mask0.16b, v_gf8poly.16b
	and	v_mask1.16b, v_mask1.16b, v_gf8poly.16b
	and	v_mask2.16b, v_mask2.16b, v_gf8poly.16b
	and	v_mask3.16b, v_mask3.16b, v_gf8poly.16b
	shl	v8.16b, v8.16b, #1
	shl	v9.16b, v9.16b, #1
	shl	v10.16b, v10.16b, #1
	shl	v11.16b, v11.16b, #1
	eor	v8.16b, v8.16b, v_mask0.16b
	eor	v9.16b, v9.16b, v_mask1.16b
	eor	v10.16b, v10.16b, v_mask2.16b
	eor	v11.16b, v11.16b, v_mask3.16b
	eor	v8.16b, v8.16b, v16.16b
	eor	v9.16b, v9.16b, v17.16b
	eor	v10.16b, v10.16b, v18.16b
	eor	v11.16b, v11.16b, v19.16b

	/* same update for the upper four q registers */
	cmhs	v_mask0.16b, v12.16b, v_0x80.16b
	cmhs	v_mask1.16b, v13.16b, v_0x80.16b
	cmhs	v_mask2.16b, v14.16b, v_0x80.16b
	cmhs	v_mask3.16b, v15.16b, v_0x80.16b
	and	v_mask0.16b, v_mask0.16b, v_gf8poly.16b
	and	v_mask1.16b, v_mask1.16b, v_gf8poly.16b
	and	v_mask2.16b, v_mask2.16b, v_gf8poly.16b
	and	v_mask3.16b, v_mask3.16b, v_gf8poly.16b
	shl	v12.16b, v12.16b, #1
	shl	v13.16b, v13.16b, #1
	shl	v14.16b, v14.16b, #1
	shl	v15.16b, v15.16b, #1
	eor	v12.16b, v12.16b, v_mask0.16b
	eor	v13.16b, v13.16b, v_mask1.16b
	eor	v14.16b, v14.16b, v_mask2.16b
	eor	v15.16b, v15.16b, v_mask3.16b
	eor	v12.16b, v12.16b, v20.16b
	eor	v13.16b, v13.16b, v21.16b
	eor	v14.16b, v14.16b, v22.16b
	eor	v15.16b, v15.16b, v23.16b

	bne	.Lloop128_vects

.Lloop128_vects_end:
	/* v16~v23: true p, q */
	ldr	q16, [x_dst_p, #16*0]
	ldr	q17, [x_dst_p, #16*1]
	ldr	q18, [x_dst_p, #16*2]
	ldr	q19, [x_dst_p, #16*3]
	ldr	q20, [x_dst_p, #16*4]
	ldr	q21, [x_dst_p, #16*5]
	ldr	q22, [x_dst_p, #16*6]
	ldr	q23, [x_dst_p, #16*7]

	/* each byte becomes 0xFF where computed p equals stored p, else 0x00 */
	cmeq	v0.16b, v0.16b, v16.16b
	cmeq	v1.16b, v1.16b, v17.16b
	cmeq	v2.16b, v2.16b, v18.16b
	cmeq	v3.16b, v3.16b, v19.16b
	cmeq	v4.16b, v4.16b, v20.16b
	cmeq	v5.16b, v5.16b, v21.16b
	cmeq	v6.16b, v6.16b, v22.16b
	cmeq	v7.16b, v7.16b, v23.16b

	ldr	q16, [x_dst_q, #16*0]
	ldr	q17, [x_dst_q, #16*1]
	ldr	q18, [x_dst_q, #16*2]
	ldr	q19, [x_dst_q, #16*3]
	ldr	q20, [x_dst_q, #16*4]
	ldr	q21, [x_dst_q, #16*5]
	ldr	q22, [x_dst_q, #16*6]
	ldr	q23, [x_dst_q, #16*7]

	/* fold the eight p comparison results into v0 */
	and	v0.16b, v0.16b, v1.16b
	and	v2.16b, v2.16b, v3.16b
	and	v4.16b, v4.16b, v5.16b
	and	v6.16b, v6.16b, v7.16b
	and	v0.16b, v0.16b, v2.16b
	and	v4.16b, v4.16b, v6.16b
	and	v0.16b, v0.16b, v4.16b

	cmeq	v8.16b, v8.16b, v16.16b
	cmeq	v9.16b, v9.16b, v17.16b
	cmeq	v10.16b, v10.16b, v18.16b
	cmeq	v11.16b, v11.16b, v19.16b
	cmeq	v12.16b, v12.16b, v20.16b
	cmeq	v13.16b, v13.16b, v21.16b
	cmeq	v14.16b, v14.16b, v22.16b
	cmeq	v15.16b, v15.16b, v23.16b

	/* fold the eight q comparison results into v8 */
	and	v8.16b, v8.16b, v9.16b
	and	v10.16b, v10.16b, v11.16b
	and	v12.16b, v12.16b, v13.16b
	and	v14.16b, v14.16b, v15.16b
	and	v8.16b, v8.16b, v10.16b
	and	v12.16b, v12.16b, v14.16b
	and	v8.16b, v8.16b, v12.16b

	and	v0.16b, v0.16b, v8.16b

	/* w_min == 0 iff at least one byte of this chunk mismatched */
	uminv	b0, v0.16b
	umov	w_min, v0.b[0]
	cbz	w_min, .Lloop128_end

	add	x_dst_p, x_dst_p, #128
	add	x_dst_q, x_dst_q, #128
	cmp	x_dst_q, x_dst_q_end
	add	w_col, w_col, #128	/* add (not adds): flags from cmp survive */
	bls	.Lloop128

.Lloop128_end:
	/* restore d8 ~ d15 */
	ldp	d8, d9, [sp]
	ldp	d10, d11, [sp, #16]
	ldp	d12, d13, [sp, #32]
	ldp	d14, d15, [sp, #48]
	add	sp, sp, #64

	/* a mismatch left the loop early; report it now the stack is restored */
	cbz	w_min, .Lerror

	/* undo the -128 bias so x_dst_q_end is the true end of Q again */
	add	x_dst_q_end, x_dst_q_end, #128

.Lloop16_init:
	/* nothing left when len is a multiple of 128 */
	tst	w_len, #0x7F
	beq	.Lloop16_end
	sub	x_dst_q_end, x_dst_q_end, #16

	/* batch process (vects-2)*16 bytes */
	/* v0: p; v1: q; v2: in; v3: mask */
.Lloop16:
	ldr	q0, [x_src_last], #16
	mov	v1.16b, v0.16b

	cbz	w_vects, .Lloop16_vects_end

	sub	x_src_ptr, x_src, #8
.Lloop16_vects:
	ldr	x_srcn, [x_src_ptr], #-8
	ldr	q2, [x_srcn, x_col]
	cmp	x_src_ptr, x_src_ptr_end	/* consumed by bne below */

	eor	v0.16b, v0.16b, v2.16b

	cmhs	v3.16b, v1.16b, v_0x80.16b
	and	v3.16b, v3.16b, v_gf8poly.16b

	shl	v1.16b, v1.16b, #1
	eor	v1.16b, v1.16b, v2.16b
	eor	v1.16b, v1.16b, v3.16b

	bne	.Lloop16_vects

.Lloop16_vects_end:
	/* v4: true p; v5: true q */
	ldr	q4, [x_dst_p], #16
	ldr	q5, [x_dst_q], #16
	cmp	x_dst_q, x_dst_q_end		/* consumed by bls below */

	cmeq	v0.16b, v0.16b, v4.16b
	cmeq	v1.16b, v1.16b, v5.16b
	and	v0.16b, v0.16b, v1.16b

	uminv	b0, v0.16b
	umov	w_min, v0.b[0]
	cbz	w_min, .Lerror

	add	w_col, w_col, #16
	bls	.Lloop16

.Lloop16_end:
	mov	w_ret, #0
	ret

.Lerror:
	mov	w_ret, #1
	ret