########################################################################
#  Copyright(c) 2019 Arm Corporation All rights reserved.
#
#  Redistribution and use in source and binary forms, with or without
#  modification, are permitted provided that the following conditions
#  are met:
#    * Redistributions of source code must retain the above copyright
#      notice, this list of conditions and the following disclaimer.
#    * Redistributions in binary form must reproduce the above copyright
#      notice, this list of conditions and the following disclaimer in
#      the documentation and/or other materials provided with the
#      distribution.
#    * Neither the name of Arm Corporation nor the names of its
#      contributors may be used to endorse or promote products derived
#      from this software without specific prior written permission.
#
#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
#  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#########################################################################

#include "../include/aarch64_label.h"

.text

.global cdecl(xor_gen_neon)
#ifndef __APPLE__
.type xor_gen_neon, %function
#endif

/*
 * int xor_gen_neon(int vects, int len, void **src)
 *
 * Target: AArch64, GNU as syntax, AAPCS64 calling convention.
 *
 * XORs the buffers src[0] .. src[vects-2] together, byte by byte, and
 * stores the result into the buffer src[vects-1].  The work is done in
 * three passes over the column range [0, len):
 *   1. 256-byte chunks (16 NEON q registers per source buffer),
 *   2. 16-byte chunks  (one q register),
 *   3. single bytes.
 *
 * In:    w0 = vects  (MUST be >= 2; smaller values are not checked)
 *        w1 = len    (byte count per buffer)
 *        x2 = src    (array of vects buffer pointers)
 * Out:   w0 = 0      (always)
 * Clobb: x0-x7, x9-x11, v0-v7, v16-v31, condition flags.
 *        d8-d15 are spilled to the stack and restored around the
 *        256-byte pass, as AAPCS64 requires for the low halves of
 *        v8-v15.
 *
 * vects and len are C ints: only the W views of x0/x1 are meaningful on
 * entry (AAPCS64 leaves the upper 32 bits unspecified), so every use in
 * address arithmetic goes through an explicit sxtw extension.
 */

/* arguments */
w_vects		.req	w0	/* MUST >= 2 */
w_len		.req	w1
x_src		.req	x2

/* returns */
w_ret		.req	w0

/* local variables */
w_in		.req	w1	/* share w_len */
x_src0		.req	x3
x_src0_end	.req	x4
w_len256	.req	w5	/* share w_len16, w_xor */
x_len256	.req	x5
w_len16		.req	w5
x_len16		.req	x5
w_xor		.req	w5
w_col		.req	w6
x_col		.req	x6
x_src_ptr	.req	x7
x_srcn		.req	x9
x_dst		.req	x10
x_dst_ptr	.req	x11
/* v0  ~ v15: temporary results */
/* v16 ~ v31: next 256 bytes */

/*
 *                 +----------+            +------------------+
 *         src --> |  src[0]  | - src0 ->  |      buffer      | src0_end
 *         --------+----------+            +------------------+
 *           .     |  ......  |
 *           .     +----------+            +------------------+
 *     src_ptr ~~> |  src[n]  | - srcn ~>  |      buffer      |
 *           .     +----------+            +------------------+
 *           .     |  ......  |
 *           .     +----------+
 *           .     | src[v-2] |
 *         --------+----------+            +------------------+
 *     dst_ptr --> | src[v-1] | -- dst --> |      buffer      |
 *                 +----------+            +------------------+
 */

cdecl(xor_gen_neon):
	/*
	 * dst_ptr = &src[vects - 1]; dst = src[vects - 1].
	 * vects is an int, so extend w0 explicitly instead of trusting
	 * the upper half of x0.
	 */
	add	x_dst_ptr, x_src, w_vects, sxtw #3
	ldr	x_dst, [x_dst_ptr, #-8]!
	ldr	x_src0, [x_src]
	/* src0_end = src0 + (int64_t)len, same reasoning for w1/x1 */
	add	x_src0_end, x_src0, w_len, sxtw

	/* w_vects = number of sources besides src[0] (0 => plain copy) */
	sub	w_vects, w_vects, #2
	/* w_col = byte offset already processed; writing w6 zeroes x6[63:32] */
	mov	w_col, #0

.Lloop256_init:
	/* len256 = len - len%256; len %= 256 */
	mov	w_len256, w_len
	and	w_len, w_len, #0xFF
	sub	w_len256, w_len256, w_len

	/* less than 256 bytes? */
	cbz	w_len256, .Lloop16_init

	/* save d8 ~ d15 to stack (sp stays 16-byte aligned) */
	sub	sp, sp, #64
	stp	d8, d9, [sp]
	stp	d10, d11, [sp, #16]
	stp	d12, d13, [sp, #32]
	stp	d14, d15, [sp, #48]

	/* bias the end pointer so "src0 <= src0_end" means ">= 256 left" */
	sub	x_src0_end, x_src0_end, #256

	/* batch process (vects-1)*256 bytes */
.Lloop256:
	ldr	q0, [x_src0, #16*0]
	ldr	q1, [x_src0, #16*1]
	ldr	q2, [x_src0, #16*2]
	ldr	q3, [x_src0, #16*3]
	ldr	q4, [x_src0, #16*4]
	ldr	q5, [x_src0, #16*5]
	ldr	q6, [x_src0, #16*6]
	ldr	q7, [x_src0, #16*7]
	ldr	q8, [x_src0, #16*8]
	ldr	q9, [x_src0, #16*9]
	ldr	q10, [x_src0, #16*10]
	ldr	q11, [x_src0, #16*11]
	ldr	q12, [x_src0, #16*12]
	ldr	q13, [x_src0, #16*13]
	ldr	q14, [x_src0, #16*14]
	ldr	q15, [x_src0, #16*15]
	add	x_src0, x_src0, #256

	/* only src[0] to combine: store it straight to dst */
	cbz	w_vects, .Lloop256_vects_end

	add	x_src_ptr, x_src, #8
.Lloop256_vects:
	ldr	x_srcn, [x_src_ptr], #8
	add	x_srcn, x_srcn, x_col
	/* flags consumed by the bne below; ldr/eor leave them untouched */
	cmp	x_src_ptr, x_dst_ptr

	ldr	q16, [x_srcn, #16*0]
	ldr	q17, [x_srcn, #16*1]
	ldr	q18, [x_srcn, #16*2]
	ldr	q19, [x_srcn, #16*3]
	ldr	q20, [x_srcn, #16*4]
	ldr	q21, [x_srcn, #16*5]
	ldr	q22, [x_srcn, #16*6]
	ldr	q23, [x_srcn, #16*7]
	ldr	q24, [x_srcn, #16*8]
	ldr	q25, [x_srcn, #16*9]
	ldr	q26, [x_srcn, #16*10]
	ldr	q27, [x_srcn, #16*11]
	ldr	q28, [x_srcn, #16*12]
	ldr	q29, [x_srcn, #16*13]
	ldr	q30, [x_srcn, #16*14]
	ldr	q31, [x_srcn, #16*15]

	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v17.16b
	eor	v2.16b, v2.16b, v18.16b
	eor	v3.16b, v3.16b, v19.16b
	eor	v4.16b, v4.16b, v20.16b
	eor	v5.16b, v5.16b, v21.16b
	eor	v6.16b, v6.16b, v22.16b
	eor	v7.16b, v7.16b, v23.16b
	eor	v8.16b, v8.16b, v24.16b
	eor	v9.16b, v9.16b, v25.16b
	eor	v10.16b, v10.16b, v26.16b
	eor	v11.16b, v11.16b, v27.16b
	eor	v12.16b, v12.16b, v28.16b
	eor	v13.16b, v13.16b, v29.16b
	eor	v14.16b, v14.16b, v30.16b
	eor	v15.16b, v15.16b, v31.16b

	bne	.Lloop256_vects

.Lloop256_vects_end:
	str	q0, [x_dst, #16*0]
	str	q1, [x_dst, #16*1]
	str	q2, [x_dst, #16*2]
	str	q3, [x_dst, #16*3]
	str	q4, [x_dst, #16*4]
	str	q5, [x_dst, #16*5]
	str	q6, [x_dst, #16*6]
	str	q7, [x_dst, #16*7]
	str	q8, [x_dst, #16*8]
	str	q9, [x_dst, #16*9]
	str	q10, [x_dst, #16*10]
	str	q11, [x_dst, #16*11]
	str	q12, [x_dst, #16*12]
	str	q13, [x_dst, #16*13]
	str	q14, [x_dst, #16*14]
	str	q15, [x_dst, #16*15]

	/* the two adds below do not set flags, so bls still sees this cmp */
	cmp	x_src0, x_src0_end
	add	x_dst, x_dst, #256
	add	w_col, w_col, #256
	bls	.Lloop256

.Lloop256_end:
	/* restore d8 ~ d15 */
	ldp	d8, d9, [sp]
	ldp	d10, d11, [sp, #16]
	ldp	d12, d13, [sp, #32]
	ldp	d14, d15, [sp, #48]
	add	sp, sp, #64

	/* undo the 256-byte bias on the end pointer */
	add	x_src0_end, x_src0_end, #256

.Lloop16_init:
	/* len16 = len - len%16; len %= 16 */
	mov	w_len16, w_len
	and	w_len, w_len, #0xF
	sub	w_len16, w_len16, w_len

	/* less than 16 bytes? */
	cbz	w_len16, .Lloop1_init

	/* bias the end pointer so "src0 <= src0_end" means ">= 16 left" */
	sub	x_src0_end, x_src0_end, #16

	/* batch process (vects-1)*16 bytes */
.Lloop16:
	ldr	q0, [x_src0], #16
	cbz	w_vects, .Lloop16_vects_end

	add	x_src_ptr, x_src, #8
.Lloop16_vects:
	ldr	x_srcn, [x_src_ptr], #8
	cmp	x_src_ptr, x_dst_ptr
	ldr	q1, [x_srcn, x_col]
	eor	v0.16b, v0.16b, v1.16b
	bne	.Lloop16_vects

.Lloop16_vects_end:
	cmp	x_src0, x_src0_end
	str	q0, [x_dst], #16
	add	w_col, w_col, #16
	bls	.Lloop16

.Lloop16_end:
	/* undo the 16-byte bias on the end pointer */
	add	x_src0_end, x_src0_end, #16

.Lloop1_init:
	/* no tail bytes left? */
	cbz	w_len, .Lreturn

	/* batch process (vects-1)*1 bytes */
.Lloop1:
	/* w_len is dead from here on: w1 is reused as w_in below */
	ldrb	w_xor, [x_src0], #1
	cbz	w_vects, .Lloop1_vects_end

	add	x_src_ptr, x_src, #8
.Lloop1_vects:
	ldr	x_srcn, [x_src_ptr], #8
	cmp	x_src_ptr, x_dst_ptr
	ldrb	w_in, [x_srcn, x_col]
	eor	w_xor, w_xor, w_in
	bne	.Lloop1_vects

.Lloop1_vects_end:
	cmp	x_src0, x_src0_end
	strb	w_xor, [x_dst], #1
	add	w_col, w_col, #1
	bne	.Lloop1

.Lreturn:
	mov	w_ret, #0
	ret