/*	$NetBSD: chacha_neon.c,v 1.8 2020/08/08 14:47:01 riastradh Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/types.h>
#include <sys/endian.h>

#include "arm_neon.h"
#include "arm_neon_imm.h"
#include "chacha_neon.h"

/*
 * Tempting to use VSHL/VSRI instead of VSHL/VSHR/VORR, but in practice
 * it hurts performance at least on Cortex-A8.
 */
#if 1
#define	vrolq_n_u32(x, n)	(vshlq_n_u32(x, n) | vshrq_n_u32(x, 32 - (n)))
#else
#define	vrolq_n_u32(x, n)	vsriq_n_u32(vshlq_n_u32(x, n), x, 32 - (n))
#endif

static inline uint32x4_t
rol16(uint32x4_t x)
{
	uint16x8_t y16, x16 = vreinterpretq_u16_u32(x);

	y16 = vrev32q_u16(x16);

	return vreinterpretq_u32_u16(y16);
}

static inline uint32x4_t
rol12(uint32x4_t x)
{

	return vrolq_n_u32(x, 12);
}

static inline uint32x4_t
rol8(uint32x4_t x)
{
#if defined(__aarch64__)
	static const uint8x16_t rol8_tab = VQ_N_U8(
		 3, 0, 1, 2,  7, 4, 5, 6,
		11, 8, 9,10, 15,12,13,14
	);
	uint8x16_t y8, x8 = vreinterpretq_u8_u32(x);

	y8 = vqtbl1q_u8(x8, rol8_tab);

	return vreinterpretq_u32_u8(y8);
#elif 0
	/*
	 * GCC does a lousy job with this, spilling two 64-bit vector
	 * registers to the stack every time.  There should be plenty
	 * of vector registers free, requiring no spills at all, and
	 * GCC should be able to hoist the load of rol8_tab out of any
	 * loops, but it doesn't and so attempting to use VTBL hurts
	 * more than it helps.
	 */
	static const uint8x8_t rol8_tab = V_N_U8(
		 3, 0, 1, 2,  7, 4, 5, 6
	);

	uint64x2_t y64, x64 = vreinterpretq_u64_u32(x);

	y64 = (uint64x2_t) {
		(uint64_t)vtbl1_u8((uint8x8_t)x64[0], rol8_tab),
		(uint64_t)vtbl1_u8((uint8x8_t)x64[1], rol8_tab),
	};

	return vreinterpretq_u32_u64(y64);
#else
	return vrolq_n_u32(x, 8);
#endif
}

static inline uint32x4_t
rol7(uint32x4_t x)
{

	return vrolq_n_u32(x, 7);
}

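/*
 * chacha_permute applies nr rounds (nr/2 double rounds) of the ChaCha
 * permutation to the 4x4 state held as the four row vectors *p0..*p3.
 * Each loop iteration does one column round followed by one diagonal
 * round; each group of vector statements performs the scalar
 * quarter-round
 *
 *	a += b; d ^= a; d <<<= 16;
 *	c += d; b ^= c; b <<<= 12;
 *	a += b; d ^= a; d <<<= 8;
 *	c += d; b ^= c; b <<<= 7;
 *
 * on all four columns at once.  The vextq_u32 rotations shear rows
 * 1-3 so that the diagonals line up as columns, and shear them back
 * afterwards.
 */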
static inline void
chacha_permute(uint32x4_t *p0, uint32x4_t *p1, uint32x4_t *p2, uint32x4_t *p3,
    unsigned nr)
{
	uint32x4_t r0, r1, r2, r3;
	uint32x4_t c0, c1, c2, c3;

	r0 = *p0;
	r1 = *p1;
	r2 = *p2;
	r3 = *p3;

	for (; nr > 0; nr -= 2) {
		r0 = vaddq_u32(r0, r1); r3 ^= r0; r3 = rol16(r3);
		r2 = vaddq_u32(r2, r3); r1 ^= r2; r1 = rol12(r1);
		r0 = vaddq_u32(r0, r1); r3 ^= r0; r3 = rol8(r3);
		r2 = vaddq_u32(r2, r3); r1 ^= r2; r1 = rol7(r1);

		c0 = r0;
		c1 = vextq_u32(r1, r1, 1);
		c2 = vextq_u32(r2, r2, 2);
		c3 = vextq_u32(r3, r3, 3);

		c0 = vaddq_u32(c0, c1); c3 ^= c0; c3 = rol16(c3);
		c2 = vaddq_u32(c2, c3); c1 ^= c2; c1 = rol12(c1);
		c0 = vaddq_u32(c0, c1); c3 ^= c0; c3 = rol8(c3);
		c2 = vaddq_u32(c2, c3); c1 ^= c2; c1 = rol7(c1);

		r0 = c0;
		r1 = vextq_u32(c1, c1, 3);
		r2 = vextq_u32(c2, c2, 2);
		r3 = vextq_u32(c3, c3, 1);
	}

	*p0 = r0;
	*p1 = r1;
	*p2 = r2;
	*p3 = r3;
}

void
chacha_core_neon(uint8_t out[restrict static 64],
    const uint8_t in[static 16],
    const uint8_t k[static 32],
    const uint8_t c[static 16],
    unsigned nr)
{
	uint32x4_t in0, in1, in2, in3;
	uint32x4_t r0, r1, r2, r3;

	r0 = in0 = vreinterpretq_u32_u8(vld1q_u8(c));
	r1 = in1 = vreinterpretq_u32_u8(vld1q_u8(k + 0));
	r2 = in2 = vreinterpretq_u32_u8(vld1q_u8(k + 16));
	r3 = in3 = vreinterpretq_u32_u8(vld1q_u8(in));

	chacha_permute(&r0, &r1, &r2, &r3, nr);

	vst1q_u8(out + 0, vreinterpretq_u8_u32(vaddq_u32(r0, in0)));
	vst1q_u8(out + 16, vreinterpretq_u8_u32(vaddq_u32(r1, in1)));
	vst1q_u8(out + 32, vreinterpretq_u8_u32(vaddq_u32(r2, in2)));
	vst1q_u8(out + 48, vreinterpretq_u8_u32(vaddq_u32(r3, in3)));
}

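/*
 * hchacha_neon is the HChaCha core used for XChaCha key derivation:
 * it runs the same permutation as chacha_core_neon but omits the
 * final feed-forward addition of the input state and returns only
 * the first and last rows of the permuted state as a 32-byte output.
 */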
void
hchacha_neon(uint8_t out[restrict static 32],
    const uint8_t in[static 16],
    const uint8_t k[static 32],
    const uint8_t c[static 16],
    unsigned nr)
{
	uint32x4_t r0, r1, r2, r3;

	r0 = vreinterpretq_u32_u8(vld1q_u8(c));
	r1 = vreinterpretq_u32_u8(vld1q_u8(k + 0));
	r2 = vreinterpretq_u32_u8(vld1q_u8(k + 16));
	r3 = vreinterpretq_u32_u8(vld1q_u8(in));

	chacha_permute(&r0, &r1, &r2, &r3, nr);

	vst1q_u8(out + 0, vreinterpretq_u8_u32(r0));
	vst1q_u8(out + 16, vreinterpretq_u8_u32(r3));
}

void
chacha_stream_neon(uint8_t *restrict s, size_t n,
    uint32_t blkno,
    const uint8_t nonce[static 12],
    const uint8_t k[static 32],
    unsigned nr)
{

	for (; n >= 256; s += 256, n -= 256, blkno += 4)
		chacha_stream256_neon(s, blkno, nonce, k, chacha_const32, nr);

	if (n) {
		const uint32x4_t blkno_inc = /* (1,0,0,0) */
		    vsetq_lane_u32(1, vdupq_n_u32(0), 0);
		uint32x4_t in0, in1, in2, in3;
		uint32x4_t r0, r1, r2, r3;

		in0 = vreinterpretq_u32_u8(vld1q_u8(chacha_const32));
		in1 = vreinterpretq_u32_u8(vld1q_u8(k + 0));
		in2 = vreinterpretq_u32_u8(vld1q_u8(k + 16));
		in3 = (uint32x4_t) VQ_N_U32(
			blkno,
			le32dec(nonce),
			le32dec(nonce + 4),
			le32dec(nonce + 8)
		);

		for (; n; s += 64, n -= 64) {
			r0 = in0;
			r1 = in1;
			r2 = in2;
			r3 = in3;
			chacha_permute(&r0, &r1, &r2, &r3, nr);
			r0 = vaddq_u32(r0, in0);
			r1 = vaddq_u32(r1, in1);
			r2 = vaddq_u32(r2, in2);
			r3 = vaddq_u32(r3, in3);

			if (n < 64) {
				uint8_t buf[64] __aligned(16);

				vst1q_u8(buf + 0, vreinterpretq_u8_u32(r0));
				vst1q_u8(buf + 16, vreinterpretq_u8_u32(r1));
				vst1q_u8(buf + 32, vreinterpretq_u8_u32(r2));
				vst1q_u8(buf + 48, vreinterpretq_u8_u32(r3));
				memcpy(s, buf, n);

				break;
			}

			vst1q_u8(s + 0, vreinterpretq_u8_u32(r0));
			vst1q_u8(s + 16, vreinterpretq_u8_u32(r1));
			vst1q_u8(s + 32, vreinterpretq_u8_u32(r2));
			vst1q_u8(s + 48, vreinterpretq_u8_u32(r3));
			in3 = vaddq_u32(in3, blkno_inc);
		}
	}
}

void
chacha_stream_xor_neon(uint8_t *s, const uint8_t *p, size_t n,
    uint32_t blkno,
    const uint8_t nonce[static 12],
    const uint8_t k[static 32],
    unsigned nr)
{

	for (; n >= 256; s += 256, p += 256, n -= 256, blkno += 4)
		chacha_stream_xor256_neon(s, p, blkno, nonce, k,
		    chacha_const32, nr);

	if (n) {
		const uint32x4_t blkno_inc = /* (1,0,0,0) */
		    vsetq_lane_u32(1, vdupq_n_u32(0), 0);
		uint32x4_t in0, in1, in2, in3;
		uint32x4_t r0, r1, r2, r3;

		in0 = vreinterpretq_u32_u8(vld1q_u8(chacha_const32));
		in1 = vreinterpretq_u32_u8(vld1q_u8(k + 0));
		in2 = vreinterpretq_u32_u8(vld1q_u8(k + 16));
		in3 = (uint32x4_t) VQ_N_U32(
			blkno,
			le32dec(nonce),
			le32dec(nonce + 4),
			le32dec(nonce + 8)
		);

		for (; n; s += 64, p += 64, n -= 64) {
			r0 = in0;
			r1 = in1;
			r2 = in2;
			r3 = in3;
			chacha_permute(&r0, &r1, &r2, &r3, nr);
			r0 = vaddq_u32(r0, in0);
			r1 = vaddq_u32(r1, in1);
			r2 = vaddq_u32(r2, in2);
			r3 = vaddq_u32(r3, in3);

			if (n < 64) {
				uint8_t buf[64] __aligned(16);
				unsigned i;

				vst1q_u8(buf + 0, vreinterpretq_u8_u32(r0));
				vst1q_u8(buf + 16, vreinterpretq_u8_u32(r1));
				vst1q_u8(buf + 32, vreinterpretq_u8_u32(r2));
				vst1q_u8(buf + 48, vreinterpretq_u8_u32(r3));

				for (i = 0; i < n - n%4; i += 4)
					le32enc(s + i,
					    le32dec(p + i) ^ le32dec(buf + i));
				for (; i < n; i++)
					s[i] = p[i] ^ buf[i];

				break;
			}

			r0 ^= vreinterpretq_u32_u8(vld1q_u8(p + 0));
			r1 ^= vreinterpretq_u32_u8(vld1q_u8(p + 16));
			r2 ^= vreinterpretq_u32_u8(vld1q_u8(p + 32));
			r3 ^= vreinterpretq_u32_u8(vld1q_u8(p + 48));
			vst1q_u8(s + 0, vreinterpretq_u8_u32(r0));
			vst1q_u8(s + 16, vreinterpretq_u8_u32(r1));
			vst1q_u8(s + 32, vreinterpretq_u8_u32(r2));
			vst1q_u8(s + 48, vreinterpretq_u8_u32(r3));
			in3 = vaddq_u32(in3, blkno_inc);
		}
	}
}

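/*
 * The XChaCha routines below take a 24-byte nonce: HChaCha over the
 * first 16 bytes of the nonce yields a 32-byte subkey, and the last
 * 8 bytes become the tail of a 12-byte subnonce whose leading 4
 * bytes are zero.  The subkey and subnonce are then handed to the
 * plain ChaCha stream routines above.
 */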
void
xchacha_stream_neon(uint8_t *restrict s, size_t nbytes,
    uint32_t blkno,
    const uint8_t nonce[static 24],
    const uint8_t k[static 32],
    unsigned nr)
{
	uint8_t subkey[32];
	uint8_t subnonce[12];

	hchacha_neon(subkey, nonce/*[0:16)*/, k, chacha_const32, nr);
	memset(subnonce, 0, 4);
	memcpy(subnonce + 4, nonce + 16, 8);
	chacha_stream_neon(s, nbytes, blkno, subnonce, subkey, nr);
}

void
xchacha_stream_xor_neon(uint8_t *restrict c, const uint8_t *p, size_t nbytes,
    uint32_t blkno,
    const uint8_t nonce[static 24],
    const uint8_t k[static 32],
    unsigned nr)
{
	uint8_t subkey[32];
	uint8_t subnonce[12];

	hchacha_neon(subkey, nonce/*[0:16)*/, k, chacha_const32, nr);
	memset(subnonce, 0, 4);
	memcpy(subnonce + 4, nonce + 16, 8);
	chacha_stream_xor_neon(c, p, nbytes, blkno, subnonce, subkey, nr);
}
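
/*
 * Usage sketch (illustrative only, with hypothetical caller-supplied
 * key/nonce/message buffers): XOR a message with 20-round ChaCha
 * keystream starting at block 0 --
 *
 *	uint8_t key[32], nonce[12], msg[256], ctxt[256];
 *	...
 *	chacha_stream_xor_neon(ctxt, msg, sizeof msg, 0, nonce, key, 20);
 *
 * where 0 is the initial block counter and 20 the round count.
 */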