/*
 * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
24*0957b409SSimon J. Gerraty
25*0957b409SSimon J. Gerraty #include "inner.h"
26*0957b409SSimon J. Gerraty
/*
 * Absorb message bytes into the Poly1305 accumulator. For each 16-byte
 * block, the block value (with the extra "high bit" set just above its
 * 128 bits) is added to the accumulator, and the sum is multiplied by
 * the r key modulo 2^130-5. A trailing block shorter than 16 bytes is
 * right-padded with zeros before being processed.
 *
 * acc[] and r[] each hold five 26-bit limbs, least significant limb
 * first; on input, limbs may use one extra bit (27 bits).
 *
 * On return, every limb of acc[] fits on 26 bits, except acc[1], which
 * may exceed that range by a very small amount.
 */
static void
poly1305_inner(uint32_t *acc, const uint32_t *r, const void *data, size_t len)
{
	/*
	 * Representation: 130-bit values are split into five 26-bit
	 * limbs held in 32-bit variables, which leaves headroom for
	 * carries. The approach follows the public-domain code at:
	 *    https://github.com/floodyberry/poly1305-donna
	 *
	 * Arithmetic is modulo p = 2^130-5, so x*2^130 = x*5 mod p:
	 * product terms that would land above limb 4 are folded back
	 * into the low limbs by using 5*r instead of r.
	 */
	const unsigned char *src;
	uint32_t h0, h1, h2, h3, h4;
	uint32_t k0, k1, k2, k3, k4;
	uint32_t f1, f2, f3, f4;

	h0 = acc[0];
	h1 = acc[1];
	h2 = acc[2];
	h3 = acc[3];
	h4 = acc[4];

	k0 = r[0];
	k1 = r[1];
	k2 = r[2];
	k3 = r[3];
	k4 = r[4];

	/* Pre-scaled key limbs for the wrapped-around product terms. */
	f1 = k1 * 5;
	f2 = k2 * 5;
	f3 = k3 * 5;
	f4 = k4 * 5;

#define MUL64(x, y)   ((uint64_t)(x) * (uint64_t)(y))

	for (src = data; len > 0; src += 16, len -= 16) {
		unsigned char pad[16];
		uint64_t d0, d1, d2, d3, d4;
		uint64_t carry;

		/*
		 * A short final block is copied into a zero-filled
		 * scratch buffer and then handled as a full block.
		 */
		if (len < 16) {
			memset(pad, 0, sizeof pad);
			memcpy(pad, src, len);
			src = pad;
			len = 16;
		}

		/*
		 * Split the 16 bytes into 26-bit limbs and add them to
		 * the accumulator. The OR on the top limb sets the
		 * block's "high bit" (bit 128 of the block value).
		 */
		h0 += br_dec32le(src) & 0x03FFFFFF;
		h1 += (br_dec32le(src + 3) >> 2) & 0x03FFFFFF;
		h2 += (br_dec32le(src + 6) >> 4) & 0x03FFFFFF;
		h3 += (br_dec32le(src + 9) >> 6) & 0x03FFFFFF;
		h4 += (br_dec32le(src + 12) >> 8) | 0x01000000;

		/*
		 * Schoolbook multiplication by r, with the high terms
		 * already folded through the f* (= 5*r) values.
		 */
		d0 = MUL64(h0, k0) + MUL64(h1, f4) + MUL64(h2, f3)
			+ MUL64(h3, f2) + MUL64(h4, f1);
		d1 = MUL64(h0, k1) + MUL64(h1, k0) + MUL64(h2, f4)
			+ MUL64(h3, f3) + MUL64(h4, f2);
		d2 = MUL64(h0, k2) + MUL64(h1, k1) + MUL64(h2, k0)
			+ MUL64(h3, f4) + MUL64(h4, f3);
		d3 = MUL64(h0, k3) + MUL64(h1, k2) + MUL64(h2, k1)
			+ MUL64(h3, k0) + MUL64(h4, f4);
		d4 = MUL64(h0, k4) + MUL64(h1, k3) + MUL64(h2, k2)
			+ MUL64(h3, k1) + MUL64(h4, k0);

		/*
		 * Partial reduction: ripple the carries upward through
		 * the 64-bit limbs, truncating each to 26 bits.
		 */
		carry = d0 >> 26;
		h0 = (uint32_t)d0 & 0x3FFFFFF;
		d1 += carry;
		carry = d1 >> 26;
		h1 = (uint32_t)d1 & 0x3FFFFFF;
		d2 += carry;
		carry = d2 >> 26;
		h2 = (uint32_t)d2 & 0x3FFFFFF;
		d3 += carry;
		carry = d3 >> 26;
		h3 = (uint32_t)d3 & 0x3FFFFFF;
		d4 += carry;
		carry = d4 >> 26;
		h4 = (uint32_t)d4 & 0x3FFFFFF;

		/*
		 * The carry out of the top limb has weight 2^130, so it
		 * re-enters limb 0 multiplied by 5. One more carry step
		 * goes into limb 1, which is left unmasked.
		 */
		h0 += (uint32_t)carry * 5;
		h1 += h0 >> 26;
		h0 &= 0x3FFFFFF;
	}

#undef MUL64

	acc[0] = h0;
	acc[1] = h1;
	acc[2] = h2;
	acc[3] = h3;
	acc[4] = h4;
}
145*0957b409SSimon J. Gerraty
146*0957b409SSimon J. Gerraty /* see bearssl_block.h */
147*0957b409SSimon J. Gerraty void
br_poly1305_ctmul_run(const void * key,const void * iv,void * data,size_t len,const void * aad,size_t aad_len,void * tag,br_chacha20_run ichacha,int encrypt)148*0957b409SSimon J. Gerraty br_poly1305_ctmul_run(const void *key, const void *iv,
149*0957b409SSimon J. Gerraty void *data, size_t len, const void *aad, size_t aad_len,
150*0957b409SSimon J. Gerraty void *tag, br_chacha20_run ichacha, int encrypt)
151*0957b409SSimon J. Gerraty {
152*0957b409SSimon J. Gerraty unsigned char pkey[32], foot[16];
153*0957b409SSimon J. Gerraty uint32_t r[5], acc[5], cc, ctl, hi;
154*0957b409SSimon J. Gerraty uint64_t w;
155*0957b409SSimon J. Gerraty int i;
156*0957b409SSimon J. Gerraty
157*0957b409SSimon J. Gerraty /*
158*0957b409SSimon J. Gerraty * Compute the MAC key. The 'r' value is the first 16 bytes of
159*0957b409SSimon J. Gerraty * pkey[].
160*0957b409SSimon J. Gerraty */
161*0957b409SSimon J. Gerraty memset(pkey, 0, sizeof pkey);
162*0957b409SSimon J. Gerraty ichacha(key, iv, 0, pkey, sizeof pkey);
163*0957b409SSimon J. Gerraty
164*0957b409SSimon J. Gerraty /*
165*0957b409SSimon J. Gerraty * If encrypting, ChaCha20 must run first, followed by Poly1305.
166*0957b409SSimon J. Gerraty * When decrypting, the operations are reversed.
167*0957b409SSimon J. Gerraty */
168*0957b409SSimon J. Gerraty if (encrypt) {
169*0957b409SSimon J. Gerraty ichacha(key, iv, 1, data, len);
170*0957b409SSimon J. Gerraty }
171*0957b409SSimon J. Gerraty
172*0957b409SSimon J. Gerraty /*
173*0957b409SSimon J. Gerraty * Run Poly1305. We must process the AAD, then ciphertext, then
174*0957b409SSimon J. Gerraty * the footer (with the lengths). Note that the AAD and ciphertext
175*0957b409SSimon J. Gerraty * are meant to be padded with zeros up to the next multiple of 16,
176*0957b409SSimon J. Gerraty * and the length of the footer is 16 bytes as well.
177*0957b409SSimon J. Gerraty */
178*0957b409SSimon J. Gerraty
179*0957b409SSimon J. Gerraty /*
180*0957b409SSimon J. Gerraty * Decode the 'r' value into 26-bit words, with the "clamping"
181*0957b409SSimon J. Gerraty * operation applied.
182*0957b409SSimon J. Gerraty */
183*0957b409SSimon J. Gerraty r[0] = br_dec32le(pkey) & 0x03FFFFFF;
184*0957b409SSimon J. Gerraty r[1] = (br_dec32le(pkey + 3) >> 2) & 0x03FFFF03;
185*0957b409SSimon J. Gerraty r[2] = (br_dec32le(pkey + 6) >> 4) & 0x03FFC0FF;
186*0957b409SSimon J. Gerraty r[3] = (br_dec32le(pkey + 9) >> 6) & 0x03F03FFF;
187*0957b409SSimon J. Gerraty r[4] = (br_dec32le(pkey + 12) >> 8) & 0x000FFFFF;
188*0957b409SSimon J. Gerraty
189*0957b409SSimon J. Gerraty /*
190*0957b409SSimon J. Gerraty * Accumulator is 0.
191*0957b409SSimon J. Gerraty */
192*0957b409SSimon J. Gerraty memset(acc, 0, sizeof acc);
193*0957b409SSimon J. Gerraty
194*0957b409SSimon J. Gerraty /*
195*0957b409SSimon J. Gerraty * Process the additional authenticated data, ciphertext, and
196*0957b409SSimon J. Gerraty * footer in due order.
197*0957b409SSimon J. Gerraty */
198*0957b409SSimon J. Gerraty br_enc64le(foot, (uint64_t)aad_len);
199*0957b409SSimon J. Gerraty br_enc64le(foot + 8, (uint64_t)len);
200*0957b409SSimon J. Gerraty poly1305_inner(acc, r, aad, aad_len);
201*0957b409SSimon J. Gerraty poly1305_inner(acc, r, data, len);
202*0957b409SSimon J. Gerraty poly1305_inner(acc, r, foot, sizeof foot);
203*0957b409SSimon J. Gerraty
204*0957b409SSimon J. Gerraty /*
205*0957b409SSimon J. Gerraty * Finalise modular reduction. This is done with carry propagation
206*0957b409SSimon J. Gerraty * and applying the '2^130 = -5 mod p' rule. Note that the output
207*0957b409SSimon J. Gerraty * of poly1035_inner() is already mostly reduced, since only
208*0957b409SSimon J. Gerraty * acc[1] may be (very slightly) above 2^26. A single loop back
209*0957b409SSimon J. Gerraty * to acc[1] will be enough to make the value fit in 130 bits.
210*0957b409SSimon J. Gerraty */
211*0957b409SSimon J. Gerraty cc = 0;
212*0957b409SSimon J. Gerraty for (i = 1; i <= 6; i ++) {
213*0957b409SSimon J. Gerraty int j;
214*0957b409SSimon J. Gerraty
215*0957b409SSimon J. Gerraty j = (i >= 5) ? i - 5 : i;
216*0957b409SSimon J. Gerraty acc[j] += cc;
217*0957b409SSimon J. Gerraty cc = acc[j] >> 26;
218*0957b409SSimon J. Gerraty acc[j] &= 0x03FFFFFF;
219*0957b409SSimon J. Gerraty }
220*0957b409SSimon J. Gerraty
221*0957b409SSimon J. Gerraty /*
222*0957b409SSimon J. Gerraty * We may still have a value in the 2^130-5..2^130-1 range, in
223*0957b409SSimon J. Gerraty * which case we must reduce it again. The code below selects,
224*0957b409SSimon J. Gerraty * in constant-time, between 'acc' and 'acc-p',
225*0957b409SSimon J. Gerraty */
226*0957b409SSimon J. Gerraty ctl = GT(acc[0], 0x03FFFFFA);
227*0957b409SSimon J. Gerraty for (i = 1; i < 5; i ++) {
228*0957b409SSimon J. Gerraty ctl &= EQ(acc[i], 0x03FFFFFF);
229*0957b409SSimon J. Gerraty }
230*0957b409SSimon J. Gerraty cc = 5;
231*0957b409SSimon J. Gerraty for (i = 0; i < 5; i ++) {
232*0957b409SSimon J. Gerraty uint32_t t;
233*0957b409SSimon J. Gerraty
234*0957b409SSimon J. Gerraty t = (acc[i] + cc);
235*0957b409SSimon J. Gerraty cc = t >> 26;
236*0957b409SSimon J. Gerraty t &= 0x03FFFFFF;
237*0957b409SSimon J. Gerraty acc[i] = MUX(ctl, t, acc[i]);
238*0957b409SSimon J. Gerraty }
239*0957b409SSimon J. Gerraty
240*0957b409SSimon J. Gerraty /*
241*0957b409SSimon J. Gerraty * Convert back the accumulator to 32-bit words, and add the
242*0957b409SSimon J. Gerraty * 's' value (second half of pkey[]). That addition is done
243*0957b409SSimon J. Gerraty * modulo 2^128.
244*0957b409SSimon J. Gerraty */
245*0957b409SSimon J. Gerraty w = (uint64_t)acc[0] + ((uint64_t)acc[1] << 26) + br_dec32le(pkey + 16);
246*0957b409SSimon J. Gerraty br_enc32le((unsigned char *)tag, (uint32_t)w);
247*0957b409SSimon J. Gerraty w = (w >> 32) + ((uint64_t)acc[2] << 20) + br_dec32le(pkey + 20);
248*0957b409SSimon J. Gerraty br_enc32le((unsigned char *)tag + 4, (uint32_t)w);
249*0957b409SSimon J. Gerraty w = (w >> 32) + ((uint64_t)acc[3] << 14) + br_dec32le(pkey + 24);
250*0957b409SSimon J. Gerraty br_enc32le((unsigned char *)tag + 8, (uint32_t)w);
251*0957b409SSimon J. Gerraty hi = (uint32_t)(w >> 32) + (acc[4] << 8) + br_dec32le(pkey + 28);
252*0957b409SSimon J. Gerraty br_enc32le((unsigned char *)tag + 12, hi);
253*0957b409SSimon J. Gerraty
254*0957b409SSimon J. Gerraty /*
255*0957b409SSimon J. Gerraty * If decrypting, then ChaCha20 runs _after_ Poly1305.
256*0957b409SSimon J. Gerraty */
257*0957b409SSimon J. Gerraty if (!encrypt) {
258*0957b409SSimon J. Gerraty ichacha(key, iv, 1, data, len);
259*0957b409SSimon J. Gerraty }
260*0957b409SSimon J. Gerraty }
261