Upgrade bearssl to acc70b1be60a6f321e2da618cd35d901b1a598a4

2020-02-07 12:30:46 -08:00 · 2020-02-07 12:30:46 -08:00 · 089f842558
commit 089f842558
parent 1af9d2022b
48 changed files with 7091 additions and 132 deletions
--- a/src/SSLClient.cpp
+++ b/src/SSLClient.cpp
@ -392,6 +392,13 @@ int SSLClient::m_run_until(const unsigned target) {
        unsigned state = m_update_engine();
 	// error check
        if (state == BR_SSL_CLOSED || getWriteError() != SSL_OK) {
+            if (state == BR_SSL_CLOSED) {
+                m_warn("Terminating because the ssl engine closed", func_name);
+            }
+            else {
+                m_warn("Terminating with write error: ", func_name);
+                m_warn(getWriteError(), func_name);
+            }
            return -1;
        }
        // timeout check
@ -406,7 +413,7 @@ int SSLClient::m_run_until(const unsigned target) {
            lastState = state;
            m_info("m_run changed state:", func_name);
            if(m_debug == DebugLevel::SSL_INFO) {
-                m_info("State: ", __func__);
+                m_info("State: ", func_name);
                if(state == 0) Serial.println("    Invalid");
                else if (state & BR_SSL_CLOSED) Serial.println("   Connection closed");
                else {
@ -728,6 +735,6 @@ void SSLClient::m_print_br_error(const unsigned br_error_code, const DebugLevel
    case BR_ERR_X509_FORBIDDEN_KEY_USAGE: Serial.println("Key Usage extension prohibits intended usage."); break;
    case BR_ERR_X509_WEAK_PUBLIC_KEY: Serial.println("Public key found in certificate is too small."); break;
    case BR_ERR_X509_NOT_TRUSTED: Serial.println("Chain could not be linked to a trust anchor."); break;
-    default: Serial.println("Unknown error code."); break;
+    default: Serial.print("Unknown error code: "); Serial.println(br_error_code); break;
  }
 }
--- a/src/bearssl/src/ec/ec_all_m31.c
+++ b/src/bearssl/src/ec/ec_all_m31.c
@ -29,9 +29,17 @@ api_generator(int curve, size_t *len)
 {
 	switch (curve) {
 	case BR_EC_secp256r1:
+#if BR_INT128 || BR_UMUL128
+		return br_ec_p256_m64.generator(curve, len);
+#else
 		return br_ec_p256_m31.generator(curve, len);
+#endif
 	case BR_EC_curve25519:
+#if BR_INT128 || BR_UMUL128
+		return br_ec_c25519_m64.generator(curve, len);
+#else
 		return br_ec_c25519_m31.generator(curve, len);
+#endif
 	default:
 		return br_ec_prime_i31.generator(curve, len);
 	}
@ -42,9 +50,17 @@ api_order(int curve, size_t *len)
 {
 	switch (curve) {
 	case BR_EC_secp256r1:
+#if BR_INT128 || BR_UMUL128
+		return br_ec_p256_m64.order(curve, len);
+#else
 		return br_ec_p256_m31.order(curve, len);
+#endif
 	case BR_EC_curve25519:
+#if BR_INT128 || BR_UMUL128
+		return br_ec_c25519_m64.order(curve, len);
+#else
 		return br_ec_c25519_m31.order(curve, len);
+#endif
 	default:
 		return br_ec_prime_i31.order(curve, len);
 	}
@ -55,9 +71,17 @@ api_xoff(int curve, size_t *len)
 {
 	switch (curve) {
 	case BR_EC_secp256r1:
+#if BR_INT128 || BR_UMUL128
+		return br_ec_p256_m64.xoff(curve, len);
+#else
 		return br_ec_p256_m31.xoff(curve, len);
+#endif
 	case BR_EC_curve25519:
+#if BR_INT128 || BR_UMUL128
+		return br_ec_c25519_m64.xoff(curve, len);
+#else
 		return br_ec_c25519_m31.xoff(curve, len);
+#endif
 	default:
 		return br_ec_prime_i31.xoff(curve, len);
 	}
@ -69,9 +93,17 @@ api_mul(unsigned char *G, size_t Glen,
 {
 	switch (curve) {
 	case BR_EC_secp256r1:
+#if BR_INT128 || BR_UMUL128
+		return br_ec_p256_m64.mul(G, Glen, kb, kblen, curve);
+#else
 		return br_ec_p256_m31.mul(G, Glen, kb, kblen, curve);
+#endif
 	case BR_EC_curve25519:
+#if BR_INT128 || BR_UMUL128
+		return br_ec_c25519_m64.mul(G, Glen, kb, kblen, curve);
+#else
 		return br_ec_c25519_m31.mul(G, Glen, kb, kblen, curve);
+#endif
 	default:
 		return br_ec_prime_i31.mul(G, Glen, kb, kblen, curve);
 	}
@ -83,9 +115,17 @@ api_mulgen(unsigned char *R,
 {
 	switch (curve) {
 	case BR_EC_secp256r1:
+#if BR_INT128 || BR_UMUL128
+		return br_ec_p256_m64.mulgen(R, x, xlen, curve);
+#else
 		return br_ec_p256_m31.mulgen(R, x, xlen, curve);
+#endif
 	case BR_EC_curve25519:
+#if BR_INT128 || BR_UMUL128
+		return br_ec_c25519_m64.mulgen(R, x, xlen, curve);
+#else
 		return br_ec_c25519_m31.mulgen(R, x, xlen, curve);
+#endif
 	default:
 		return br_ec_prime_i31.mulgen(R, x, xlen, curve);
 	}
@ -98,11 +138,21 @@ api_muladd(unsigned char *A, const unsigned char *B, size_t len,
 {
 	switch (curve) {
 	case BR_EC_secp256r1:
+#if BR_INT128 || BR_UMUL128
+		return br_ec_p256_m64.muladd(A, B, len,
+			x, xlen, y, ylen, curve);
+#else
 		return br_ec_p256_m31.muladd(A, B, len,
 			x, xlen, y, ylen, curve);
+#endif
 	case BR_EC_curve25519:
+#if BR_INT128 || BR_UMUL128
+		return br_ec_c25519_m64.muladd(A, B, len,
+			x, xlen, y, ylen, curve);
+#else
 		return br_ec_c25519_m31.muladd(A, B, len,
 			x, xlen, y, ylen, curve);
+#endif
 	default:
 		return br_ec_prime_i31.muladd(A, B, len,
 			x, xlen, y, ylen, curve);
--- a/src/bearssl/src/ec/ec_c25519_i15.c
+++ b/src/bearssl/src/ec/ec_c25519_i15.c
@ -239,11 +239,11 @@ api_mul(unsigned char *G, size_t Glen,
 	x2[1] = 19;
 	memcpy(z3, x2, ILEN);

-	memcpy(k, kb, kblen);
-	memset(k + kblen, 0, (sizeof k) - kblen);
-	k[0] &= 0xF8;
-	k[31] &= 0x7F;
-	k[31] |= 0x40;
+	memset(k, 0, (sizeof k) - kblen);
+	memcpy(k + (sizeof k) - kblen, kb, kblen);
+	k[31] &= 0xF8;
+	k[0] &= 0x7F;
+	k[0] |= 0x40;

 	/* obsolete
 	print_int_mont("x1", x1);
@ -253,7 +253,7 @@ api_mul(unsigned char *G, size_t Glen,
 	for (i = 254; i >= 0; i --) {
 		uint32_t kt;

-		kt = (k[i >> 3] >> (i & 7)) & 1;
+		kt = (k[31 - (i >> 3)] >> (i & 7)) & 1;
 		swap ^= kt;
 		cswap(x2, x3, swap);
 		cswap(z2, z3, swap);
--- a/src/bearssl/src/ec/ec_c25519_i31.c
+++ b/src/bearssl/src/ec/ec_c25519_i31.c
@ -214,7 +214,7 @@ api_mul(unsigned char *G, size_t Glen,
 	 *    br_i31_decode_reduce(a, G, 32, C255_P);
 	 */
 	br_i31_zero(b, 0x108);
-	b[9] = 0x0100;
+	b[9] = 0x0080;
 	br_i31_decode_mod(a, G, 32, b);
 	a[0] = 0x107;
 	br_i31_sub(a, C255_P, NOT(br_i31_sub(a, C255_P, 0)));
@ -230,11 +230,14 @@ api_mul(unsigned char *G, size_t Glen,
 	x2[1] = 0x13000000;
 	memcpy(z3, x2, sizeof x2);

-	memcpy(k, kb, kblen);
-	memset(k + kblen, 0, (sizeof k) - kblen);
-	k[0] &= 0xF8;
-	k[31] &= 0x7F;
-	k[31] |= 0x40;
+	/*
+	 * kb[] is in big-endian notation, but possibly shorter than k[].
+	 */
+	memset(k, 0, (sizeof k) - kblen);
+	memcpy(k + (sizeof k) - kblen, kb, kblen);
+	k[31] &= 0xF8;
+	k[0] &= 0x7F;
+	k[0] |= 0x40;

 	/* obsolete
 	print_int_mont("x1", x1);
@ -244,7 +247,7 @@ api_mul(unsigned char *G, size_t Glen,
 	for (i = 254; i >= 0; i --) {
 		uint32_t kt;

-		kt = (k[i >> 3] >> (i & 7)) & 1;
+		kt = (k[31 - (i >> 3)] >> (i & 7)) & 1;
 		swap ^= kt;
 		cswap(x2, x3, swap);
 		cswap(z2, z3, swap);
--- a/src/bearssl/src/ec/ec_c25519_m15.c
+++ b/src/bearssl/src/ec/ec_c25519_m15.c
@ -1332,11 +1332,11 @@ api_mul(unsigned char *G, size_t Glen,
 	memset(z3, 0, sizeof z3);
 	z3[0] = 1;

-	memcpy(k, kb, kblen);
-	memset(k + kblen, 0, (sizeof k) - kblen);
-	k[0] &= 0xF8;
-	k[31] &= 0x7F;
-	k[31] |= 0x40;
+	memset(k, 0, (sizeof k) - kblen);
+	memcpy(k + (sizeof k) - kblen, kb, kblen);
+	k[31] &= 0xF8;
+	k[0] &= 0x7F;
+	k[0] |= 0x40;

 	/* obsolete
 	print_int("x1", x1);
@ -1346,7 +1346,7 @@ api_mul(unsigned char *G, size_t Glen,
 	for (i = 254; i >= 0; i --) {
 		uint32_t kt;

-		kt = (k[i >> 3] >> (i & 7)) & 1;
+		kt = (k[31 - (i >> 3)] >> (i & 7)) & 1;
 		swap ^= kt;
 		cswap(x2, x3, swap);
 		cswap(z2, z3, swap);
--- a/src/bearssl/src/ec/ec_c25519_m31.c
+++ b/src/bearssl/src/ec/ec_c25519_m31.c
@ -372,8 +372,7 @@ reduce_final_f255(uint32_t *d)
 static void
 f255_mul(uint32_t *d, const uint32_t *a, const uint32_t *b)
 {
-	uint32_t t[18];
-	uint64_t cc, w;
+	uint32_t t[18], cc;
 	int i;

 	/*
@ -389,21 +388,42 @@ f255_mul(uint32_t *d, const uint32_t *a, const uint32_t *b)
 	 * offset 9*30 = 270, word 9+k must be added to word k with
 	 * a factor of 19*2^15 = 622592. The extra bits in word 8 are also
 	 * added that way.
+	 *
+	 * Keeping the carry on 32 bits helps with 32-bit architectures,
+	 * and does not noticeably impact performance on 64-bit systems.
 	 */
-	cc = MUL31(t[8] >> 15, 19);
+	cc = MUL15(t[8] >> 15, 19);  /* at most 19*(2^15-1) = 622573 */
 	t[8] &= 0x7FFF;
 	for (i = 0; i < 9; i ++) {
-		w = (uint64_t)t[i] + cc + MUL31(t[i + 9], 622592);
+		uint64_t w;
+
+		w = (uint64_t)t[i] + (uint64_t)cc + MUL31(t[i + 9], 622592);
 		t[i] = (uint32_t)w & 0x3FFFFFFF;
-		cc = w >> 30;
+		cc = (uint32_t)(w >> 30);  /* at most 622592 */
 	}
-	cc = MUL31(w >> 15, 19);
+
+	/*
+	 * Original product was up to (2^256-1)^2, i.e. a 512-bit integer.
+	 * This was split into two parts (upper of 257 bits, lower of 255
+	 * bits), and the upper was added to the lower with a factor 19,
+	 * which means that the intermediate value is less than 77*2^255
+	 * (19*2^257 + 2^255). Therefore, the extra bits "t[8] >> 15" are
+	 * less than 77, and the initial carry cc is at most 76*19 = 1444.
+	 */
+	cc = MUL15(t[8] >> 15, 19);
 	t[8] &= 0x7FFF;
 	for (i = 0; i < 9; i ++) {
-		w = t[i] + cc;
-		d[i] = (uint32_t)w & 0x3FFFFFFF;
-		cc = w >> 30;
+		uint32_t z;
+
+		z = t[i] + cc;
+		d[i] = z & 0x3FFFFFFF;
+		cc = z >> 30;
 	}
+
+	/*
+	 * Final result is at most 2^255 + 1443. In particular, the last
+	 * carry is necessarily 0, since t[8] was truncated to 15 bits.
+	 */
 }

 /*
@ -415,8 +435,7 @@ f255_mul(uint32_t *d, const uint32_t *a, const uint32_t *b)
 static void
 f255_square(uint32_t *d, const uint32_t *a)
 {
-	uint32_t t[18];
-	uint64_t cc, w;
+	uint32_t t[18], cc;
 	int i;

 	/*
@ -428,24 +447,25 @@ f255_square(uint32_t *d, const uint32_t *a)

 	/*
 	 * Modular reduction: each high word is added where necessary.
-	 * Since the modulus is 2^255-19 and word 9 corresponds to
-	 * offset 9*30 = 270, word 9+k must be added to word k with
-	 * a factor of 19*2^15 = 622592. The extra bits in word 8 are also
-	 * added that way.
+	 * See f255_mul() for details on the reduction and carry limits.
 	 */
-	cc = MUL31(t[8] >> 15, 19);
+	cc = MUL15(t[8] >> 15, 19);
 	t[8] &= 0x7FFF;
 	for (i = 0; i < 9; i ++) {
-		w = (uint64_t)t[i] + cc + MUL31(t[i + 9], 622592);
+		uint64_t w;
+
+		w = (uint64_t)t[i] + (uint64_t)cc + MUL31(t[i + 9], 622592);
 		t[i] = (uint32_t)w & 0x3FFFFFFF;
-		cc = w >> 30;
+		cc = (uint32_t)(w >> 30);
 	}
-	cc = MUL31(w >> 15, 19);
+	cc = MUL15(t[8] >> 15, 19);
 	t[8] &= 0x7FFF;
 	for (i = 0; i < 9; i ++) {
-		w = t[i] + cc;
-		d[i] = (uint32_t)w & 0x3FFFFFFF;
-		cc = w >> 30;
+		uint32_t z;
+
+		z = t[i] + cc;
+		d[i] = z & 0x3FFFFFFF;
+		cc = z >> 30;
 	}
 }

@ -515,20 +535,31 @@ static void
 f255_mul_a24(uint32_t *d, const uint32_t *a)
 {
 	int i;
-	uint64_t cc, w;
+	uint64_t w;
+	uint32_t cc;

+	/*
+	 * a[] is over 256 bits, thus a[8] has length at most 16 bits.
+	 * We single out the processing of the last word: intermediate
+	 * value w is up to 121665*2^16, yielding a carry for the next
+	 * loop of at most 19*(121665*2^16/2^15) = 4623270.
+	 */
 	cc = 0;
-	for (i = 0; i < 9; i ++) {
-		w = MUL31(a[i], 121665) + cc;
+	for (i = 0; i < 8; i ++) {
+		w = MUL31(a[i], 121665) + (uint64_t)cc;
 		d[i] = (uint32_t)w & 0x3FFFFFFF;
-		cc = w >> 30;
+		cc = (uint32_t)(w >> 30);
 	}
-	cc = MUL31((uint32_t)(w >> 15), 19);
-	d[8] &= 0x7FFF;
+	w = MUL31(a[8], 121665) + (uint64_t)cc;
+	d[8] = (uint32_t)w & 0x7FFF;
+	cc = MUL15((uint32_t)(w >> 15), 19);
+
 	for (i = 0; i < 9; i ++) {
-		w = (uint64_t)d[i] + cc;
-		d[i] = w & 0x3FFFFFFF;
-		cc = w >> 30;
+		uint32_t z;
+
+		z = d[i] + cc;
+		d[i] = z & 0x3FFFFFFF;
+		cc = z >> 30;
 	}
 }

@ -623,11 +654,11 @@ api_mul(unsigned char *G, size_t Glen,
 	memset(z3, 0, sizeof z3);
 	z3[0] = 1;

-	memcpy(k, kb, kblen);
-	memset(k + kblen, 0, (sizeof k) - kblen);
-	k[0] &= 0xF8;
-	k[31] &= 0x7F;
-	k[31] |= 0x40;
+	memset(k, 0, (sizeof k) - kblen);
+	memcpy(k + (sizeof k) - kblen, kb, kblen);
+	k[31] &= 0xF8;
+	k[0] &= 0x7F;
+	k[0] |= 0x40;

 	/* obsolete
 	print_int("x1", x1);
@ -637,7 +668,7 @@ api_mul(unsigned char *G, size_t Glen,
 	for (i = 254; i >= 0; i --) {
 		uint32_t kt;

-		kt = (k[i >> 3] >> (i & 7)) & 1;
+		kt = (k[31 - (i >> 3)] >> (i & 7)) & 1;
 		swap ^= kt;
 		cswap(x2, x3, swap);
 		cswap(z2, z3, swap);
--- a/src/bearssl/src/ec/ec_c25519_m62.c
+++ b/src/bearssl/src/ec/ec_c25519_m62.c
@ -0,0 +1,605 @@
+/*
+ * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+#if BR_INT128 || BR_UMUL128
+
+#if BR_UMUL128
+#include <intrin.h>
+#endif
+
+static const unsigned char GEN[] = {
+	0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static const unsigned char ORDER[] = {
+	0x7F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
+};
+
+static const unsigned char *
+api_generator(int curve, size_t *len)
+{
+	(void)curve;
+	*len = 32;
+	return GEN;
+}
+
+static const unsigned char *
+api_order(int curve, size_t *len)
+{
+	(void)curve;
+	*len = 32;
+	return ORDER;
+}
+
+static size_t
+api_xoff(int curve, size_t *len)
+{
+	(void)curve;
+	*len = 32;
+	return 0;
+}
+
+/*
+ * A field element is encoded as five 64-bit integers, in basis 2^51.
+ * Limbs may be occasionally larger than 2^51, to save on carry
+ * propagation costs.
+ */
+
+#define MASK51   (((uint64_t)1 << 51) - (uint64_t)1)
+
+/*
+ * Swap two field elements, conditionally on a flag.
+ */
+static inline void
+f255_cswap(uint64_t *a, uint64_t *b, uint32_t ctl)
+{
+	uint64_t m, w;
+
+	m = -(uint64_t)ctl;
+	w = m & (a[0] ^ b[0]); a[0] ^= w; b[0] ^= w;
+	w = m & (a[1] ^ b[1]); a[1] ^= w; b[1] ^= w;
+	w = m & (a[2] ^ b[2]); a[2] ^= w; b[2] ^= w;
+	w = m & (a[3] ^ b[3]); a[3] ^= w; b[3] ^= w;
+	w = m & (a[4] ^ b[4]); a[4] ^= w; b[4] ^= w;
+}
+
+/*
+ * Addition with no carry propagation. Limbs double in size.
+ */
+static inline void
+f255_add(uint64_t *d, const uint64_t *a, const uint64_t *b)
+{
+	d[0] = a[0] + b[0];
+	d[1] = a[1] + b[1];
+	d[2] = a[2] + b[2];
+	d[3] = a[3] + b[3];
+	d[4] = a[4] + b[4];
+}
+
+/*
+ * Subtraction.
+ * On input, limbs must fit on 60 bits each. On output, result is
+ * partially reduced, with max value 2^255+19456; moreover, all
+ * limbs will fit on 51 bits, except the low limb, which may have
+ * value up to 2^51+19455.
+ */
+static inline void
+f255_sub(uint64_t *d, const uint64_t *a, const uint64_t *b)
+{
+	uint64_t cc, w;
+
+	/*
+	 * We compute d = (2^255-19)*1024 + a - b. Since the limbs
+	 * fit on 60 bits, the maximum value of operands are slightly
+	 * more than 2^264, but much less than 2^265-19456. This
+	 * ensures that the result is positive.
+	 */
+
+	/*
+	 * Initial carry is 19456, since we add 2^265-19456. Each
+	 * individual subtraction may yield a carry up to 513.
+	 */
+	w = a[0] - b[0] - 19456;
+	d[0] = w & MASK51;
+	cc = -(w >> 51) & 0x3FF;
+	w = a[1] - b[1] - cc;
+	d[1] = w & MASK51;
+	cc = -(w >> 51) & 0x3FF;
+	w = a[2] - b[2] - cc;
+	d[2] = w & MASK51;
+	cc = -(w >> 51) & 0x3FF;
+	w = a[3] - b[3] - cc;
+	d[3] = w & MASK51;
+	cc = -(w >> 51) & 0x3FF;
+	d[4] = ((uint64_t)1 << 61) + a[4] - b[4] - cc;
+
+	/*
+	 * Partial reduction. The intermediate result may be up to
+	 * slightly above 2^265, but less than 2^265+2^255. When we
+	 * truncate to 255 bits, the upper bits will be at most 1024.
+	 */
+	d[0] += 19 * (d[4] >> 51);
+	d[4] &= MASK51;
+}
+
+/*
+ * UMUL51(hi, lo, x, y) computes:
+ *
+ *   hi = floor((x * y) / (2^51))
+ *   lo = x * y mod 2^51
+ *
+ * Note that lo < 2^51, but "hi" may be larger, if the input operands are
+ * larger.
+ */
+#if BR_INT128
+
+#define UMUL51(hi, lo, x, y)   do { \
+		unsigned __int128 umul_tmp; \
+		umul_tmp = (unsigned __int128)(x) * (unsigned __int128)(y); \
+		(hi) = (uint64_t)(umul_tmp >> 51); \
+		(lo) = (uint64_t)umul_tmp & MASK51; \
+	} while (0)
+
+#elif BR_UMUL128
+
+#define UMUL51(hi, lo, x, y)   do { \
+		uint64_t umul_hi, umul_lo; \
+		umul_lo = _umul128((x), (y), &umul_hi); \
+		(hi) = (umul_hi << 13) | (umul_lo >> 51); \
+		(lo) = umul_lo & MASK51; \
+	} while (0)
+
+#endif
+
+/*
+ * Multiplication.
+ * On input, limbs must fit on 54 bits each.
+ * On output, limb 0 is at most 2^51 + 155647, and other limbs fit
+ * on 51 bits each.
+ */
+static inline void
+f255_mul(uint64_t *d, uint64_t *a, uint64_t *b)
+{
+	uint64_t t[10], hi, lo, w, cc;
+
+	/*
+	 * Perform cross products, accumulating values without carry
+	 * propagation.
+	 *
+	 * Since input limbs fit on 54 bits each, each individual
+	 * UMUL51 will produce a "hi" of less than 2^57. The maximum
+	 * sum will be at most 5*(2^57-1) + 4*(2^51-1) (for t[5]),
+	 * i.e. less than 324*2^51.
+	 */
+
+	UMUL51(t[1], t[0], a[0], b[0]);
+
+	UMUL51(t[2], lo, a[1], b[0]); t[1] += lo;
+	UMUL51(hi, lo, a[0], b[1]); t[1] += lo; t[2] += hi;
+
+	UMUL51(t[3], lo, a[2], b[0]); t[2] += lo;
+	UMUL51(hi, lo, a[1], b[1]); t[2] += lo; t[3] += hi;
+	UMUL51(hi, lo, a[0], b[2]); t[2] += lo; t[3] += hi;
+
+	UMUL51(t[4], lo, a[3], b[0]); t[3] += lo;
+	UMUL51(hi, lo, a[2], b[1]); t[3] += lo; t[4] += hi;
+	UMUL51(hi, lo, a[1], b[2]); t[3] += lo; t[4] += hi;
+	UMUL51(hi, lo, a[0], b[3]); t[3] += lo; t[4] += hi;
+
+	UMUL51(t[5], lo, a[4], b[0]); t[4] += lo;
+	UMUL51(hi, lo, a[3], b[1]); t[4] += lo; t[5] += hi;
+	UMUL51(hi, lo, a[2], b[2]); t[4] += lo; t[5] += hi;
+	UMUL51(hi, lo, a[1], b[3]); t[4] += lo; t[5] += hi;
+	UMUL51(hi, lo, a[0], b[4]); t[4] += lo; t[5] += hi;
+
+	UMUL51(t[6], lo, a[4], b[1]); t[5] += lo;
+	UMUL51(hi, lo, a[3], b[2]); t[5] += lo; t[6] += hi;
+	UMUL51(hi, lo, a[2], b[3]); t[5] += lo; t[6] += hi;
+	UMUL51(hi, lo, a[1], b[4]); t[5] += lo; t[6] += hi;
+
+	UMUL51(t[7], lo, a[4], b[2]); t[6] += lo;
+	UMUL51(hi, lo, a[3], b[3]); t[6] += lo; t[7] += hi;
+	UMUL51(hi, lo, a[2], b[4]); t[6] += lo; t[7] += hi;
+
+	UMUL51(t[8], lo, a[4], b[3]); t[7] += lo;
+	UMUL51(hi, lo, a[3], b[4]); t[7] += lo; t[8] += hi;
+
+	UMUL51(t[9], lo, a[4], b[4]); t[8] += lo;
+
+	/*
+	 * The upper words t[5]..t[9] are folded back into the lower
+	 * words, using the rule that 2^255 = 19 in the field.
+	 *
+	 * Since each t[i] is less than 324*2^51, the additions below
+	 * will yield less than 6480*2^51 in each limb; this fits in
+	 * 64 bits (6480*2^51 < 8192*2^51 = 2^64), hence there is
+	 * no overflow.
+	 */
+	t[0] += 19 * t[5];
+	t[1] += 19 * t[6];
+	t[2] += 19 * t[7];
+	t[3] += 19 * t[8];
+	t[4] += 19 * t[9];
+
+	/*
+	 * Propagate carries.
+	 */
+	w = t[0];
+	d[0] = w & MASK51;
+	cc = w >> 51;
+	w = t[1] + cc;
+	d[1] = w & MASK51;
+	cc = w >> 51;
+	w = t[2] + cc;
+	d[2] = w & MASK51;
+	cc = w >> 51;
+	w = t[3] + cc;
+	d[3] = w & MASK51;
+	cc = w >> 51;
+	w = t[4] + cc;
+	d[4] = w & MASK51;
+	cc = w >> 51;
+
+	/*
+	 * Since the limbs were 64-bit values, the top carry is at
+	 * most 8192 (in practice, that cannot be reached). We simply
+	 * performed a partial reduction.
+	 */
+	d[0] += 19 * cc;
+}
+
+/*
+ * Multiplication by A24 = 121665.
+ * Input must have limbs of 60 bits at most.
+ */
+static inline void
+f255_mul_a24(uint64_t *d, const uint64_t *a)
+{
+	uint64_t t[5], cc, w;
+
+	/*
+	 * 121665 = 15 * 8111. We first multiply by 15, with carry
+	 * propagation and partial reduction.
+	 */
+	w = a[0] * 15;
+	t[0] = w & MASK51;
+	cc = w >> 51;
+	w = a[1] * 15 + cc;
+	t[1] = w & MASK51;
+	cc = w >> 51;
+	w = a[2] * 15 + cc;
+	t[2] = w & MASK51;
+	cc = w >> 51;
+	w = a[3] * 15 + cc;
+	t[3] = w & MASK51;
+	cc = w >> 51;
+	w = a[4] * 15 + cc;
+	t[4] = w & MASK51;
+	t[0] += 19 * (w >> 51);
+
+	/*
+	 * Then multiplication by 8111. At that point, we known that
+	 * t[0] is less than 2^51 + 19*8192, and other limbs are less
+	 * than 2^51; thus, there will be no overflow.
+	 */
+	w = t[0] * 8111;
+	d[0] = w & MASK51;
+	cc = w >> 51;
+	w = t[1] * 8111 + cc;
+	d[1] = w & MASK51;
+	cc = w >> 51;
+	w = t[2] * 8111 + cc;
+	d[2] = w & MASK51;
+	cc = w >> 51;
+	w = t[3] * 8111 + cc;
+	d[3] = w & MASK51;
+	cc = w >> 51;
+	w = t[4] * 8111 + cc;
+	d[4] = w & MASK51;
+	d[0] += 19 * (w >> 51);
+}
+
+/*
+ * Finalize reduction.
+ * On input, limbs must fit on 51 bits, except possibly the low limb,
+ * which may be slightly above 2^51.
+ */
+static inline void
+f255_final_reduce(uint64_t *a)
+{
+	uint64_t t[5], cc, w;
+
+	/*
+	 * We add 19. If the result (in t[]) is below 2^255, then a[]
+	 * is already less than 2^255-19, thus already reduced.
+	 * Otherwise, we subtract 2^255 from t[], in which case we
+	 * have t = a - (2^255-19), and that's our result.
+	 */
+	w = a[0] + 19;
+	t[0] = w & MASK51;
+	cc = w >> 51;
+	w = a[1] + cc;
+	t[1] = w & MASK51;
+	cc = w >> 51;
+	w = a[2] + cc;
+	t[2] = w & MASK51;
+	cc = w >> 51;
+	w = a[3] + cc;
+	t[3] = w & MASK51;
+	cc = w >> 51;
+	w = a[4] + cc;
+	t[4] = w & MASK51;
+	cc = w >> 51;
+
+	/*
+	 * The bit 255 of t is in cc. If that bit is 0, when a[] must
+	 * be unchanged; otherwise, it must be replaced with t[].
+	 */
+	cc = -cc;
+	a[0] ^= cc & (a[0] ^ t[0]);
+	a[1] ^= cc & (a[1] ^ t[1]);
+	a[2] ^= cc & (a[2] ^ t[2]);
+	a[3] ^= cc & (a[3] ^ t[3]);
+	a[4] ^= cc & (a[4] ^ t[4]);
+}
+
+static uint32_t
+api_mul(unsigned char *G, size_t Glen,
+	const unsigned char *kb, size_t kblen, int curve)
+{
+	unsigned char k[32];
+	uint64_t x1[5], x2[5], z2[5], x3[5], z3[5];
+	uint32_t swap;
+	int i;
+
+	(void)curve;
+
+	/*
+	 * Points are encoded over exactly 32 bytes. Multipliers must fit
+	 * in 32 bytes as well.
+	 */
+	if (Glen != 32 || kblen > 32) {
+		return 0;
+	}
+
+	/*
+	 * RFC 7748 mandates that the high bit of the last point byte must
+	 * be ignored/cleared; the "& MASK51" in the initialization for
+	 * x1[4] clears that bit.
+	 */
+	x1[0] = br_dec64le(&G[0]) & MASK51;
+	x1[1] = (br_dec64le(&G[6]) >> 3) & MASK51;
+	x1[2] = (br_dec64le(&G[12]) >> 6) & MASK51;
+	x1[3] = (br_dec64le(&G[19]) >> 1) & MASK51;
+	x1[4] = (br_dec64le(&G[24]) >> 12) & MASK51;
+
+	/*
+	 * We can use memset() to clear values, because exact-width types
+	 * like uint64_t are guaranteed to have no padding bits or
+	 * trap representations.
+	 */
+	memset(x2, 0, sizeof x2);
+	x2[0] = 1;
+	memset(z2, 0, sizeof z2);
+	memcpy(x3, x1, sizeof x1);
+	memcpy(z3, x2, sizeof x2);
+
+	/*
+	 * The multiplier is provided in big-endian notation, and
+	 * possibly shorter than 32 bytes.
+	 */
+	memset(k, 0, (sizeof k) - kblen);
+	memcpy(k + (sizeof k) - kblen, kb, kblen);
+	k[31] &= 0xF8;
+	k[0] &= 0x7F;
+	k[0] |= 0x40;
+
+	swap = 0;
+
+	for (i = 254; i >= 0; i --) {
+		uint64_t a[5], aa[5], b[5], bb[5], e[5];
+		uint64_t c[5], d[5], da[5], cb[5];
+		uint32_t kt;
+
+		kt = (k[31 - (i >> 3)] >> (i & 7)) & 1;
+		swap ^= kt;
+		f255_cswap(x2, x3, swap);
+		f255_cswap(z2, z3, swap);
+		swap = kt;
+
+		/*
+		 * At that point, limbs of x_2 and z_2 are assumed to fit
+		 * on at most 52 bits each.
+		 *
+		 * Each f255_add() adds one bit to the maximum range of
+		 * the values, but f255_sub() and f255_mul() bring back
+		 * the limbs into 52 bits. All f255_add() outputs are
+		 * used only as inputs for f255_mul(), which ensures
+		 * that limbs remain in the proper range.
+		 */
+
+		/* A = x_2 + z_2   -- limbs fit on 53 bits each */
+		f255_add(a, x2, z2);
+
+		/* AA = A^2 */
+		f255_mul(aa, a, a);
+
+		/* B = x_2 - z_2 */
+		f255_sub(b, x2, z2);
+
+		/* BB = B^2 */
+		f255_mul(bb, b, b);
+
+		/* E = AA - BB */
+		f255_sub(e, aa, bb);
+
+		/* C = x_3 + z_3   -- limbs fit on 53 bits each */
+		f255_add(c, x3, z3);
+
+		/* D = x_3 - z_3 */
+		f255_sub(d, x3, z3);
+
+		/* DA = D * A */
+		f255_mul(da, d, a);
+
+		/* CB = C * B */
+		f255_mul(cb, c, b);
+
+		/* x_3 = (DA + CB)^2 */
+		f255_add(x3, da, cb);
+		f255_mul(x3, x3, x3);
+
+		/* z_3 = x_1 * (DA - CB)^2 */
+		f255_sub(z3, da, cb);
+		f255_mul(z3, z3, z3);
+		f255_mul(z3, x1, z3);
+
+		/* x_2 = AA * BB */
+		f255_mul(x2, aa, bb);
+
+		/* z_2 = E * (AA + a24 * E) */
+		f255_mul_a24(z2, e);
+		f255_add(z2, aa, z2);
+		f255_mul(z2, e, z2);
+	}
+
+	f255_cswap(x2, x3, swap);
+	f255_cswap(z2, z3, swap);
+
+	/*
+	 * Compute 1/z2 = z2^(p-2). Since p = 2^255-19, we can mutualize
+	 * most non-squarings. We use x1 and x3, now useless, as temporaries.
+	 */
+	memcpy(x1, z2, sizeof z2);
+	for (i = 0; i < 15; i ++) {
+		f255_mul(x1, x1, x1);
+		f255_mul(x1, x1, z2);
+	}
+	memcpy(x3, x1, sizeof x1);
+	for (i = 0; i < 14; i ++) {
+		int j;
+
+		for (j = 0; j < 16; j ++) {
+			f255_mul(x3, x3, x3);
+		}
+		f255_mul(x3, x3, x1);
+	}
+	for (i = 14; i >= 0; i --) {
+		f255_mul(x3, x3, x3);
+		if ((0xFFEB >> i) & 1) {
+			f255_mul(x3, z2, x3);
+		}
+	}
+
+	/*
+	 * Compute x2/z2. We have 1/z2 in x3.
+	 */
+	f255_mul(x2, x2, x3);
+	f255_final_reduce(x2);
+
+	/*
+	 * Encode the final x2 value in little-endian. We first assemble
+	 * the limbs into 64-bit values.
+	 */
+	x2[0] |= x2[1] << 51;
+	x2[1] = (x2[1] >> 13) | (x2[2] << 38);
+	x2[2] = (x2[2] >> 26) | (x2[3] << 25);
+	x2[3] = (x2[3] >> 39) | (x2[4] << 12);
+	br_enc64le(G, x2[0]);
+	br_enc64le(G + 8, x2[1]);
+	br_enc64le(G + 16, x2[2]);
+	br_enc64le(G + 24, x2[3]);
+	return 1;
+}
+
+static size_t
+api_mulgen(unsigned char *R,
+	const unsigned char *x, size_t xlen, int curve)
+{
+	const unsigned char *G;
+	size_t Glen;
+
+	G = api_generator(curve, &Glen);
+	memcpy(R, G, Glen);
+	api_mul(R, Glen, x, xlen, curve);
+	return Glen;
+}
+
+static uint32_t
+api_muladd(unsigned char *A, const unsigned char *B, size_t len,
+	const unsigned char *x, size_t xlen,
+	const unsigned char *y, size_t ylen, int curve)
+{
+	/*
+	 * We don't implement this method, since it is used for ECDSA
+	 * only, and there is no ECDSA over Curve25519 (which instead
+	 * uses EdDSA).
+	 */
+	(void)A;
+	(void)B;
+	(void)len;
+	(void)x;
+	(void)xlen;
+	(void)y;
+	(void)ylen;
+	(void)curve;
+	return 0;
+}
+
+/* see bearssl_ec.h */
+const br_ec_impl br_ec_c25519_m62 = {
+	(uint32_t)0x20000000,
+	&api_generator,
+	&api_order,
+	&api_xoff,
+	&api_mul,
+	&api_mulgen,
+	&api_muladd
+};
+
+/* see bearssl_ec.h */
+const br_ec_impl *
+br_ec_c25519_m62_get(void)
+{
+	return &br_ec_c25519_m62;
+}
+
+#else
+
+/* see bearssl_ec.h */
+const br_ec_impl *
+br_ec_c25519_m62_get(void)
+{
+	return 0;
+}
+
+#endif
--- a/src/bearssl/src/ec/ec_c25519_m64.c
+++ b/src/bearssl/src/ec/ec_c25519_m64.c
@ -0,0 +1,831 @@
+/*
+ * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+#if BR_INT128 || BR_UMUL128
+
+#if BR_UMUL128
+#include <intrin.h>
+#endif
+
+/*
+ * Encoded generator returned by api_generator(): 32 bytes, the first
+ * being 0x09 and all others zero. api_mul() decodes points in
+ * little-endian order, so this is the value 9.
+ */
+static const unsigned char GEN[] = {
+	0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+/*
+ * Value returned by api_order(): 32 bytes, 0x7F followed by 31 bytes of
+ * 0xFF.
+ * NOTE(review): read as a big-endian integer this is 2^255-1, which is
+ * not the order of the curve subgroup; presumably callers only use it
+ * as an upper bound on scalar values. Confirm against the users of
+ * the "order" entry.
+ */
+static const unsigned char ORDER[] = {
+	0x7F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
+};
+
+/*
+ * Return a pointer to the encoded generator (GEN) and store its length,
+ * 32, in *len. The curve identifier is ignored.
+ */
+static const unsigned char *
+api_generator(int curve, size_t *len)
+{
+	(void)curve;
+	*len = 32;
+	return GEN;
+}
+
+/*
+ * Return a pointer to the ORDER array and store its length, 32, in
+ * *len. The curve identifier is ignored.
+ */
+static const unsigned char *
+api_order(int curve, size_t *len)
+{
+	(void)curve;
+	*len = 32;
+	return ORDER;
+}
+
+/*
+ * Store 32 in *len and return 0. The curve identifier is ignored.
+ * NOTE(review): presumably the returned value is the offset of the X
+ * coordinate within an encoded point and *len its length (the whole
+ * 32-byte encoding here); confirm against bearssl_ec.h.
+ */
+static size_t
+api_xoff(int curve, size_t *len)
+{
+	(void)curve;
+	*len = 32;
+	return 0;
+}
+
+/*
+ * A field element is encoded as four 64-bit integers, in basis 2^64,
+ * least significant limb first (the value is a[0] + a[1]*2^64 +
+ * a[2]*2^128 + a[3]*2^192). Operations return partially reduced values,
+ * which may range up to 2^255+37.
+ */
+
+#define MASK63   (((uint64_t)1 << 63) - (uint64_t)1)
+
+/*
+ * Swap two field elements, conditionally on a flag.
+ *
+ * a and b each point to four 64-bit limbs. ctl must be 0 or 1: it is
+ * expanded into a mask m that is all-zeros (ctl = 0, nothing changes)
+ * or all-ones (ctl = 1, the limbs are exchanged). The swap is done with
+ * XOR masking only; the source contains no branch on ctl.
+ */
+static inline void
+f255_cswap(uint64_t *a, uint64_t *b, uint32_t ctl)
+{
+	uint64_t m, w;
+
+	m = -(uint64_t)ctl;
+	w = m & (a[0] ^ b[0]); a[0] ^= w; b[0] ^= w;
+	w = m & (a[1] ^ b[1]); a[1] ^= w; b[1] ^= w;
+	w = m & (a[2] ^ b[2]); a[2] ^= w; b[2] ^= w;
+	w = m & (a[3] ^ b[3]); a[3] ^= w; b[3] ^= w;
+}
+
+/*
+ * Addition in the field.
+ *
+ * Computes d = a + b, where a, b and d are field elements held as four
+ * 64-bit limbs. Both operands are expected to be partially reduced
+ * (at most 2^255+37); the result is partially reduced as well.
+ * All limbs of a[] and b[] are read before d[] is written, so d may
+ * alias a and/or b.
+ *
+ * Two equivalent code paths are provided: one using the unsigned
+ * __int128 type (BR_INT128) and one using the _addcarry_u64() intrinsic
+ * (BR_UMUL128).
+ */
+static inline void
+f255_add(uint64_t *d, const uint64_t *a, const uint64_t *b)
+{
+#if BR_INT128
+
+	uint64_t t0, t1, t2, t3, cc;
+	unsigned __int128 z;
+
+	/* Plain 256-bit addition; bits 255 and up end in cc. */
+	z = (unsigned __int128)a[0] + (unsigned __int128)b[0];
+	t0 = (uint64_t)z;
+	z = (unsigned __int128)a[1] + (unsigned __int128)b[1] + (z >> 64);
+	t1 = (uint64_t)z;
+	z = (unsigned __int128)a[2] + (unsigned __int128)b[2] + (z >> 64);
+	t2 = (uint64_t)z;
+	z = (unsigned __int128)a[3] + (unsigned __int128)b[3] + (z >> 64);
+	t3 = (uint64_t)z & MASK63;
+	cc = (uint64_t)(z >> 63);
+
+	/*
+	 * Since operands are at most 2^255+37, the sum is at most
+	 * 2^256+74; thus, the carry cc is equal to 0, 1 or 2.
+	 *
+	 * We use: 2^255 = 19 mod p.
+	 * Since we add 0, 19 or 38 to a value that fits on 255 bits,
+	 * the result is at most 2^255+37.
+	 */
+	z = (unsigned __int128)t0 + (unsigned __int128)(19 * cc);
+	d[0] = (uint64_t)z;
+	z = (unsigned __int128)t1 + (z >> 64);
+	d[1] = (uint64_t)z;
+	z = (unsigned __int128)t2 + (z >> 64);
+	d[2] = (uint64_t)z;
+	d[3] = t3 + (uint64_t)(z >> 64);
+
+#elif BR_UMUL128
+
+	uint64_t t0, t1, t2, t3, cc;
+	unsigned char k;
+
+	/* Plain 256-bit addition; k is the carry out of bit 255+1. */
+	k = _addcarry_u64(0, a[0], b[0], &t0);
+	k = _addcarry_u64(k, a[1], b[1], &t1);
+	k = _addcarry_u64(k, a[2], b[2], &t2);
+	k = _addcarry_u64(k, a[3], b[3], &t3);
+	cc = (k << 1) + (t3 >> 63);
+	t3 &= MASK63;
+
+	/*
+	 * Since operands are at most 2^255+37, the sum is at most
+	 * 2^256+74; thus, the carry cc is equal to 0, 1 or 2.
+	 *
+	 * We use: 2^255 = 19 mod p.
+	 * Since we add 0, 19 or 38 to a value that fits on 255 bits,
+	 * the result is at most 2^255+37.
+	 */
+	k = _addcarry_u64(0, t0, 19 * cc, &d[0]);
+	k = _addcarry_u64(k, t1, 0, &d[1]);
+	k = _addcarry_u64(k, t2, 0, &d[2]);
+	(void)_addcarry_u64(k, t3, 0, &d[3]);
+
+#endif
+}
+
+/*
+ * Subtraction.
+ *
+ * Computes d = a - b in the field, where a, b and d are field elements
+ * held as four 64-bit limbs. Both operands are expected to be partially
+ * reduced (at most 2^255+37); the result is partially reduced as well.
+ * All limbs of a[] and b[] are read before d[] is written, so d may
+ * alias a and/or b.
+ *
+ * Two equivalent code paths are provided: one using the unsigned
+ * __int128 type (BR_INT128) and one using the _subborrow_u64() and
+ * _addcarry_u64() intrinsics (BR_UMUL128).
+ */
+static inline void
+f255_sub(uint64_t *d, const uint64_t *a, const uint64_t *b)
+{
+#if BR_INT128
+
+	/*
+	 * We compute t = 2^256 - 38 + a - b, which is necessarily
+	 * positive but lower than 2^256 + 2^255, since a <= 2^255 + 37
+	 * and b <= 2^255 + 37. We then subtract 0, p or 2*p, depending
+	 * on the two upper bits of t (bits 255 and 256).
+	 */
+
+	uint64_t t0, t1, t2, t3, t4, cc;
+	unsigned __int128 z;
+
+	/*
+	 * After each limb, the upper half of z is all-zeros or all-ones;
+	 * negating it yields the borrow (0 or 1) for the next limb.
+	 */
+	z = (unsigned __int128)a[0] - (unsigned __int128)b[0] - 38;
+	t0 = (uint64_t)z;
+	cc = -(uint64_t)(z >> 64);
+	z = (unsigned __int128)a[1] - (unsigned __int128)b[1]
+		- (unsigned __int128)cc;
+	t1 = (uint64_t)z;
+	cc = -(uint64_t)(z >> 64);
+	z = (unsigned __int128)a[2] - (unsigned __int128)b[2]
+		- (unsigned __int128)cc;
+	t2 = (uint64_t)z;
+	cc = -(uint64_t)(z >> 64);
+	z = (unsigned __int128)a[3] - (unsigned __int128)b[3]
+		- (unsigned __int128)cc;
+	t3 = (uint64_t)z;
+	t4 = 1 + (uint64_t)(z >> 64);	/* bit 256 of t: 1 minus final borrow */
+
+	/*
+	 * We have a 257-bit result. The two top bits can be 00, 01 or 10,
+	 * but not 11 (value t <= 2^256 - 38 + 2^255 + 37 = 2^256 + 2^255 - 1).
+	 * Therefore, we can truncate to 255 bits, and add 0, 19 or 38.
+	 * This guarantees that the result is at most 2^255+37.
+	 */
+	cc = (38 & -t4) + (19 & -(t3 >> 63));
+	t3 &= MASK63;
+	z = (unsigned __int128)t0 + (unsigned __int128)cc;
+	d[0] = (uint64_t)z;
+	z = (unsigned __int128)t1 + (z >> 64);
+	d[1] = (uint64_t)z;
+	z = (unsigned __int128)t2 + (z >> 64);
+	d[2] = (uint64_t)z;
+	d[3] = t3 + (uint64_t)(z >> 64);
+
+#elif BR_UMUL128
+
+	/*
+	 * We compute t = 2^256 - 38 + a - b, which is necessarily
+	 * positive but lower than 2^256 + 2^255, since a <= 2^255 + 37
+	 * and b <= 2^255 + 37. We then subtract 0, p or 2*p, depending
+	 * on the two upper bits of t (bits 255 and 256).
+	 */
+
+	uint64_t t0, t1, t2, t3, t4;
+	unsigned char k;
+
+	/* First pass: 2^256 + a - b, with t4 receiving bit 256. */
+	k = _subborrow_u64(0, a[0], b[0], &t0);
+	k = _subborrow_u64(k, a[1], b[1], &t1);
+	k = _subborrow_u64(k, a[2], b[2], &t2);
+	k = _subborrow_u64(k, a[3], b[3], &t3);
+	(void)_subborrow_u64(k, 1, 0, &t4);
+
+	/* Second pass: subtract 38 from the 257-bit value. */
+	k = _subborrow_u64(0, t0, 38, &t0);
+	k = _subborrow_u64(k, t1, 0, &t1);
+	k = _subborrow_u64(k, t2, 0, &t2);
+	k = _subborrow_u64(k, t3, 0, &t3);
+	(void)_subborrow_u64(k, t4, 0, &t4);
+
+	/*
+	 * We have a 257-bit result. The two top bits can be 00, 01 or 10,
+	 * but not 11 (value t <= 2^256 - 38 + 2^255 + 37 = 2^256 + 2^255 - 1).
+	 * Therefore, we can truncate to 255 bits, and add 0, 19 or 38.
+	 * This guarantees that the result is at most 2^255+37.
+	 */
+	t4 = (38 & -t4) + (19 & -(t3 >> 63));
+	t3 &= MASK63;
+	k = _addcarry_u64(0, t0, t4, &d[0]);
+	k = _addcarry_u64(k, t1, 0, &d[1]);
+	k = _addcarry_u64(k, t2, 0, &d[2]);
+	(void)_addcarry_u64(k, t3, 0, &d[3]);
+
+#endif
+}
+
+/*
+ * Multiplication.
+ *
+ * Computes d = a * b in the field, where a, b and d are field elements
+ * held as four 64-bit limbs. Both operands are expected to be partially
+ * reduced (at most 2^255+37); the result is partially reduced as well.
+ *
+ * The 512-bit integer product is first computed by schoolbook
+ * multiplication into t0..t7 (one row per limb of a[]), then folded
+ * back to 255 bits using 2^255 = 19 mod p.
+ *
+ * All limbs of a[] and b[] are read before d[] is written, so d may
+ * alias a and/or b (api_mul() relies on this, e.g. for squarings).
+ *
+ * Two equivalent code paths are provided: one using the unsigned
+ * __int128 type (BR_INT128) and one using the _umul128() and
+ * _addcarry_u64() intrinsics (BR_UMUL128).
+ */
+static inline void
+f255_mul(uint64_t *d, uint64_t *a, uint64_t *b)
+{
+#if BR_INT128
+
+	unsigned __int128 z;
+	uint64_t t0, t1, t2, t3, t4, t5, t6, t7, th;
+
+	/*
+	 * Compute the product a*b over plain integers.
+	 */
+	/* Row 0: a[0] * b[0..3] into t0..t4. */
+	z = (unsigned __int128)a[0] * (unsigned __int128)b[0];
+	t0 = (uint64_t)z;
+	z = (unsigned __int128)a[0] * (unsigned __int128)b[1] + (z >> 64);
+	t1 = (uint64_t)z;
+	z = (unsigned __int128)a[0] * (unsigned __int128)b[2] + (z >> 64);
+	t2 = (uint64_t)z;
+	z = (unsigned __int128)a[0] * (unsigned __int128)b[3] + (z >> 64);
+	t3 = (uint64_t)z;
+	t4 = (uint64_t)(z >> 64);
+
+	/* Row 1: a[1] * b[0..3] accumulated into t1..t5. */
+	z = (unsigned __int128)a[1] * (unsigned __int128)b[0]
+		+ (unsigned __int128)t1;
+	t1 = (uint64_t)z;
+	z = (unsigned __int128)a[1] * (unsigned __int128)b[1]
+		+ (unsigned __int128)t2 + (z >> 64);
+	t2 = (uint64_t)z;
+	z = (unsigned __int128)a[1] * (unsigned __int128)b[2]
+		+ (unsigned __int128)t3 + (z >> 64);
+	t3 = (uint64_t)z;
+	z = (unsigned __int128)a[1] * (unsigned __int128)b[3]
+		+ (unsigned __int128)t4 + (z >> 64);
+	t4 = (uint64_t)z;
+	t5 = (uint64_t)(z >> 64);
+
+	/* Row 2: a[2] * b[0..3] accumulated into t2..t6. */
+	z = (unsigned __int128)a[2] * (unsigned __int128)b[0]
+		+ (unsigned __int128)t2;
+	t2 = (uint64_t)z;
+	z = (unsigned __int128)a[2] * (unsigned __int128)b[1]
+		+ (unsigned __int128)t3 + (z >> 64);
+	t3 = (uint64_t)z;
+	z = (unsigned __int128)a[2] * (unsigned __int128)b[2]
+		+ (unsigned __int128)t4 + (z >> 64);
+	t4 = (uint64_t)z;
+	z = (unsigned __int128)a[2] * (unsigned __int128)b[3]
+		+ (unsigned __int128)t5 + (z >> 64);
+	t5 = (uint64_t)z;
+	t6 = (uint64_t)(z >> 64);
+
+	/* Row 3: a[3] * b[0..3] accumulated into t3..t7. */
+	z = (unsigned __int128)a[3] * (unsigned __int128)b[0]
+		+ (unsigned __int128)t3;
+	t3 = (uint64_t)z;
+	z = (unsigned __int128)a[3] * (unsigned __int128)b[1]
+		+ (unsigned __int128)t4 + (z >> 64);
+	t4 = (uint64_t)z;
+	z = (unsigned __int128)a[3] * (unsigned __int128)b[2]
+		+ (unsigned __int128)t5 + (z >> 64);
+	t5 = (uint64_t)z;
+	z = (unsigned __int128)a[3] * (unsigned __int128)b[3]
+		+ (unsigned __int128)t6 + (z >> 64);
+	t6 = (uint64_t)z;
+	t7 = (uint64_t)(z >> 64);
+
+	/*
+	 * Modulo p, we have:
+	 *
+	 *   2^255 = 19
+	 *   2^510 = 19*19 = 361
+	 *
+	 * We split the intermediate t into three parts, in basis
+	 * 2^255. The low one will be in t0..t3; the middle one in t4..t7.
+	 * The upper one can only be a single bit (th), since the
+	 * multiplication operands are at most 2^255+37 each.
+	 */
+	/* The shifts by one bit realign t4..t7 on the 2^255 boundary. */
+	th = t7 >> 62;
+	t7 = ((t7 << 1) | (t6 >> 63)) & MASK63;
+	t6 = (t6 << 1) | (t5 >> 63);
+	t5 = (t5 << 1) | (t4 >> 63);
+	t4 = (t4 << 1) | (t3 >> 63);
+	t3 &= MASK63;
+
+	/*
+	 * Multiply the middle part (t4..t7) by 19. We truncate it to
+	 * 255 bits; the extra bits will go along with th.
+	 */
+	z = (unsigned __int128)t4 * 19;
+	t4 = (uint64_t)z;
+	z = (unsigned __int128)t5 * 19 + (z >> 64);
+	t5 = (uint64_t)z;
+	z = (unsigned __int128)t6 * 19 + (z >> 64);
+	t6 = (uint64_t)z;
+	z = (unsigned __int128)t7 * 19 + (z >> 64);
+	t7 = (uint64_t)z & MASK63;
+
+	th = (361 & -th) + (19 * (uint64_t)(z >> 63));
+
+	/*
+	 * Add elements together.
+	 * At this point:
+	 *   t0..t3 fits on 255 bits.
+	 *   t4..t7 fits on 255 bits.
+	 *   th <= 361 + 342 = 703.
+	 */
+	z = (unsigned __int128)t0 + (unsigned __int128)t4
+		+ (unsigned __int128)th;
+	t0 = (uint64_t)z;
+	z = (unsigned __int128)t1 + (unsigned __int128)t5 + (z >> 64);
+	t1 = (uint64_t)z;
+	z = (unsigned __int128)t2 + (unsigned __int128)t6 + (z >> 64);
+	t2 = (uint64_t)z;
+	z = (unsigned __int128)t3 + (unsigned __int128)t7 + (z >> 64);
+	t3 = (uint64_t)z & MASK63;
+	th = (uint64_t)(z >> 63);
+
+	/*
+	 * Since the sum is at most 2^256 + 703, the two upper bits, in th,
+	 * can only have value 0, 1 or 2. We just add th*19, which
+	 * guarantees a result of at most 2^255+37.
+	 */
+	z = (unsigned __int128)t0 + (19 * th);
+	d[0] = (uint64_t)z;
+	z = (unsigned __int128)t1 + (z >> 64);
+	d[1] = (uint64_t)z;
+	z = (unsigned __int128)t2 + (z >> 64);
+	d[2] = (uint64_t)z;
+	d[3] = t3 + (uint64_t)(z >> 64);
+
+#elif BR_UMUL128
+
+	uint64_t t0, t1, t2, t3, t4, t5, t6, t7, th;
+	uint64_t h0, h1, h2, h3;
+	unsigned char k;
+
+	/*
+	 * Compute the product a*b over plain integers.
+	 */
+	/*
+	 * For each row, the low halves of the four 64x64 products are
+	 * added first, then the high halves (h0..h3) one limb higher.
+	 */
+	t0 = _umul128(a[0], b[0], &h0);
+	t1 = _umul128(a[0], b[1], &h1);
+	k = _addcarry_u64(0, t1, h0, &t1);
+	t2 = _umul128(a[0], b[2], &h2);
+	k = _addcarry_u64(k, t2, h1, &t2);
+	t3 = _umul128(a[0], b[3], &h3);
+	k = _addcarry_u64(k, t3, h2, &t3);
+	(void)_addcarry_u64(k, h3, 0, &t4);
+
+	k = _addcarry_u64(0, _umul128(a[1], b[0], &h0), t1, &t1);
+	k = _addcarry_u64(k, _umul128(a[1], b[1], &h1), t2, &t2);
+	k = _addcarry_u64(k, _umul128(a[1], b[2], &h2), t3, &t3);
+	k = _addcarry_u64(k, _umul128(a[1], b[3], &h3), t4, &t4);
+	t5 = k;
+	k = _addcarry_u64(0, t2, h0, &t2);
+	k = _addcarry_u64(k, t3, h1, &t3);
+	k = _addcarry_u64(k, t4, h2, &t4);
+	(void)_addcarry_u64(k, t5, h3, &t5);
+
+	k = _addcarry_u64(0, _umul128(a[2], b[0], &h0), t2, &t2);
+	k = _addcarry_u64(k, _umul128(a[2], b[1], &h1), t3, &t3);
+	k = _addcarry_u64(k, _umul128(a[2], b[2], &h2), t4, &t4);
+	k = _addcarry_u64(k, _umul128(a[2], b[3], &h3), t5, &t5);
+	t6 = k;
+	k = _addcarry_u64(0, t3, h0, &t3);
+	k = _addcarry_u64(k, t4, h1, &t4);
+	k = _addcarry_u64(k, t5, h2, &t5);
+	(void)_addcarry_u64(k, t6, h3, &t6);
+
+	k = _addcarry_u64(0, _umul128(a[3], b[0], &h0), t3, &t3);
+	k = _addcarry_u64(k, _umul128(a[3], b[1], &h1), t4, &t4);
+	k = _addcarry_u64(k, _umul128(a[3], b[2], &h2), t5, &t5);
+	k = _addcarry_u64(k, _umul128(a[3], b[3], &h3), t6, &t6);
+	t7 = k;
+	k = _addcarry_u64(0, t4, h0, &t4);
+	k = _addcarry_u64(k, t5, h1, &t5);
+	k = _addcarry_u64(k, t6, h2, &t6);
+	(void)_addcarry_u64(k, t7, h3, &t7);
+
+	/*
+	 * Modulo p, we have:
+	 *
+	 *   2^255 = 19
+	 *   2^510 = 19*19 = 361
+	 *
+	 * We split the intermediate t into three parts, in basis
+	 * 2^255. The low one will be in t0..t3; the middle one in t4..t7.
+	 * The upper one can only be a single bit (th), since the
+	 * multiplication operands are at most 2^255+37 each.
+	 */
+	/* The shifts by one bit realign t4..t7 on the 2^255 boundary. */
+	th = t7 >> 62;
+	t7 = ((t7 << 1) | (t6 >> 63)) & MASK63;
+	t6 = (t6 << 1) | (t5 >> 63);
+	t5 = (t5 << 1) | (t4 >> 63);
+	t4 = (t4 << 1) | (t3 >> 63);
+	t3 &= MASK63;
+
+	/*
+	 * Multiply the middle part (t4..t7) by 19. We truncate it to
+	 * 255 bits; the extra bits will go along with th.
+	 */
+	t4 = _umul128(t4, 19, &h0);
+	t5 = _umul128(t5, 19, &h1);
+	t6 = _umul128(t6, 19, &h2);
+	t7 = _umul128(t7, 19, &h3);
+	k = _addcarry_u64(0, t5, h0, &t5);
+	k = _addcarry_u64(k, t6, h1, &t6);
+	k = _addcarry_u64(k, t7, h2, &t7);
+	(void)_addcarry_u64(k, h3, 0, &h3);
+	th = (361 & -th) + (19 * ((h3 << 1) + (t7 >> 63)));
+	t7 &= MASK63;
+
+	/*
+	 * Add elements together.
+	 * At this point:
+	 *   t0..t3 fits on 255 bits.
+	 *   t4..t7 fits on 255 bits.
+	 *   th <= 361 + 342 = 703.
+	 */
+	k = _addcarry_u64(0, t0, t4, &t0);
+	k = _addcarry_u64(k, t1, t5, &t1);
+	k = _addcarry_u64(k, t2, t6, &t2);
+	k = _addcarry_u64(k, t3, t7, &t3);
+	t4 = k;
+	k = _addcarry_u64(0, t0, th, &t0);
+	k = _addcarry_u64(k, t1, 0, &t1);
+	k = _addcarry_u64(k, t2, 0, &t2);
+	k = _addcarry_u64(k, t3, 0, &t3);
+	(void)_addcarry_u64(k, t4, 0, &t4);
+
+	th = (t4 << 1) + (t3 >> 63);
+	t3 &= MASK63;
+
+	/*
+	 * Since the sum is at most 2^256 + 703, the two upper bits, in th,
+	 * can only have value 0, 1 or 2. We just add th*19, which
+	 * guarantees a result of at most 2^255+37.
+	 */
+	k = _addcarry_u64(0, t0, 19 * th, &d[0]);
+	k = _addcarry_u64(k, t1, 0, &d[1]);
+	k = _addcarry_u64(k, t2, 0, &d[2]);
+	(void)_addcarry_u64(k, t3, 0, &d[3]);
+
+#endif
+}
+
+/*
+ * Multiplication by A24 = 121665.
+ *
+ * Computes d = 121665 * a in the field, where a and d are field
+ * elements held as four 64-bit limbs. The product is computed over
+ * plain integers, then folded twice: first the bits at position 255
+ * and above are multiplied by 19 and added back (2^255 = 19 mod p),
+ * then bit 255 of that intermediate sum is folded the same way.
+ * All limbs of a[] are read before d[] is written.
+ *
+ * Two equivalent code paths are provided: one using the unsigned
+ * __int128 type (BR_INT128) and one using the _umul128() and
+ * _addcarry_u64() intrinsics (BR_UMUL128).
+ */
+static inline void
+f255_mul_a24(uint64_t *d, const uint64_t *a)
+{
+#if BR_INT128
+
+	uint64_t t0, t1, t2, t3;
+	unsigned __int128 z;
+
+	/* Integer product a * 121665; the low 255 bits go to t0..t3. */
+	z = (unsigned __int128)a[0] * 121665;
+	t0 = (uint64_t)z;
+	z = (unsigned __int128)a[1] * 121665 + (z >> 64);
+	t1 = (uint64_t)z;
+	z = (unsigned __int128)a[2] * 121665 + (z >> 64);
+	t2 = (uint64_t)z;
+	z = (unsigned __int128)a[3] * 121665 + (z >> 64);
+	t3 = (uint64_t)z & MASK63;
+
+	/* First fold: add 19 times the bits above position 254. */
+	z = (unsigned __int128)t0 + (19 * (uint64_t)(z >> 63));
+	t0 = (uint64_t)z;
+	z = (unsigned __int128)t1 + (z >> 64);
+	t1 = (uint64_t)z;
+	z = (unsigned __int128)t2 + (z >> 64);
+	t2 = (uint64_t)z;
+	t3 = t3 + (uint64_t)(z >> 64);
+
+	/* Second fold: if bit 255 got set, clear it and add 19. */
+	z = (unsigned __int128)t0 + (19 & -(t3 >> 63));
+	d[0] = (uint64_t)z;
+	z = (unsigned __int128)t1 + (z >> 64);
+	d[1] = (uint64_t)z;
+	z = (unsigned __int128)t2 + (z >> 64);
+	d[2] = (uint64_t)z;
+	d[3] = (t3 & MASK63) + (uint64_t)(z >> 64);
+
+#elif BR_UMUL128
+
+	uint64_t t0, t1, t2, t3, t4, h0, h1, h2, h3;
+	unsigned char k;
+
+	/* Integer product a * 121665 into t0..t4. */
+	t0 = _umul128(a[0], 121665, &h0);
+	t1 = _umul128(a[1], 121665, &h1);
+	k = _addcarry_u64(0, t1, h0, &t1);
+	t2 = _umul128(a[2], 121665, &h2);
+	k = _addcarry_u64(k, t2, h1, &t2);
+	t3 = _umul128(a[3], 121665, &h3);
+	k = _addcarry_u64(k, t3, h2, &t3);
+	(void)_addcarry_u64(k, h3, 0, &t4);
+
+	/* First fold: add 19 times the bits above position 254. */
+	t4 = (t4 << 1) + (t3 >> 63);
+	t3 &= MASK63;
+	k = _addcarry_u64(0, t0, 19 * t4, &t0);
+	k = _addcarry_u64(k, t1, 0, &t1);
+	k = _addcarry_u64(k, t2, 0, &t2);
+	(void)_addcarry_u64(k, t3, 0, &t3);
+
+	/* Second fold: if bit 255 got set, clear it and add 19. */
+	t4 = 19 & -(t3 >> 63);
+	t3 &= MASK63;
+	k = _addcarry_u64(0, t0, t4, &d[0]);
+	k = _addcarry_u64(k, t1, 0, &d[1]);
+	k = _addcarry_u64(k, t2, 0, &d[2]);
+	(void)_addcarry_u64(k, t3, 0, &d[3]);
+
+#endif
+}
+
+/*
+ * Finalize reduction.
+ *
+ * Reduces, in place, a partially reduced field element a[] (four
+ * 64-bit limbs, value at most 2^255+37) to its unique representative
+ * in the 0..p-1 range, with p = 2^255-19. The choice between the
+ * original value and the value minus p is made with an all-zeros /
+ * all-ones mask (m) rather than with a branch.
+ *
+ * Two equivalent code paths are provided: one using the unsigned
+ * __int128 type (BR_INT128) and one using the _addcarry_u64()
+ * intrinsic (BR_UMUL128).
+ */
+static inline void
+f255_final_reduce(uint64_t *a)
+{
+#if BR_INT128
+
+	uint64_t t0, t1, t2, t3, m;
+	unsigned __int128 z;
+
+	/*
+	 * We add 19. If the result (in t) is below 2^255, then a[]
+	 * is already less than 2^255-19, thus already reduced.
+	 * Otherwise, we subtract 2^255 from t[], in which case we
+	 * have t = a - (2^255-19), and that's our result.
+	 */
+	z = (unsigned __int128)a[0] + 19;
+	t0 = (uint64_t)z;
+	z = (unsigned __int128)a[1] + (z >> 64);
+	t1 = (uint64_t)z;
+	z = (unsigned __int128)a[2] + (z >> 64);
+	t2 = (uint64_t)z;
+	t3 = a[3] + (uint64_t)(z >> 64);
+
+	m = -(t3 >> 63);	/* all-ones if a + 19 >= 2^255, else zero */
+	t3 &= MASK63;
+	a[0] ^= m & (a[0] ^ t0);
+	a[1] ^= m & (a[1] ^ t1);
+	a[2] ^= m & (a[2] ^ t2);
+	a[3] ^= m & (a[3] ^ t3);
+
+#elif BR_UMUL128
+
+	uint64_t t0, t1, t2, t3, m;
+	unsigned char k;
+
+	/*
+	 * We add 19. If the result (in t) is below 2^255, then a[]
+	 * is already less than 2^255-19, thus already reduced.
+	 * Otherwise, we subtract 2^255 from t[], in which case we
+	 * have t = a - (2^255-19), and that's our result.
+	 */
+	k = _addcarry_u64(0, a[0], 19, &t0);
+	k = _addcarry_u64(k, a[1], 0, &t1);
+	k = _addcarry_u64(k, a[2], 0, &t2);
+	(void)_addcarry_u64(k, a[3], 0, &t3);
+
+	m = -(t3 >> 63);	/* all-ones if a + 19 >= 2^255, else zero */
+	t3 &= MASK63;
+	a[0] ^= m & (a[0] ^ t0);
+	a[1] ^= m & (a[1] ^ t1);
+	a[2] ^= m & (a[2] ^ t2);
+	a[3] ^= m & (a[3] ^ t3);
+
+#endif
+}
+
+/*
+ * Multiply the point encoded in G[] by the scalar kb[], in place.
+ *
+ *   G      point, exactly 32 bytes, little-endian; overwritten with the
+ *          encoded result on success
+ *   Glen   length of G in bytes; must be 32
+ *   kb     multiplier, big-endian, at most 32 bytes (shorter values are
+ *          left-padded with zeros)
+ *   kblen  length of kb in bytes
+ *   curve  ignored
+ *
+ * Returned value is 0 if Glen is not 32 or kblen exceeds 32 (G is then
+ * left untouched), 1 otherwise.
+ *
+ * The computation is a Montgomery ladder over 255 scalar bits, working
+ * on projective coordinates (x2:z2) and (x3:z3), followed by a modular
+ * inversion of z2 and a final reduction. Conditional swaps are done
+ * with f255_cswap() rather than with branches on scalar bits.
+ */
+static uint32_t
+api_mul(unsigned char *G, size_t Glen,
+	const unsigned char *kb, size_t kblen, int curve)
+{
+	unsigned char k[32];
+	uint64_t x1[4], x2[4], z2[4], x3[4], z3[4];
+	uint32_t swap;
+	int i;
+
+	(void)curve;
+
+	/*
+	 * Points are encoded over exactly 32 bytes. Multipliers must fit
+	 * in 32 bytes as well.
+	 */
+	if (Glen != 32 || kblen > 32) {
+		return 0;
+	}
+
+	/*
+	 * RFC 7748 mandates that the high bit of the last point byte must
+	 * be ignored/cleared.
+	 */
+	x1[0] = br_dec64le(&G[ 0]);
+	x1[1] = br_dec64le(&G[ 8]);
+	x1[2] = br_dec64le(&G[16]);
+	x1[3] = br_dec64le(&G[24]) & MASK63;
+
+	/*
+	 * We can use memset() to clear values, because exact-width types
+	 * like uint64_t are guaranteed to have no padding bits or
+	 * trap representations.
+	 */
+	/* Initial state: (x2:z2) = (1:0) and (x3:z3) = (x1:1). */
+	memset(x2, 0, sizeof x2);
+	x2[0] = 1;
+	memset(z2, 0, sizeof z2);
+	memcpy(x3, x1, sizeof x1);
+	memcpy(z3, x2, sizeof x2);
+
+	/*
+	 * The multiplier is provided in big-endian notation, and
+	 * possibly shorter than 32 bytes.
+	 */
+	memset(k, 0, (sizeof k) - kblen);
+	memcpy(k + (sizeof k) - kblen, kb, kblen);
+	k[31] &= 0xF8;	/* clear the three least significant bits */
+	k[0] &= 0x7F;	/* clear bit 255 */
+	k[0] |= 0x40;	/* set bit 254 */
+
+	swap = 0;
+
+	for (i = 254; i >= 0; i --) {
+		uint64_t a[4], aa[4], b[4], bb[4], e[4];
+		uint64_t c[4], d[4], da[4], cb[4];
+		uint32_t kt;
+
+		/*
+		 * kt is bit i of the scalar. The swap actually applied is
+		 * the XOR of this bit with the previous one, so that the
+		 * swap of the previous iteration need not be undone
+		 * separately.
+		 */
+		kt = (k[31 - (i >> 3)] >> (i & 7)) & 1;
+		swap ^= kt;
+		f255_cswap(x2, x3, swap);
+		f255_cswap(z2, z3, swap);
+		swap = kt;
+
+		/* A = x_2 + z_2 */
+		f255_add(a, x2, z2);
+
+		/* AA = A^2 */
+		f255_mul(aa, a, a);
+
+		/* B = x_2 - z_2 */
+		f255_sub(b, x2, z2);
+
+		/* BB = B^2 */
+		f255_mul(bb, b, b);
+
+		/* E = AA - BB */
+		f255_sub(e, aa, bb);
+
+		/* C = x_3 + z_3 */
+		f255_add(c, x3, z3);
+
+		/* D = x_3 - z_3 */
+		f255_sub(d, x3, z3);
+
+		/* DA = D * A */
+		f255_mul(da, d, a);
+
+		/* CB = C * B */
+		f255_mul(cb, c, b);
+
+		/* x_3 = (DA + CB)^2 */
+		f255_add(x3, da, cb);
+		f255_mul(x3, x3, x3);
+
+		/* z_3 = x_1 * (DA - CB)^2 */
+		f255_sub(z3, da, cb);
+		f255_mul(z3, z3, z3);
+		f255_mul(z3, x1, z3);
+
+		/* x_2 = AA * BB */
+		f255_mul(x2, aa, bb);
+
+		/* z_2 = E * (AA + a24 * E) */
+		f255_mul_a24(z2, e);
+		f255_add(z2, aa, z2);
+		f255_mul(z2, e, z2);
+	}
+
+	/* Apply the swap left pending by the last iteration. */
+	f255_cswap(x2, x3, swap);
+	f255_cswap(z2, z3, swap);
+
+	/*
+	 * Compute 1/z2 = z2^(p-2). Since p = 2^255-19, we can mutualize
+	 * most non-squarings. We use x1 and x3, now useless, as temporaries.
+	 */
+	/* After this loop: x1 = z2^(2^16-1). */
+	memcpy(x1, z2, sizeof z2);
+	for (i = 0; i < 15; i ++) {
+		f255_mul(x1, x1, x1);
+		f255_mul(x1, x1, z2);
+	}
+	/* After this loop: x3 = z2^(2^240-1). */
+	memcpy(x3, x1, sizeof x1);
+	for (i = 0; i < 14; i ++) {
+		int j;
+
+		for (j = 0; j < 16; j ++) {
+			f255_mul(x3, x3, x3);
+		}
+		f255_mul(x3, x3, x1);
+	}
+	/*
+	 * The low 15 bits of 0xFFEB (0x7FEB) are appended to the exponent,
+	 * which becomes (2^240-1)*2^15 + 0x7FEB = 2^255-21 = p-2.
+	 */
+	for (i = 14; i >= 0; i --) {
+		f255_mul(x3, x3, x3);
+		if ((0xFFEB >> i) & 1) {
+			f255_mul(x3, z2, x3);
+		}
+	}
+
+	/*
+	 * Compute x2/z2. We have 1/z2 in x3.
+	 */
+	f255_mul(x2, x2, x3);
+	f255_final_reduce(x2);
+
+	/*
+	 * Encode the final x2 value in little-endian.
+	 */
+	br_enc64le(G,      x2[0]);
+	br_enc64le(G +  8, x2[1]);
+	br_enc64le(G + 16, x2[2]);
+	br_enc64le(G + 24, x2[3]);
+	return 1;
+}
+
+/*
+ * Multiply the curve generator by the scalar x[] (xlen bytes,
+ * big-endian as expected by api_mul()) and write the 32-byte encoded
+ * result into R[].
+ *
+ * The generator bytes are copied into R, and api_mul() is then applied
+ * to R in place. The returned value is the generator length (32).
+ *
+ * NOTE(review): the status returned by api_mul() is discarded. If xlen
+ * exceeds 32, api_mul() returns 0 without touching R, so R still holds
+ * the unmodified generator and 32 is returned anyway; confirm that
+ * callers validate xlen beforehand.
+ */
+static size_t
+api_mulgen(unsigned char *R,
+	const unsigned char *x, size_t xlen, int curve)
+{
+	const unsigned char *G;
+	size_t Glen;
+
+	G = api_generator(curve, &Glen);
+	memcpy(R, G, Glen);
+	api_mul(R, Glen, x, xlen, curve);
+	return Glen;
+}
+
+/*
+ * Placeholder for the "muladd" entry of the implementation table: every
+ * argument is ignored and the function always returns 0.
+ * NOTE(review): presumably 0 is the failure status expected by callers;
+ * confirm against bearssl_ec.h.
+ */
+static uint32_t
+api_muladd(unsigned char *A, const unsigned char *B, size_t len,
+	const unsigned char *x, size_t xlen,
+	const unsigned char *y, size_t ylen, int curve)
+{
+	/*
+	 * We don't implement this method, since it is used for ECDSA
+	 * only, and there is no ECDSA over Curve25519 (which instead
+	 * uses EdDSA).
+	 */
+	/* The casts to void only silence "unused parameter" warnings. */
+	(void)A;
+	(void)B;
+	(void)len;
+	(void)x;
+	(void)xlen;
+	(void)y;
+	(void)ylen;
+	(void)curve;
+	return 0;
+}
+
+/*
+ * see bearssl_ec.h
+ *
+ * Implementation table for this Curve25519 backend: a 32-bit word
+ * followed by pointers to the six api_*() functions of this file.
+ * NOTE(review): 0x20000000 has only bit 29 set; presumably this is a
+ * bit mask of supported curve identifiers (Curve25519 being 29);
+ * confirm against the br_ec_impl declaration.
+ */
+const br_ec_impl br_ec_c25519_m64 = {
+	(uint32_t)0x20000000,
+	&api_generator,
+	&api_order,
+	&api_xoff,
+	&api_mul,
+	&api_mulgen,
+	&api_muladd
+};
+
+/*
+ * see bearssl_ec.h
+ *
+ * Returns the address of the br_ec_c25519_m64 implementation table.
+ * This variant is compiled when BR_INT128 or BR_UMUL128 is set.
+ */
+const br_ec_impl *
+br_ec_c25519_m64_get(void)
+{
+	return &br_ec_c25519_m64;
+}
+
+#else
+
+/*
+ * see bearssl_ec.h
+ *
+ * Fallback variant, compiled when neither BR_INT128 nor BR_UMUL128 is
+ * set: always returns 0, i.e. a null pointer, so callers can detect
+ * that this implementation is unavailable.
+ */
+const br_ec_impl *
+br_ec_c25519_m64_get(void)
+{
+	return 0;
+}
+
+#endif
--- a/src/bearssl/src/ec/ec_p256_m15.c
+++ b/src/bearssl/src/ec/ec_p256_m15.c
@ -1739,7 +1739,7 @@ p256_decode(p256_jacobian *P, const void *src, size_t len)
 	memcpy(P->y, ty, sizeof ty);
 	memset(P->z, 0, sizeof P->z);
 	P->z[0] = 1;
-	return NEQ(bad, 0) ^ 1;
+	return EQ(bad, 0);
 }

 /*
--- a/src/bearssl/src/ec/ec_p256_m31.c
+++ b/src/bearssl/src/ec/ec_p256_m31.c
@ -1089,7 +1089,7 @@ p256_decode(p256_jacobian *P, const void *src, size_t len)
 	memcpy(P->y, ty, sizeof ty);
 	memset(P->z, 0, sizeof P->z);
 	P->z[0] = 1;
-	return NEQ(bad, 0) ^ 1;
+	return EQ(bad, 0);
 }

 /*
--- a/src/bearssl/src/ec/ec_p256_m62.c
+++ b/src/bearssl/src/ec/ec_p256_m62.c
--- a/src/bearssl/src/ec/ec_p256_m64.c
+++ b/src/bearssl/src/ec/ec_p256_m64.c
--- a/src/bearssl/src/int/i31_montmul.c
+++ b/src/bearssl/src/int/i31_montmul.c
@ -29,16 +29,45 @@ void
 br_i31_montymul(uint32_t *d, const uint32_t *x, const uint32_t *y,
 	const uint32_t *m, uint32_t m0i)
 {
+	/*
+	 * Each outer loop iteration computes:
+	 *   d <- (d + xu*y + f*m) / 2^31
+	 * We have xu <= 2^31-1 and f <= 2^31-1.
+	 * Thus, if d <= 2*m-1 on input, then:
+	 *   2*m-1 + 2*(2^31-1)*m <= (2^32)*m-1
+	 * and the new d value is less than 2*m.
+	 *
+	 * We represent d over 31-bit words, with an extra word 'dh'
+	 * which can thus be only 0 or 1.
+	 */
 	size_t len, len4, u, v;
-	uint64_t dh;
+	uint32_t dh;

 	len = (m[0] + 31) >> 5;
 	len4 = len & ~(size_t)3;
 	br_i31_zero(d, m[0]);
 	dh = 0;
 	for (u = 0; u < len; u ++) {
+		/*
+		 * The carry for each operation fits on 32 bits:
+		 *   d[v+1] <= 2^31-1
+		 *   xu*y[v+1] <= (2^31-1)*(2^31-1)
+		 *   f*m[v+1] <= (2^31-1)*(2^31-1)
+		 *   r <= 2^32-1
+		 *   (2^31-1) + 2*(2^31-1)*(2^31-1) + (2^32-1) = 2^63 - 2^31
+		 * After division by 2^31, the new r is then at most 2^32-1
+		 *
+		 * Using a 32-bit carry has performance benefits on 32-bit
+		 * systems; however, on 64-bit architectures, we prefer to
+		 * keep the carry (r) in a 64-bit register, thus avoiding some
+		 * "clear high bits" operations.
+		 */
 		uint32_t f, xu;
-		uint64_t r, zh;
+#if BR_64
+		uint64_t r;
+#else
+		uint32_t r;
+#endif

 		xu = x[u + 1];
 		f = MUL31_lo((d[1] + MUL31_lo(x[u + 1], y[1])), m0i);
@ -73,9 +102,14 @@ br_i31_montymul(uint32_t *d, const uint32_t *x, const uint32_t *y,
 			d[v] = (uint32_t)z & 0x7FFFFFFF;
 		}

-		zh = dh + r;
-		d[len] = (uint32_t)zh & 0x7FFFFFFF;
-		dh = zh >> 31;
+		/*
+		 * Since the new dh can only be 0 or 1, the addition of
+		 * the old dh with the carry MUST fit on 32 bits, and
+		 * thus can be done into dh itself.
+		 */
+		dh += r;
+		d[len] = dh & 0x7FFFFFFF;
+		dh >>= 31;
 	}

 	/*
--- a/src/bearssl/src/int/i31_mulacc.c
+++ b/src/bearssl/src/int/i31_mulacc.c
@ -45,7 +45,20 @@ br_i31_mulacc(uint32_t *d, const uint32_t *a, const uint32_t *b)
 	for (u = 0; u < blen; u ++) {
 		uint32_t f;
 		size_t v;
+
+		/*
+		 * Carry always fits on 31 bits; we want to keep it in a
+		 * 32-bit register on 32-bit architectures (on a 64-bit
+		 * architecture, cast down from 64 to 32 bits means
+		 * clearing the high bits, which is not free; on a 32-bit
+		 * architecture, the same operation really means ignoring
+		 * the top register, which has negative or zero cost).
+		 */
+#if BR_64
 		uint64_t cc;
+#else
+		uint32_t cc;
+#endif

 		f = b[1 + u];
 		cc = 0;
--- a/src/bearssl/src/int/i32_mulacc.c
+++ b/src/bearssl/src/int/i32_mulacc.c
@ -36,7 +36,11 @@ br_i32_mulacc(uint32_t *d, const uint32_t *a, const uint32_t *b)
 	for (u = 0; u < blen; u ++) {
 		uint32_t f;
 		size_t v;
+#if BR_64
 		uint64_t cc;
+#else
+		uint32_t cc;
+#endif

 		f = b[1 + u];
 		cc = 0;
--- a/src/bearssl/src/kdf/shake.c
+++ b/src/bearssl/src/kdf/shake.c
@ -0,0 +1,590 @@
+/*
+ * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/*
+ * Round constants.
+ */
+static const uint64_t RC[] = {
+	0x0000000000000001, 0x0000000000008082,
+	0x800000000000808A, 0x8000000080008000,
+	0x000000000000808B, 0x0000000080000001,
+	0x8000000080008081, 0x8000000000008009,
+	0x000000000000008A, 0x0000000000000088,
+	0x0000000080008009, 0x000000008000000A,
+	0x000000008000808B, 0x800000000000008B,
+	0x8000000000008089, 0x8000000000008003,
+	0x8000000000008002, 0x8000000000000080,
+	0x000000000000800A, 0x800000008000000A,
+	0x8000000080008081, 0x8000000000008080,
+	0x0000000080000001, 0x8000000080008008
+};
+
+/*
+ * XOR a block of data into the provided state. This supports only
+ * blocks whose length is a multiple of 64 bits.
+ */
+static void
+xor_block(uint64_t *A, const void *data, size_t rate)
+{
+	size_t u;
+
+	for (u = 0; u < rate; u += 8) {
+		A[u >> 3] ^= br_dec64le((const unsigned char *)data + u);
+	}
+}
+
+/*
+ * Apply the 24-round permutation to the state A (25 words of 64 bits),
+ * in place. Input data must have been XORed into A beforehand.
+ */
+static void
+process_block(uint64_t *A)
+{
+	uint64_t t0, t1, t2, t3, t4;
+	uint64_t tt0, tt1, tt2, tt3;
+	uint64_t t, kt;
+	uint64_t c0, c1, c2, c3, c4, bnn;
+	int j;
+
+	/*
+	 * Compute the 24 rounds. This loop is partially unrolled (each
+	 * iteration computes two rounds).
+	 */
+	for (j = 0; j < 24; j += 2) {
+
+		tt0 = A[ 1] ^ A[ 6];
+		tt1 = A[11] ^ A[16];
+		tt0 ^= A[21] ^ tt1;
+		tt0 = (tt0 << 1) | (tt0 >> 63);
+		tt2 = A[ 4] ^ A[ 9];
+		tt3 = A[14] ^ A[19];
+		tt0 ^= A[24];
+		tt2 ^= tt3;
+		t0 = tt0 ^ tt2;
+
+		tt0 = A[ 2] ^ A[ 7];
+		tt1 = A[12] ^ A[17];
+		tt0 ^= A[22] ^ tt1;
+		tt0 = (tt0 << 1) | (tt0 >> 63);
+		tt2 = A[ 0] ^ A[ 5];
+		tt3 = A[10] ^ A[15];
+		tt0 ^= A[20];
+		tt2 ^= tt3;
+		t1 = tt0 ^ tt2;
+
+		tt0 = A[ 3] ^ A[ 8];
+		tt1 = A[13] ^ A[18];
+		tt0 ^= A[23] ^ tt1;
+		tt0 = (tt0 << 1) | (tt0 >> 63);
+		tt2 = A[ 1] ^ A[ 6];
+		tt3 = A[11] ^ A[16];
+		tt0 ^= A[21];
+		tt2 ^= tt3;
+		t2 = tt0 ^ tt2;
+
+		tt0 = A[ 4] ^ A[ 9];
+		tt1 = A[14] ^ A[19];
+		tt0 ^= A[24] ^ tt1;
+		tt0 = (tt0 << 1) | (tt0 >> 63);
+		tt2 = A[ 2] ^ A[ 7];
+		tt3 = A[12] ^ A[17];
+		tt0 ^= A[22];
+		tt2 ^= tt3;
+		t3 = tt0 ^ tt2;
+
+		tt0 = A[ 0] ^ A[ 5];
+		tt1 = A[10] ^ A[15];
+		tt0 ^= A[20] ^ tt1;
+		tt0 = (tt0 << 1) | (tt0 >> 63);
+		tt2 = A[ 3] ^ A[ 8];
+		tt3 = A[13] ^ A[18];
+		tt0 ^= A[23];
+		tt2 ^= tt3;
+		t4 = tt0 ^ tt2;
+
+		A[ 0] = A[ 0] ^ t0;
+		A[ 5] = A[ 5] ^ t0;
+		A[10] = A[10] ^ t0;
+		A[15] = A[15] ^ t0;
+		A[20] = A[20] ^ t0;
+		A[ 1] = A[ 1] ^ t1;
+		A[ 6] = A[ 6] ^ t1;
+		A[11] = A[11] ^ t1;
+		A[16] = A[16] ^ t1;
+		A[21] = A[21] ^ t1;
+		A[ 2] = A[ 2] ^ t2;
+		A[ 7] = A[ 7] ^ t2;
+		A[12] = A[12] ^ t2;
+		A[17] = A[17] ^ t2;
+		A[22] = A[22] ^ t2;
+		A[ 3] = A[ 3] ^ t3;
+		A[ 8] = A[ 8] ^ t3;
+		A[13] = A[13] ^ t3;
+		A[18] = A[18] ^ t3;
+		A[23] = A[23] ^ t3;
+		A[ 4] = A[ 4] ^ t4;
+		A[ 9] = A[ 9] ^ t4;
+		A[14] = A[14] ^ t4;
+		A[19] = A[19] ^ t4;
+		A[24] = A[24] ^ t4;
+		A[ 5] = (A[ 5] << 36) | (A[ 5] >> (64 - 36));
+		A[10] = (A[10] <<  3) | (A[10] >> (64 -  3));
+		A[15] = (A[15] << 41) | (A[15] >> (64 - 41));
+		A[20] = (A[20] << 18) | (A[20] >> (64 - 18));
+		A[ 1] = (A[ 1] <<  1) | (A[ 1] >> (64 -  1));
+		A[ 6] = (A[ 6] << 44) | (A[ 6] >> (64 - 44));
+		A[11] = (A[11] << 10) | (A[11] >> (64 - 10));
+		A[16] = (A[16] << 45) | (A[16] >> (64 - 45));
+		A[21] = (A[21] <<  2) | (A[21] >> (64 - 2));
+		A[ 2] = (A[ 2] << 62) | (A[ 2] >> (64 - 62));
+		A[ 7] = (A[ 7] <<  6) | (A[ 7] >> (64 -  6));
+		A[12] = (A[12] << 43) | (A[12] >> (64 - 43));
+		A[17] = (A[17] << 15) | (A[17] >> (64 - 15));
+		A[22] = (A[22] << 61) | (A[22] >> (64 - 61));
+		A[ 3] = (A[ 3] << 28) | (A[ 3] >> (64 - 28));
+		A[ 8] = (A[ 8] << 55) | (A[ 8] >> (64 - 55));
+		A[13] = (A[13] << 25) | (A[13] >> (64 - 25));
+		A[18] = (A[18] << 21) | (A[18] >> (64 - 21));
+		A[23] = (A[23] << 56) | (A[23] >> (64 - 56));
+		A[ 4] = (A[ 4] << 27) | (A[ 4] >> (64 - 27));
+		A[ 9] = (A[ 9] << 20) | (A[ 9] >> (64 - 20));
+		A[14] = (A[14] << 39) | (A[14] >> (64 - 39));
+		A[19] = (A[19] <<  8) | (A[19] >> (64 -  8));
+		A[24] = (A[24] << 14) | (A[24] >> (64 - 14));
+		bnn = ~A[12];
+		kt = A[ 6] | A[12];
+		c0 = A[ 0] ^ kt;
+		kt = bnn | A[18];
+		c1 = A[ 6] ^ kt;
+		kt = A[18] & A[24];
+		c2 = A[12] ^ kt;
+		kt = A[24] | A[ 0];
+		c3 = A[18] ^ kt;
+		kt = A[ 0] & A[ 6];
+		c4 = A[24] ^ kt;
+		A[ 0] = c0;
+		A[ 6] = c1;
+		A[12] = c2;
+		A[18] = c3;
+		A[24] = c4;
+		bnn = ~A[22];
+		kt = A[ 9] | A[10];
+		c0 = A[ 3] ^ kt;
+		kt = A[10] & A[16];
+		c1 = A[ 9] ^ kt;
+		kt = A[16] | bnn;
+		c2 = A[10] ^ kt;
+		kt = A[22] | A[ 3];
+		c3 = A[16] ^ kt;
+		kt = A[ 3] & A[ 9];
+		c4 = A[22] ^ kt;
+		A[ 3] = c0;
+		A[ 9] = c1;
+		A[10] = c2;
+		A[16] = c3;
+		A[22] = c4;
+		bnn = ~A[19];
+		kt = A[ 7] | A[13];
+		c0 = A[ 1] ^ kt;
+		kt = A[13] & A[19];
+		c1 = A[ 7] ^ kt;
+		kt = bnn & A[20];
+		c2 = A[13] ^ kt;
+		kt = A[20] | A[ 1];
+		c3 = bnn ^ kt;
+		kt = A[ 1] & A[ 7];
+		c4 = A[20] ^ kt;
+		A[ 1] = c0;
+		A[ 7] = c1;
+		A[13] = c2;
+		A[19] = c3;
+		A[20] = c4;
+		bnn = ~A[17];
+		kt = A[ 5] & A[11];
+		c0 = A[ 4] ^ kt;
+		kt = A[11] | A[17];
+		c1 = A[ 5] ^ kt;
+		kt = bnn | A[23];
+		c2 = A[11] ^ kt;
+		kt = A[23] & A[ 4];
+		c3 = bnn ^ kt;
+		kt = A[ 4] | A[ 5];
+		c4 = A[23] ^ kt;
+		A[ 4] = c0;
+		A[ 5] = c1;
+		A[11] = c2;
+		A[17] = c3;
+		A[23] = c4;
+		bnn = ~A[ 8];
+		kt = bnn & A[14];
+		c0 = A[ 2] ^ kt;
+		kt = A[14] | A[15];
+		c1 = bnn ^ kt;
+		kt = A[15] & A[21];
+		c2 = A[14] ^ kt;
+		kt = A[21] | A[ 2];
+		c3 = A[15] ^ kt;
+		kt = A[ 2] & A[ 8];
+		c4 = A[21] ^ kt;
+		A[ 2] = c0;
+		A[ 8] = c1;
+		A[14] = c2;
+		A[15] = c3;
+		A[21] = c4;
+		A[ 0] = A[ 0] ^ RC[j + 0];
+
+		tt0 = A[ 6] ^ A[ 9];
+		tt1 = A[ 7] ^ A[ 5];
+		tt0 ^= A[ 8] ^ tt1;
+		tt0 = (tt0 << 1) | (tt0 >> 63);
+		tt2 = A[24] ^ A[22];
+		tt3 = A[20] ^ A[23];
+		tt0 ^= A[21];
+		tt2 ^= tt3;
+		t0 = tt0 ^ tt2;
+
+		tt0 = A[12] ^ A[10];
+		tt1 = A[13] ^ A[11];
+		tt0 ^= A[14] ^ tt1;
+		tt0 = (tt0 << 1) | (tt0 >> 63);
+		tt2 = A[ 0] ^ A[ 3];
+		tt3 = A[ 1] ^ A[ 4];
+		tt0 ^= A[ 2];
+		tt2 ^= tt3;
+		t1 = tt0 ^ tt2;
+
+		tt0 = A[18] ^ A[16];
+		tt1 = A[19] ^ A[17];
+		tt0 ^= A[15] ^ tt1;
+		tt0 = (tt0 << 1) | (tt0 >> 63);
+		tt2 = A[ 6] ^ A[ 9];
+		tt3 = A[ 7] ^ A[ 5];
+		tt0 ^= A[ 8];
+		tt2 ^= tt3;
+		t2 = tt0 ^ tt2;
+
+		tt0 = A[24] ^ A[22];
+		tt1 = A[20] ^ A[23];
+		tt0 ^= A[21] ^ tt1;
+		tt0 = (tt0 << 1) | (tt0 >> 63);
+		tt2 = A[12] ^ A[10];
+		tt3 = A[13] ^ A[11];
+		tt0 ^= A[14];
+		tt2 ^= tt3;
+		t3 = tt0 ^ tt2;
+
+		tt0 = A[ 0] ^ A[ 3];
+		tt1 = A[ 1] ^ A[ 4];
+		tt0 ^= A[ 2] ^ tt1;
+		tt0 = (tt0 << 1) | (tt0 >> 63);
+		tt2 = A[18] ^ A[16];
+		tt3 = A[19] ^ A[17];
+		tt0 ^= A[15];
+		tt2 ^= tt3;
+		t4 = tt0 ^ tt2;
+
+		A[ 0] = A[ 0] ^ t0;
+		A[ 3] = A[ 3] ^ t0;
+		A[ 1] = A[ 1] ^ t0;
+		A[ 4] = A[ 4] ^ t0;
+		A[ 2] = A[ 2] ^ t0;
+		A[ 6] = A[ 6] ^ t1;
+		A[ 9] = A[ 9] ^ t1;
+		A[ 7] = A[ 7] ^ t1;
+		A[ 5] = A[ 5] ^ t1;
+		A[ 8] = A[ 8] ^ t1;
+		A[12] = A[12] ^ t2;
+		A[10] = A[10] ^ t2;
+		A[13] = A[13] ^ t2;
+		A[11] = A[11] ^ t2;
+		A[14] = A[14] ^ t2;
+		A[18] = A[18] ^ t3;
+		A[16] = A[16] ^ t3;
+		A[19] = A[19] ^ t3;
+		A[17] = A[17] ^ t3;
+		A[15] = A[15] ^ t3;
+		A[24] = A[24] ^ t4;
+		A[22] = A[22] ^ t4;
+		A[20] = A[20] ^ t4;
+		A[23] = A[23] ^ t4;
+		A[21] = A[21] ^ t4;
+		A[ 3] = (A[ 3] << 36) | (A[ 3] >> (64 - 36));
+		A[ 1] = (A[ 1] <<  3) | (A[ 1] >> (64 -  3));
+		A[ 4] = (A[ 4] << 41) | (A[ 4] >> (64 - 41));
+		A[ 2] = (A[ 2] << 18) | (A[ 2] >> (64 - 18));
+		A[ 6] = (A[ 6] <<  1) | (A[ 6] >> (64 -  1));
+		A[ 9] = (A[ 9] << 44) | (A[ 9] >> (64 - 44));
+		A[ 7] = (A[ 7] << 10) | (A[ 7] >> (64 - 10));
+		A[ 5] = (A[ 5] << 45) | (A[ 5] >> (64 - 45));
+		A[ 8] = (A[ 8] <<  2) | (A[ 8] >> (64 - 2));
+		A[12] = (A[12] << 62) | (A[12] >> (64 - 62));
+		A[10] = (A[10] <<  6) | (A[10] >> (64 -  6));
+		A[13] = (A[13] << 43) | (A[13] >> (64 - 43));
+		A[11] = (A[11] << 15) | (A[11] >> (64 - 15));
+		A[14] = (A[14] << 61) | (A[14] >> (64 - 61));
+		A[18] = (A[18] << 28) | (A[18] >> (64 - 28));
+		A[16] = (A[16] << 55) | (A[16] >> (64 - 55));
+		A[19] = (A[19] << 25) | (A[19] >> (64 - 25));
+		A[17] = (A[17] << 21) | (A[17] >> (64 - 21));
+		A[15] = (A[15] << 56) | (A[15] >> (64 - 56));
+		A[24] = (A[24] << 27) | (A[24] >> (64 - 27));
+		A[22] = (A[22] << 20) | (A[22] >> (64 - 20));
+		A[20] = (A[20] << 39) | (A[20] >> (64 - 39));
+		A[23] = (A[23] <<  8) | (A[23] >> (64 -  8));
+		A[21] = (A[21] << 14) | (A[21] >> (64 - 14));
+		bnn = ~A[13];
+		kt = A[ 9] | A[13];
+		c0 = A[ 0] ^ kt;
+		kt = bnn | A[17];
+		c1 = A[ 9] ^ kt;
+		kt = A[17] & A[21];
+		c2 = A[13] ^ kt;
+		kt = A[21] | A[ 0];
+		c3 = A[17] ^ kt;
+		kt = A[ 0] & A[ 9];
+		c4 = A[21] ^ kt;
+		A[ 0] = c0;
+		A[ 9] = c1;
+		A[13] = c2;
+		A[17] = c3;
+		A[21] = c4;
+		bnn = ~A[14];
+		kt = A[22] | A[ 1];
+		c0 = A[18] ^ kt;
+		kt = A[ 1] & A[ 5];
+		c1 = A[22] ^ kt;
+		kt = A[ 5] | bnn;
+		c2 = A[ 1] ^ kt;
+		kt = A[14] | A[18];
+		c3 = A[ 5] ^ kt;
+		kt = A[18] & A[22];
+		c4 = A[14] ^ kt;
+		A[18] = c0;
+		A[22] = c1;
+		A[ 1] = c2;
+		A[ 5] = c3;
+		A[14] = c4;
+		bnn = ~A[23];
+		kt = A[10] | A[19];
+		c0 = A[ 6] ^ kt;
+		kt = A[19] & A[23];
+		c1 = A[10] ^ kt;
+		kt = bnn & A[ 2];
+		c2 = A[19] ^ kt;
+		kt = A[ 2] | A[ 6];
+		c3 = bnn ^ kt;
+		kt = A[ 6] & A[10];
+		c4 = A[ 2] ^ kt;
+		A[ 6] = c0;
+		A[10] = c1;
+		A[19] = c2;
+		A[23] = c3;
+		A[ 2] = c4;
+		bnn = ~A[11];
+		kt = A[ 3] & A[ 7];
+		c0 = A[24] ^ kt;
+		kt = A[ 7] | A[11];
+		c1 = A[ 3] ^ kt;
+		kt = bnn | A[15];
+		c2 = A[ 7] ^ kt;
+		kt = A[15] & A[24];
+		c3 = bnn ^ kt;
+		kt = A[24] | A[ 3];
+		c4 = A[15] ^ kt;
+		A[24] = c0;
+		A[ 3] = c1;
+		A[ 7] = c2;
+		A[11] = c3;
+		A[15] = c4;
+		bnn = ~A[16];
+		kt = bnn & A[20];
+		c0 = A[12] ^ kt;
+		kt = A[20] | A[ 4];
+		c1 = bnn ^ kt;
+		kt = A[ 4] & A[ 8];
+		c2 = A[20] ^ kt;
+		kt = A[ 8] | A[12];
+		c3 = A[ 4] ^ kt;
+		kt = A[12] & A[16];
+		c4 = A[ 8] ^ kt;
+		A[12] = c0;
+		A[16] = c1;
+		A[20] = c2;
+		A[ 4] = c3;
+		A[ 8] = c4;
+		A[ 0] = A[ 0] ^ RC[j + 1];
+		t = A[ 5];
+		A[ 5] = A[18];
+		A[18] = A[11];
+		A[11] = A[10];
+		A[10] = A[ 6];
+		A[ 6] = A[22];
+		A[22] = A[20];
+		A[20] = A[12];
+		A[12] = A[19];
+		A[19] = A[15];
+		A[15] = A[24];
+		A[24] = A[ 8];
+		A[ 8] = t;
+		t = A[ 1];
+		A[ 1] = A[ 9];
+		A[ 9] = A[14];
+		A[14] = A[ 2];
+		A[ 2] = A[13];
+		A[13] = A[23];
+		A[23] = A[ 4];
+		A[ 4] = A[21];
+		A[21] = A[16];
+		A[16] = A[ 3];
+		A[ 3] = A[17];
+		A[17] = A[ 7];
+		A[ 7] = t;
+	}
+}
+
+/* see bearssl_kdf.h */
+void
+br_shake_init(br_shake_context *sc, int security_level)
+{
+	sc->rate = 200 - (size_t)(security_level >> 2);
+	sc->dptr = 0;
+	memset(sc->A, 0, sizeof sc->A);
+	sc->A[ 1] = ~(uint64_t)0;
+	sc->A[ 2] = ~(uint64_t)0;
+	sc->A[ 8] = ~(uint64_t)0;
+	sc->A[12] = ~(uint64_t)0;
+	sc->A[17] = ~(uint64_t)0;
+	sc->A[20] = ~(uint64_t)0;
+}
+
+/* see bearssl_kdf.h */
+void
+br_shake_inject(br_shake_context *sc, const void *data, size_t len)
+{
+	const unsigned char *buf;
+	size_t rate, dptr;
+
+	buf = data;
+	rate = sc->rate;
+	dptr = sc->dptr;
+	while (len > 0) {
+		size_t clen;
+
+		clen = rate - dptr;
+		if (clen > len) {
+			clen = len;
+		}
+		memcpy(sc->dbuf + dptr, buf, clen);
+		dptr += clen;
+		buf += clen;
+		len -= clen;
+		if (dptr == rate) {
+			xor_block(sc->A, sc->dbuf, rate);
+			process_block(sc->A);
+			dptr = 0;
+		}
+	}
+	sc->dptr = dptr;
+}
+
+/* see bearssl_kdf.h */
+void
+br_shake_flip(br_shake_context *sc)
+{
+	/*
+	 * We apply padding and pre-XOR the value into the state. We
+	 * set dptr to the end of the buffer, so that the first call to
+	 * br_shake_produce() will process the block.
+	 */
+	if ((sc->dptr + 1) == sc->rate) {
+		sc->dbuf[sc->dptr ++] = 0x9F;
+	} else {
+		sc->dbuf[sc->dptr ++] = 0x1F;
+		memset(sc->dbuf + sc->dptr, 0x00, sc->rate - sc->dptr - 1);
+		sc->dbuf[sc->rate - 1] = 0x80;
+		sc->dptr = sc->rate;
+	}
+	xor_block(sc->A, sc->dbuf, sc->rate);
+}
+
+/* see bearssl_kdf.h */
+void
+br_shake_produce(br_shake_context *sc, void *out, size_t len)
+{
+	unsigned char *buf;
+	size_t dptr, rate;
+
+	buf = out;
+	dptr = sc->dptr;
+	rate = sc->rate;
+	while (len > 0) {
+		size_t clen;
+
+		if (dptr == rate) {
+			unsigned char *dbuf;
+			uint64_t *A;
+
+			A = sc->A;
+			dbuf = sc->dbuf;
+			process_block(A);
+			br_enc64le(dbuf +   0,  A[ 0]);
+			br_enc64le(dbuf +   8, ~A[ 1]);
+			br_enc64le(dbuf +  16, ~A[ 2]);
+			br_enc64le(dbuf +  24,  A[ 3]);
+			br_enc64le(dbuf +  32,  A[ 4]);
+			br_enc64le(dbuf +  40,  A[ 5]);
+			br_enc64le(dbuf +  48,  A[ 6]);
+			br_enc64le(dbuf +  56,  A[ 7]);
+			br_enc64le(dbuf +  64, ~A[ 8]);
+			br_enc64le(dbuf +  72,  A[ 9]);
+			br_enc64le(dbuf +  80,  A[10]);
+			br_enc64le(dbuf +  88,  A[11]);
+			br_enc64le(dbuf +  96, ~A[12]);
+			br_enc64le(dbuf + 104,  A[13]);
+			br_enc64le(dbuf + 112,  A[14]);
+			br_enc64le(dbuf + 120,  A[15]);
+			br_enc64le(dbuf + 128,  A[16]);
+			br_enc64le(dbuf + 136, ~A[17]);
+			br_enc64le(dbuf + 144,  A[18]);
+			br_enc64le(dbuf + 152,  A[19]);
+			br_enc64le(dbuf + 160, ~A[20]);
+			br_enc64le(dbuf + 168,  A[21]);
+			br_enc64le(dbuf + 176,  A[22]);
+			br_enc64le(dbuf + 184,  A[23]);
+			br_enc64le(dbuf + 192,  A[24]);
+			dptr = 0;
+		}
+		clen = rate - dptr;
+		if (clen > len) {
+			clen = len;
+		}
+		memcpy(buf, sc->dbuf + dptr, clen);
+		dptr += clen;
+		buf += clen;
+		len -= clen;
+	}
+	sc->dptr = dptr;
+}
--- a/src/bearssl/src/rand/sysrng.c
+++ b/src/bearssl/src/rand/sysrng.c
@ -25,6 +25,10 @@
 #define BR_ENABLE_INTRINSICS   1
 #include "inner.h"

+#if BR_USE_GETENTROPY
+#include <unistd.h>
+#endif
+
 #if BR_USE_URANDOM
 #include <sys/types.h>
 #include <unistd.h>
@ -38,6 +42,9 @@
 #pragma comment(lib, "advapi32")
 #endif

+/*
+ * Seeder that uses the RDRAND opcodes (on x86 CPU).
+ */
 #if BR_RDRAND
 BR_TARGETS_X86_UP
 BR_TARGET("rdrnd")
@ -57,9 +64,24 @@ seeder_rdrand(const br_prng_class **ctx)
 		 *
 		 * Intel recommends trying at least 10 times in case of
 		 * failure.
+		 *
+		 * AMD bug: there are reports that some AMD processors
+		 * have a bug that makes them fail silently after a
+		 * suspend/resume cycle, in which case RDRAND will report
+		 * a success but always return 0xFFFFFFFF.
+		 * see: https://bugzilla.kernel.org/show_bug.cgi?id=85911
+		 *
+		 * As a mitigation, if the 32-bit value is 0 or -1, then
+		 * it is considered a failure and tried again. This should
+		 * reliably detect the buggy case, at least. This also
+		 * implies that the selected seed values can never be
+		 * 0x00000000 or 0xFFFFFFFF, which is not a problem since
+		 * we are generating a seed for a PRNG, and we overdo it
+		 * a bit (we generate 32 bytes of randomness, and 256 bits
+		 * of entropy are really overkill).
 		 */
 		for (j = 0; j < 10; j ++) {
-			if (_rdrand32_step(&x)) {
+			if (_rdrand32_step(&x) && x != 0 && x != (uint32_t)-1) {
 				goto next_word;
 			}
 		}
@ -80,9 +102,11 @@ rdrand_supported(void)
 	 */
 	return br_cpuid(0, 0, 0x40000000, 0);
 }
-
 #endif

+/*
+ * Seeder that uses /dev/urandom (on Unix-like systems).
+ */
 #if BR_USE_URANDOM
 static int
 seeder_urandom(const br_prng_class **ctx)
@ -116,6 +140,32 @@ seeder_urandom(const br_prng_class **ctx)
 }
 #endif

+/*
+ * Seeder that uses getentropy() (backed by getrandom() on some systems,
+ * e.g. Linux). On failure, it will use the /dev/urandom seeder (if
+ * enabled).
+ */
+#if BR_USE_GETENTROPY
+static int
+seeder_getentropy(const br_prng_class **ctx)
+{
+	unsigned char tmp[32];
+
+	if (getentropy(tmp, sizeof tmp) == 0) {
+		(*ctx)->update(ctx, tmp, sizeof tmp);
+		return 1;
+	}
+#if BR_USE_URANDOM
+	return seeder_urandom(ctx);
+#else
+	return 0;
+#endif
+}
+#endif
+
+/*
+ * Seeder that uses CryptGenRandom() (on Windows).
+ */
 #if BR_USE_WIN32_RAND
 static int
 seeder_win32(const br_prng_class **ctx)
@ -139,6 +189,29 @@ seeder_win32(const br_prng_class **ctx)
 }
 #endif

+/*
+ * An aggregate seeder that uses RDRAND, and falls back to an OS-provided
+ * source if RDRAND fails.
+ */
+#if BR_RDRAND && (BR_USE_GETENTROPY || BR_USE_URANDOM || BR_USE_WIN32_RAND)
+static int
+seeder_rdrand_with_fallback(const br_prng_class **ctx)
+{
+	if (!seeder_rdrand(ctx)) {
+#if BR_USE_GETENTROPY
+		return seeder_getentropy(ctx);
+#elif BR_USE_URANDOM
+		return seeder_urandom(ctx);
+#elif BR_USE_WIN32_RAND
+		return seeder_win32(ctx);
+#else
+#error "macro selection has gone wrong"
+#endif
+	}
+	return 1;
+}
+#endif
+
 /* see bearssl_rand.h */
 br_prng_seeder
 br_prng_seeder_system(const char **name)
@ -148,10 +221,19 @@ br_prng_seeder_system(const char **name)
 		if (name != NULL) {
 			*name = "rdrand";
 		}
+#if BR_USE_GETENTROPY || BR_USE_URANDOM || BR_USE_WIN32_RAND
+		return &seeder_rdrand_with_fallback;
+#else
 		return &seeder_rdrand;
+#endif
 	}
 #endif
-#if BR_USE_URANDOM
+#if BR_USE_GETENTROPY
+	if (name != NULL) {
+		*name = "getentropy";
+	}
+	return &seeder_getentropy;
+#elif BR_USE_URANDOM
 	if (name != NULL) {
 		*name = "urandom";
 	}
@ -161,9 +243,10 @@ br_prng_seeder_system(const char **name)
 		*name = "win32";
 	}
 	return &seeder_win32;
-#endif
+#else
 	if (name != NULL) {
 		*name = "none";
 	}
 	return 0;
+#endif
 }
--- a/src/bearssl/src/rsa/rsa_default_pss_sign.c
+++ b/src/bearssl/src/rsa/rsa_default_pss_sign.c
@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/* see bearssl_rsa.h */
+br_rsa_pss_sign
+br_rsa_pss_sign_get_default(void)
+{
+#if BR_INT128 || BR_UMUL128
+	return &br_rsa_i62_pss_sign;
+#elif BR_LOMUL
+	return &br_rsa_i15_pss_sign;
+#else
+	return &br_rsa_i31_pss_sign;
+#endif
+}
--- a/src/bearssl/src/rsa/rsa_default_pss_vrfy.c
+++ b/src/bearssl/src/rsa/rsa_default_pss_vrfy.c
@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/* see bearssl_rsa.h */
+br_rsa_pss_vrfy
+br_rsa_pss_vrfy_get_default(void)
+{
+#if BR_INT128 || BR_UMUL128
+	return &br_rsa_i62_pss_vrfy;
+#elif BR_LOMUL
+	return &br_rsa_i15_pss_vrfy;
+#else
+	return &br_rsa_i31_pss_vrfy;
+#endif
+}
--- a/src/bearssl/src/rsa/rsa_i15_keygen.c
+++ b/src/bearssl/src/rsa/rsa_i15_keygen.c
@ -318,9 +318,9 @@ mkprime(const br_prng_class **rng, uint16_t *x, uint32_t esize,
 			continue;
 		}
 		if ((pubexp == 3 && m3 == 1)
-			|| (pubexp == 5 && m5 == 5)
-			|| (pubexp == 7 && m5 == 7)
-			|| (pubexp == 11 && m5 == 11))
+			|| (pubexp == 5 && m5 == 1)
+			|| (pubexp == 7 && m7 == 1)
+			|| (pubexp == 11 && m11 == 1))
 		{
 			continue;
 		}
--- a/src/bearssl/src/rsa/rsa_i15_modulus.c
+++ b/src/bearssl/src/rsa/rsa_i15_modulus.c
@ -28,7 +28,7 @@
 size_t
 br_rsa_i15_compute_modulus(void *n, const br_rsa_private_key *sk)
 {
-	uint16_t tmp[2 * ((BR_MAX_RSA_SIZE + 14) / 15) + 5];
+	uint16_t tmp[4 * (((BR_MAX_RSA_SIZE / 2) + 14) / 15) + 5];
 	uint16_t *t, *p, *q;
 	const unsigned char *pbuf, *qbuf;
 	size_t nlen, plen, qlen, tlen;
--- a/src/bearssl/src/rsa/rsa_i15_pss_sign.c
+++ b/src/bearssl/src/rsa/rsa_i15_pss_sign.c
@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/* see bearssl_rsa.h */
+uint32_t
+br_rsa_i15_pss_sign(const br_prng_class **rng,
+	const br_hash_class *hf_data, const br_hash_class *hf_mgf1,
+	const unsigned char *hash, size_t salt_len,
+	const br_rsa_private_key *sk, unsigned char *x)
+{
+	if (!br_rsa_pss_sig_pad(rng, hf_data, hf_mgf1, hash,
+		salt_len, sk->n_bitlen, x))
+	{
+		return 0;
+	}
+	return br_rsa_i15_private(x, sk);
+}
--- a/src/bearssl/src/rsa/rsa_i15_pss_vrfy.c
+++ b/src/bearssl/src/rsa/rsa_i15_pss_vrfy.c
@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/* see bearssl_rsa.h */
+uint32_t
+br_rsa_i15_pss_vrfy(const unsigned char *x, size_t xlen,
+	const br_hash_class *hf_data, const br_hash_class *hf_mgf1,
+	const void *hash, size_t salt_len, const br_rsa_public_key *pk)
+{
+	unsigned char sig[BR_MAX_RSA_SIZE >> 3];
+
+	if (xlen > (sizeof sig)) {
+		return 0;
+	}
+	memcpy(sig, x, xlen);
+	if (!br_rsa_i15_public(sig, xlen, pk)) {
+		return 0;
+	}
+	return br_rsa_pss_sig_unpad(hf_data, hf_mgf1,
+		hash, salt_len, pk, sig);
+}
--- a/src/bearssl/src/rsa/rsa_i31_keygen_inner.c
+++ b/src/bearssl/src/rsa/rsa_i31_keygen_inner.c
@ -340,9 +340,9 @@ mkprime(const br_prng_class **rng, uint32_t *x, uint32_t esize,
 			continue;
 		}
 		if ((pubexp == 3 && m3 == 1)
-			|| (pubexp == 5 && m5 == 5)
-			|| (pubexp == 7 && m5 == 7)
-			|| (pubexp == 11 && m5 == 11))
+			|| (pubexp == 5 && m5 == 1)
+			|| (pubexp == 7 && m7 == 1)
+			|| (pubexp == 11 && m11 == 1))
 		{
 			continue;
 		}
--- a/src/bearssl/src/rsa/rsa_i31_modulus.c
+++ b/src/bearssl/src/rsa/rsa_i31_modulus.c
@ -28,7 +28,7 @@
 size_t
 br_rsa_i31_compute_modulus(void *n, const br_rsa_private_key *sk)
 {
-	uint32_t tmp[2 * ((BR_MAX_RSA_SIZE + 30) / 31) + 5];
+	uint32_t tmp[4 * (((BR_MAX_RSA_SIZE / 2) + 30) / 31) + 5];
 	uint32_t *t, *p, *q;
 	const unsigned char *pbuf, *qbuf;
 	size_t nlen, plen, qlen, tlen;
--- a/src/bearssl/src/rsa/rsa_i31_pss_sign.c
+++ b/src/bearssl/src/rsa/rsa_i31_pss_sign.c
@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/* see bearssl_rsa.h */
+uint32_t
+br_rsa_i31_pss_sign(const br_prng_class **rng,
+	const br_hash_class *hf_data, const br_hash_class *hf_mgf1,
+	const unsigned char *hash, size_t salt_len,
+	const br_rsa_private_key *sk, unsigned char *x)
+{
+	if (!br_rsa_pss_sig_pad(rng, hf_data, hf_mgf1, hash,
+		salt_len, sk->n_bitlen, x))
+	{
+		return 0;
+	}
+	return br_rsa_i31_private(x, sk);
+}
--- a/src/bearssl/src/rsa/rsa_i31_pss_vrfy.c
+++ b/src/bearssl/src/rsa/rsa_i31_pss_vrfy.c
@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/* see bearssl_rsa.h */
+uint32_t
+br_rsa_i31_pss_vrfy(const unsigned char *x, size_t xlen,
+	const br_hash_class *hf_data, const br_hash_class *hf_mgf1,
+	const void *hash, size_t salt_len, const br_rsa_public_key *pk)
+{
+	unsigned char sig[BR_MAX_RSA_SIZE >> 3];
+
+	if (xlen > (sizeof sig)) {
+		return 0;
+	}
+	memcpy(sig, x, xlen);
+	if (!br_rsa_i31_public(sig, xlen, pk)) {
+		return 0;
+	}
+	return br_rsa_pss_sig_unpad(hf_data, hf_mgf1,
+		hash, salt_len, pk, sig);
+}
--- a/src/bearssl/src/rsa/rsa_i32_pss_sign.c
+++ b/src/bearssl/src/rsa/rsa_i32_pss_sign.c
@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/* see bearssl_rsa.h */
+uint32_t
+br_rsa_i32_pss_sign(const br_prng_class **rng,
+	const br_hash_class *hf_data, const br_hash_class *hf_mgf1,
+	const unsigned char *hash, size_t salt_len,
+	const br_rsa_private_key *sk, unsigned char *x)
+{
+	if (!br_rsa_pss_sig_pad(rng, hf_data, hf_mgf1, hash,
+		salt_len, sk->n_bitlen, x))
+	{
+		return 0;
+	}
+	return br_rsa_i32_private(x, sk);
+}
--- a/src/bearssl/src/rsa/rsa_i32_pss_vrfy.c
+++ b/src/bearssl/src/rsa/rsa_i32_pss_vrfy.c
@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/* see bearssl_rsa.h */
+uint32_t
+br_rsa_i32_pss_vrfy(const unsigned char *x, size_t xlen,
+	const br_hash_class *hf_data, const br_hash_class *hf_mgf1,
+	const void *hash, size_t salt_len, const br_rsa_public_key *pk)
+{
+	/*
+	 * The public key operation and the unpadding both work in
+	 * place, so they are run on a local copy of the signature; the
+	 * caller's buffer is only read.
+	 */
+	unsigned char work[BR_MAX_RSA_SIZE >> 3];
+	uint32_t status;
+
+	status = 0;
+	if (xlen <= (sizeof work)) {
+		memcpy(work, x, xlen);
+		if (br_rsa_i32_public(work, xlen, pk)) {
+			/*
+			 * The padding check decides the final outcome.
+			 */
+			status = br_rsa_pss_sig_unpad(hf_data, hf_mgf1,
+				hash, salt_len, pk, work);
+		}
+	}
+	return status;
+}
--- a/src/bearssl/src/rsa/rsa_i62_pss_sign.c
+++ b/src/bearssl/src/rsa/rsa_i62_pss_sign.c
@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+#if BR_INT128 || BR_UMUL128
+
+/* see bearssl_rsa.h */
+uint32_t
+br_rsa_i62_pss_sign(const br_prng_class **rng,
+	const br_hash_class *hf_data, const br_hash_class *hf_mgf1,
+	const unsigned char *hash, size_t salt_len,
+	const br_rsa_private_key *sk, unsigned char *x)
+{
+	uint32_t padded;
+
+	/*
+	 * Step 1: write the PSS-padded value into x[]. A zero status
+	 * from the padding routine is reported to the caller right
+	 * away, without running the private key operation.
+	 */
+	padded = br_rsa_pss_sig_pad(rng, hf_data, hf_mgf1,
+		hash, salt_len, sk->n_bitlen, x);
+	if (padded == 0) {
+		return 0;
+	}
+
+	/*
+	 * Step 2: apply the "i62" private key operation in place; its
+	 * status is the status of the whole signature generation.
+	 */
+	return br_rsa_i62_private(x, sk);
+}
+
+/* see bearssl_rsa.h */
+br_rsa_pss_sign
+br_rsa_i62_pss_sign_get(void)
+{
+	/*
+	 * This variant is compiled when BR_INT128 or BR_UMUL128 is
+	 * set, i.e. when br_rsa_i62_pss_sign() is defined.
+	 */
+	br_rsa_pss_sign impl;
+
+	impl = &br_rsa_i62_pss_sign;
+	return impl;
+}
+
+#else
+
+/* see bearssl_rsa.h */
+br_rsa_pss_sign
+br_rsa_i62_pss_sign_get(void)
+{
+	/*
+	 * Neither BR_INT128 nor BR_UMUL128 is set: this build has no
+	 * "i62" PSS signature generator, which is reported with a null
+	 * function pointer.
+	 */
+	br_rsa_pss_sign none;
+
+	none = 0;
+	return none;
+}
+
+#endif
--- a/src/bearssl/src/rsa/rsa_i62_pss_vrfy.c
+++ b/src/bearssl/src/rsa/rsa_i62_pss_vrfy.c
@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+#if BR_INT128 || BR_UMUL128
+
+/* see bearssl_rsa.h */
+uint32_t
+br_rsa_i62_pss_vrfy(const unsigned char *x, size_t xlen,
+	const br_hash_class *hf_data, const br_hash_class *hf_mgf1,
+	const void *hash, size_t salt_len, const br_rsa_public_key *pk)
+{
+	/*
+	 * The public key operation and the unpadding both work in
+	 * place, so they are run on a local copy of the signature; the
+	 * caller's buffer is only read.
+	 */
+	unsigned char work[BR_MAX_RSA_SIZE >> 3];
+	uint32_t status;
+
+	status = 0;
+	if (xlen <= (sizeof work)) {
+		memcpy(work, x, xlen);
+		if (br_rsa_i62_public(work, xlen, pk)) {
+			/*
+			 * The padding check decides the final outcome.
+			 */
+			status = br_rsa_pss_sig_unpad(hf_data, hf_mgf1,
+				hash, salt_len, pk, work);
+		}
+	}
+	return status;
+}
+
+/* see bearssl_rsa.h */
+br_rsa_pss_vrfy
+br_rsa_i62_pss_vrfy_get(void)
+{
+	/*
+	 * This variant is compiled when BR_INT128 or BR_UMUL128 is
+	 * set, i.e. when br_rsa_i62_pss_vrfy() is defined.
+	 */
+	br_rsa_pss_vrfy impl;
+
+	impl = &br_rsa_i62_pss_vrfy;
+	return impl;
+}
+
+#else
+
+/* see bearssl_rsa.h */
+br_rsa_pss_vrfy
+br_rsa_i62_pss_vrfy_get(void)
+{
+	/*
+	 * Neither BR_INT128 nor BR_UMUL128 is set: this build has no
+	 * "i62" PSS signature verifier, which is reported with a null
+	 * function pointer.
+	 */
+	br_rsa_pss_vrfy none;
+
+	none = 0;
+	return none;
+}
+
+#endif
--- a/src/bearssl/src/rsa/rsa_pss_sig_pad.c
+++ b/src/bearssl/src/rsa/rsa_pss_sig_pad.c
@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/*
+ * see inner.h
+ *
+ * Write into x[] the PSS-padded form of a hash value, for a modulus of
+ * n_bitlen bits:
+ *
+ *   - hf_data gives the length of hash[] (its digest size) and is the
+ *     function used to compute the seed H;
+ *   - hf_mgf1 is the hash function handed to br_mgf1_xor();
+ *   - rng produces the salt_len salt bytes; it is not accessed when
+ *     salt_len is zero.
+ *
+ * Returned value is 1 on success, or 0 if the padded string is too
+ * short for the hash and salt lengths.
+ */
+uint32_t
+br_rsa_pss_sig_pad(const br_prng_class **rng,
+	const br_hash_class *hf_data, const br_hash_class *hf_mgf1,
+	const unsigned char *hash, size_t salt_len,
+	uint32_t n_bitlen, unsigned char *x)
+{
+	size_t xlen, hash_len;
+	br_hash_compat_context hc;
+	unsigned char *salt, *seed;
+
+	hash_len = br_digest_size(hf_data);
+
+	/*
+	 * The padded string is one bit smaller than the modulus;
+	 * notably, if the modulus length is equal to 1 modulo 8, then
+	 * the padded string will be one _byte_ smaller, and the first
+	 * byte will be set to 0. We apply these transformations here.
+	 * From this point on, x and xlen designate the padded string
+	 * proper (without that extra leading byte, if any).
+	 */
+	n_bitlen --;
+	if ((n_bitlen & 7) == 0) {
+		*x ++ = 0;
+	}
+	xlen = (n_bitlen + 7) >> 3;
+
+	/*
+	 * Check that the modulus is large enough for the hash value
+	 * length combined with the intended salt length. The two
+	 * individual comparisons presumably protect against a
+	 * wrap-around of the sum for huge length values (to be
+	 * confirmed).
+	 */
+	if (hash_len > xlen || salt_len > xlen
+		|| (hash_len + salt_len + 2) > xlen)
+	{
+		return 0;
+	}
+
+	/*
+	 * Produce a random salt. It is written directly at its final
+	 * position, right before the seed; the PRNG is not called for
+	 * an empty salt.
+	 */
+	salt = x + xlen - hash_len - salt_len - 1;
+	if (salt_len != 0) {
+		(*rng)->generate(rng, salt, salt_len);
+	}
+
+	/*
+	 * Compute the seed for MGF1. The hashed input starts with
+	 * eight bytes of value zero; these are borrowed from the seed
+	 * area itself, which is overwritten by the hash output at the
+	 * end of this step.
+	 */
+	seed = x + xlen - hash_len - 1;
+	hf_data->init(&hc.vtable);
+	memset(seed, 0, 8);
+	hf_data->update(&hc.vtable, seed, 8);
+	hf_data->update(&hc.vtable, hash, hash_len);
+	hf_data->update(&hc.vtable, salt, salt_len);
+	hf_data->out(&hc.vtable, seed);
+
+	/*
+	 * Prepare string PS (padded salt). The salt is already at the
+	 * right place. What remains is the run of zeros and the 0x01
+	 * separator byte that precede the salt.
+	 */
+	memset(x, 0, xlen - salt_len - hash_len - 2);
+	x[xlen - salt_len - hash_len - 2] = 0x01;
+
+	/*
+	 * Generate the mask and XOR it into PS. The mask covers
+	 * everything before the seed.
+	 */
+	br_mgf1_xor(x, xlen - hash_len - 1, hf_mgf1, seed, hash_len);
+
+	/*
+	 * Clear the top bits to ensure the value is lower than the
+	 * modulus.
+	 */
+	x[0] &= 0xFF >> (((uint32_t)xlen << 3) - n_bitlen);
+
+	/*
+	 * The seed (H) is already in the right place. We just set the
+	 * last byte.
+	 */
+	x[xlen - 1] = 0xBC;
+
+	return 1;
+}
--- a/src/bearssl/src/rsa/rsa_pss_sig_unpad.c
+++ b/src/bearssl/src/rsa/rsa_pss_sig_unpad.c
@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/*
+ * see inner.h
+ *
+ * Check the PSS padding of the value in x[] (the callers in this
+ * library pass the signature after the public key operation) against
+ * the provided hash value:
+ *
+ *   - hf_data gives the length of hash[] (its digest size) and is the
+ *     function used to recompute H;
+ *   - hf_mgf1 is the hash function handed to br_mgf1_xor();
+ *   - salt_len is the expected salt length, in bytes;
+ *   - pk is the public key; only its modulus is used here, to obtain
+ *     the modulus bit length.
+ *
+ * The contents of x[] are modified (the mask is removed in place).
+ * The digest size of hf_data must not exceed 64 bytes (size of the
+ * local tmp[] buffer).
+ *
+ * Returned value is 0 if the modulus is zero or too short for the
+ * hash and salt lengths; otherwise it is EQ0(r), where r accumulates
+ * the differences found by all the checks below.
+ */
+uint32_t
+br_rsa_pss_sig_unpad(const br_hash_class *hf_data,
+	const br_hash_class *hf_mgf1,
+	const unsigned char *hash, size_t salt_len,
+	const br_rsa_public_key *pk, unsigned char *x)
+{
+	size_t u, xlen, hash_len;
+	br_hash_compat_context hc;
+	unsigned char *seed, *salt;
+	unsigned char tmp[64];
+	uint32_t r, n_bitlen;
+
+	hash_len = br_digest_size(hf_data);
+
+	/*
+	 * Value r will be set to a non-zero value if any test fails.
+	 */
+	r = 0;
+
+	/*
+	 * The value bit length (as an integer) must be strictly less than
+	 * that of the modulus. We first skip the leading zero bytes of
+	 * the modulus to get its exact bit length; an all-zero modulus
+	 * is rejected.
+	 */
+	for (u = 0; u < pk->nlen; u ++) {
+		if (pk->n[u] != 0) {
+			break;
+		}
+	}
+	if (u == pk->nlen) {
+		return 0;
+	}
+	n_bitlen = BIT_LENGTH(pk->n[u]) + ((uint32_t)(pk->nlen - u - 1) << 3);
+	n_bitlen --;
+	if ((n_bitlen & 7) == 0) {
+		/*
+		 * The padded string is one byte shorter than the
+		 * modulus: the extra leading byte must be zero.
+		 */
+		r |= *x ++;
+	} else {
+		/*
+		 * The unused top bits of the first byte must be zero.
+		 */
+		r |= x[0] & (0xFF << (n_bitlen & 7));
+	}
+	xlen = (n_bitlen + 7) >> 3;
+
+	/*
+	 * Check that the modulus is large enough for the hash value
+	 * length combined with the intended salt length.
+	 */
+	if (hash_len > xlen || salt_len > xlen
+		|| (hash_len + salt_len + 2) > xlen)
+	{
+		return 0;
+	}
+
+	/*
+	 * Check value of rightmost byte.
+	 */
+	r |= x[xlen - 1] ^ 0xBC;
+
+	/*
+	 * Generate the mask and XOR it into the first bytes to reveal PS;
+	 * we must also mask out the leading bits.
+	 */
+	seed = x + xlen - hash_len - 1;
+	br_mgf1_xor(x, xlen - hash_len - 1, hf_mgf1, seed, hash_len);
+	if ((n_bitlen & 7) != 0) {
+		x[0] &= 0xFF >> (8 - (n_bitlen & 7));
+	}
+
+	/*
+	 * Check that all padding bytes have the expected value.
+	 */
+	for (u = 0; u < (xlen - hash_len - salt_len - 2); u ++) {
+		r |= x[u];
+	}
+	r |= x[xlen - hash_len - salt_len - 2] ^ 0x01;
+
+	/*
+	 * Recompute H.
+	 */
+	salt = x + xlen - hash_len - salt_len - 1;
+	hf_data->init(&hc.vtable);
+	memset(tmp, 0, 8);
+	hf_data->update(&hc.vtable, tmp, 8);
+	hf_data->update(&hc.vtable, hash, hash_len);
+	hf_data->update(&hc.vtable, salt, salt_len);
+	hf_data->out(&hc.vtable, tmp);
+
+	/*
+	 * Check that the recomputed H value matches the one appearing
+	 * in the string. H is located right after the masked part, i.e.
+	 * at the seed position (offset xlen - hash_len - 1); that offset
+	 * depends on the hash length, not on the salt length, so the
+	 * comparison must not be indexed with salt_len (the two offsets
+	 * coincide only when the salt and the hash have the same
+	 * length).
+	 */
+	for (u = 0; u < hash_len; u ++) {
+		r |= tmp[u] ^ seed[u];
+	}
+
+	return EQ0(r);
+}
--- a/src/bearssl/src/ssl/ssl_client_full.c
+++ b/src/bearssl/src/ssl/ssl_client_full.c
@ -119,7 +119,6 @@ br_ssl_client_init_full(br_ssl_client_context *cc,
 	 * to TLS-1.2 (inclusive).
 	 */
 	br_ssl_client_zero(cc);
-	memset(xc, 0, sizeof *xc);
 	br_ssl_engine_set_versions(&cc->eng, BR_TLS10, BR_TLS12);

 	/*
--- a/src/bearssl/src/ssl/ssl_engine.c
+++ b/src/bearssl/src/ssl/ssl_engine.c
@ -1232,6 +1232,21 @@ void
 br_ssl_engine_close(br_ssl_engine_context *cc)
 {
 	if (!br_ssl_engine_closed(cc)) {
+		/*
+		 * If we are not already closed, then we need to
+		 * initiate the closure. Once closing, any incoming
+		 * application data is discarded; we should also discard
+		 * application data which is already there but has not
+		 * been acknowledged by the application yet (this mimics
+		 * usual semantics on BSD sockets: you cannot read()
+		 * once you called close(), even if there was some
+		 * unread data already buffered).
+		 */
+		size_t len;
+
+		if (br_ssl_engine_recvapp_buf(cc, &len) != NULL && len != 0) {
+			br_ssl_engine_recvapp_ack(cc, len);
+		}
 		jump_handshake(cc, 1);
 	}
 }
--- a/src/bearssl/src/ssl/ssl_io.c
+++ b/src/bearssl/src/ssl/ssl_io.c
@ -48,8 +48,8 @@ br_sslio_init(br_sslio_context *ctx,
 * combination of both (the combination matches either). When a match is
 * achieved, this function returns 0. On error, it returns -1.
 */
-int
-br_run_until(br_sslio_context *ctx, unsigned target)
+static int
+run_until(br_sslio_context *ctx, unsigned target)
 {
 	for (;;) {
 		unsigned state;
@ -152,7 +152,7 @@ br_sslio_read(br_sslio_context *ctx, void *dst, size_t len)
 	if (len == 0) {
 		return 0;
 	}
-	if (br_run_until(ctx, BR_SSL_RECVAPP) < 0) {
+	if (run_until(ctx, BR_SSL_RECVAPP) < 0) {
 		return -1;
 	}
 	buf = br_ssl_engine_recvapp_buf(ctx->engine, &alen);
@ -194,7 +194,7 @@ br_sslio_write(br_sslio_context *ctx, const void *src, size_t len)
 	if (len == 0) {
 		return 0;
 	}
-	if (br_run_until(ctx, BR_SSL_SENDAPP) < 0) {
+	if (run_until(ctx, BR_SSL_SENDAPP) < 0) {
 		return -1;
 	}
 	buf = br_ssl_engine_sendapp_buf(ctx->engine, &alen);
@ -238,7 +238,7 @@ br_sslio_flush(br_sslio_context *ctx)
 	 * first sent down the wire before considering anything else.
 	 */
 	br_ssl_engine_flush(ctx->engine, 0);
-	return br_run_until(ctx, BR_SSL_SENDAPP | BR_SSL_RECVAPP);
+	return run_until(ctx, BR_SSL_SENDAPP | BR_SSL_RECVAPP);
 }

 /* see bearssl_ssl.h */
@ -252,7 +252,7 @@ br_sslio_close(br_sslio_context *ctx)
 		 */
 		size_t len;

-		br_run_until(ctx, BR_SSL_RECVAPP);
+		run_until(ctx, BR_SSL_RECVAPP);
 		if (br_ssl_engine_recvapp_buf(ctx->engine, &len) != NULL) {
 			br_ssl_engine_recvapp_ack(ctx->engine, len);
 		}
--- a/src/bearssl/src/x509/asn1.t0
+++ b/src/bearssl/src/x509/asn1.t0
@ -480,7 +480,7 @@ OID: id-at-commonName            2.5.4.3
 \ 66 noncharacters, and also the surrogate range; this function does NOT
 \ check that the value is in the 0..10FFFF range.
 : valid-unicode? ( val -- bool )
-	dup 0xFDD0 0xFEDF between? if drop 0 ret then
+	dup 0xFDD0 0xFDEF between? if drop 0 ret then
 	dup 0xD800 0xDFFF between? if drop 0 ret then
 	0xFFFF and 0xFFFE < ;

--- a/src/bearssl/src/x509/skey_decoder.c
+++ b/src/bearssl/src/x509/skey_decoder.c
@ -155,7 +155,7 @@ static const unsigned char t0_codeblock[] = {
 	0x02, 0x06, 0x1E, 0x00, 0x00, 0x19, 0x19, 0x00, 0x00, 0x01, 0x0B, 0x00,
 	0x00, 0x01, 0x00, 0x20, 0x14, 0x06, 0x08, 0x01, 0x01, 0x21, 0x20, 0x22,
 	0x20, 0x04, 0x75, 0x13, 0x00, 0x00, 0x01,
-	T0_INT2(3 * BR_X509_BUFSIZE_KEY), 0x00, 0x01, 0x01, 0x87, 0xFF, 0xFF,
+	T0_INT2(3 * BR_X509_BUFSIZE_SIG), 0x00, 0x01, 0x01, 0x87, 0xFF, 0xFF,
 	0x7F, 0x54, 0x57, 0x01, 0x02, 0x3E, 0x55, 0x01, 0x01, 0x0E, 0x06, 0x02,
 	0x30, 0x16, 0x57, 0x01, 0x02, 0x19, 0x0D, 0x06, 0x06, 0x13, 0x3B, 0x44,
 	0x32, 0x04, 0x1C, 0x01, 0x04, 0x19, 0x0D, 0x06, 0x08, 0x13, 0x3B, 0x01,
--- a/src/bearssl/src/x509/skey_decoder.t0
+++ b/src/bearssl/src/x509/skey_decoder.t0
@ -80,7 +80,7 @@ cc: read-blob-inner ( addr len -- addr len ) {

 \ Get the length of the key_data buffer.
 : len-key_data
-	CX 0 8191 { 3 * BR_X509_BUFSIZE_KEY } ;
+	CX 0 8191 { 3 * BR_X509_BUFSIZE_SIG } ;

 \ Get the address and length for the key_data buffer.
 : addr-len-key_data ( -- addr len )
--- a/src/bearssl/src/x509/x509_minimal.c
+++ b/src/bearssl/src/x509/x509_minimal.c
@ -703,7 +703,7 @@ static const unsigned char t0_codeblock[] = {
 	0x76, 0x00, 0x00, 0x01, 0x00, 0x30, 0x31, 0x0B, 0x42, 0x00, 0x00, 0x01,
 	0x81, 0x70, 0x00, 0x00, 0x01, 0x82, 0x0D, 0x00, 0x00, 0x01, 0x82, 0x22,
 	0x00, 0x00, 0x01, 0x82, 0x05, 0x00, 0x00, 0x01, 0x03, 0x33, 0x01, 0x03,
-	0x33, 0x00, 0x00, 0x25, 0x01, 0x83, 0xFB, 0x50, 0x01, 0x83, 0xFD, 0x5F,
+	0x33, 0x00, 0x00, 0x25, 0x01, 0x83, 0xFB, 0x50, 0x01, 0x83, 0xFB, 0x6F,
 	0x72, 0x06, 0x04, 0x24, 0x01, 0x00, 0x00, 0x25, 0x01, 0x83, 0xB0, 0x00,
 	0x01, 0x83, 0xBF, 0x7F, 0x72, 0x06, 0x04, 0x24, 0x01, 0x00, 0x00, 0x01,
 	0x83, 0xFF, 0x7F, 0x15, 0x01, 0x83, 0xFF, 0x7E, 0x0D, 0x00
--- a/src/bearssl/src/x509/x509_minimal.t0
+++ b/src/bearssl/src/x509/x509_minimal.t0
@ -106,7 +106,7 @@ preamble {
 *     -- Extensions: extension values are processed in due order.
 *
 *        -- Basic Constraints: for all certificates except EE, must be
- *        present, indicate a CA, and have a path legnth compatible with
+ *        present, indicate a CA, and have a path length compatible with
 *        the chain length so far.
 *
 *        -- Key Usage: for the EE, if present, must allow signatures
--- a/src/bearssl_ec.h
+++ b/src/bearssl_ec.h
@ -108,7 +108,7 @@ extern "C" {
 *
 *   - The multipliers (integers) MUST be lower than the subgroup order.
 *     If this property is not met, then the result is indeterminate,
- *     but an error value is not ncessearily returned.
+ *     but an error value is not necessarily returned.
 * 
 *
 * ## ECDSA
@ -451,6 +451,42 @@ extern const br_ec_impl br_ec_p256_m15;
 */
 extern const br_ec_impl br_ec_p256_m31;

+/**
+ * \brief EC implementation "m62" (specialised code) for P-256.
+ *
+ * This implementation uses custom code relying on multiplication of
+ * integers up to 64 bits, with a 128-bit result. This implementation is
+ * defined only on platforms that offer the 64x64->128 multiplication
+ * support; use `br_ec_p256_m62_get()` to dynamically obtain a pointer
+ * to that implementation.
+ */
+extern const br_ec_impl br_ec_p256_m62;
+
+/**
+ * \brief Get the "m62" implementation of P-256, if available.
+ *
+ * \return  the implementation, or 0.
+ */
+const br_ec_impl *br_ec_p256_m62_get(void);
+
+/**
+ * \brief EC implementation "m64" (specialised code) for P-256.
+ *
+ * This implementation uses custom code relying on multiplication of
+ * integers up to 64 bits, with a 128-bit result. This implementation is
+ * defined only on platforms that offer the 64x64->128 multiplication
+ * support; use `br_ec_p256_m64_get()` to dynamically obtain a pointer
+ * to that implementation.
+ */
+extern const br_ec_impl br_ec_p256_m64;
+
+/**
+ * \brief Get the "m64" implementation of P-256, if available.
+ *
+ * \return  the implementation, or 0.
+ */
+const br_ec_impl *br_ec_p256_m64_get(void);
+
 /**
 * \brief EC implementation "i15" (generic code) for Curve25519.
 *
@ -507,6 +543,54 @@ extern const br_ec_impl br_ec_c25519_m15;
 */
 extern const br_ec_impl br_ec_c25519_m31;

+/**
+ * \brief EC implementation "m62" (specialised code) for Curve25519.
+ *
+ * This implementation uses custom code relying on multiplication of
+ * integers up to 62 bits, with a 124-bit result. This implementation is
+ * defined only on platforms that offer the 64x64->128 multiplication
+ * support; use `br_ec_c25519_m62_get()` to dynamically obtain a pointer
+ * to that implementation. Due to the specificities of the curve
+ * definition, the following applies:
+ *
+ *   - `muladd()` is not implemented (the function returns 0 systematically).
+ *   - `order()` returns 2^255-1, since the point multiplication algorithm
+ *     accepts any 32-byte integer as input (it clears the top bit and low
+ *     three bits systematically).
+ */
+extern const br_ec_impl br_ec_c25519_m62;
+
+/**
+ * \brief Get the "m62" implementation of Curve25519, if available.
+ *
+ * \return  the implementation, or 0.
+ */
+const br_ec_impl *br_ec_c25519_m62_get(void);
+
+/**
+ * \brief EC implementation "m64" (specialised code) for Curve25519.
+ *
+ * This implementation uses custom code relying on multiplication of
+ * integers up to 64 bits, with a 128-bit result. This implementation is
+ * defined only on platforms that offer the 64x64->128 multiplication
+ * support; use `br_ec_c25519_m64_get()` to dynamically obtain a pointer
+ * to that implementation. Due to the specificities of the curve
+ * definition, the following applies:
+ *
+ *   - `muladd()` is not implemented (the function returns 0 systematically).
+ *   - `order()` returns 2^255-1, since the point multiplication algorithm
+ *     accepts any 32-byte integer as input (it clears the top bit and low
+ *     three bits systematically).
+ */
+extern const br_ec_impl br_ec_c25519_m64;
+
+/**
+ * \brief Get the "m64" implementation of Curve25519, if available.
+ *
+ * \return  the implementation, or 0.
+ */
+const br_ec_impl *br_ec_c25519_m64_get(void);
+
 /**
 * \brief Aggregate EC implementation "m15".
 *
--- a/src/bearssl_hash.h
+++ b/src/bearssl_hash.h
@ -724,7 +724,7 @@ void br_sha256_update(br_sha256_context *ctx, const void *data, size_t len);
 */
 void br_sha256_out(const br_sha256_context *ctx, void *out);

-#if BR_DOXYGEN_IGNORE
+#ifdef BR_DOXYGEN_IGNORE
 /**
 * \brief Save SHA-256 running state.
 *
@ -742,7 +742,7 @@ uint64_t br_sha256_state(const br_sha256_context *ctx, void *out);
 #define br_sha256_state       br_sha224_state
 #endif

-#if BR_DOXYGEN_IGNORE
+#ifdef BR_DOXYGEN_IGNORE
 /**
 * \brief Restore SHA-256 running state.
 *
--- a/src/bearssl_kdf.h
+++ b/src/bearssl_kdf.h
@ -81,6 +81,30 @@ extern "C" {
 * Note that the HKDF total output size (the number of bytes that
 * HKDF-Expand is willing to produce) is limited: if the hash output size
 * is _n_ bytes, then the maximum output size is _255*n_.
+ *
+ * ## SHAKE
+ *
+ * SHAKE is defined in
+ * [FIPS 202](https://csrc.nist.gov/publications/detail/fips/202/final)
+ * under two versions: SHAKE128 and SHAKE256, offering an alleged
+ * "security level" of 128 and 256 bits, respectively (SHAKE128 is
+ * about 20 to 25% faster than SHAKE256). SHAKE internally relies on
+ * the Keccak family of sponge functions, not on any externally provided
+ * hash function. Contrary to HKDF, SHAKE does not have a concept of
+ * either a "salt" or an "info" string. The API consists of four
+ * functions:
+ *
+ *  - `br_shake_init()`: initialize a SHAKE context for a given
+ *    security level.
+ *
+ *  - `br_shake_inject()`: inject more input bytes. This function may be
+ *    called repeatedly if the input data is provided by chunks.
+ *
+ *  - `br_shake_flip()`: end the data injection process, and start the
+ *    data production process.
+ *
+ *  - `br_shake_produce()`: get the next bytes of output. This function
+ *    may be called several times to obtain the full output by chunks.
 */

 /**
@ -178,6 +202,81 @@ void br_hkdf_flip(br_hkdf_context *hc);
 size_t br_hkdf_produce(br_hkdf_context *hc,
 	const void *info, size_t info_len, void *out, size_t out_len);

+/**
+ * \brief SHAKE context.
+ *
+ * The SHAKE context is initialized with a "security level". The internal
+ * notion is called "capacity"; the capacity is twice the security level
+ * (for instance, SHAKE128 has capacity 256).
+ *
+ * The caller is responsible for allocating the context where
+ * appropriate. Context initialisation and usage incurs no dynamic
+ * allocation, so there is no release function.
+ */
+typedef struct {
+#ifndef BR_DOXYGEN_IGNORE
+	unsigned char dbuf[200];
+	size_t dptr;
+	size_t rate;
+	uint64_t A[25];
+#endif
+} br_shake_context;
+
+/**
+ * \brief SHAKE context initialization.
+ *
+ * The context is initialized for the provided "security level".
+ * Internally, this sets the "capacity" to twice the security level;
+ * thus, for SHAKE128, the `security_level` parameter should be 128,
+ * which corresponds to a 256-bit capacity.
+ *
+ * Allowed security levels are all multiples of 32, from 32 to 768,
+ * inclusive. Larger security levels imply lower performance; levels
+ * beyond 256 bits don't make much sense. Standard levels are 128
+ * and 256 bits (for SHAKE128 and SHAKE256, respectively).
+ *
+ * \param sc               SHAKE context to initialise.
+ * \param security_level   security level (in bits).
+ */
+void br_shake_init(br_shake_context *sc, int security_level);
+
+/**
+ * \brief SHAKE input injection.
+ *
+ * This function injects some more input bytes ("key material") into
+ * SHAKE. This function may be called several times, after `br_shake_init()`
+ * but before `br_shake_flip()`.
+ *
+ * \param sc     SHAKE context.
+ * \param data   extra input bytes.
+ * \param len    number of extra input bytes.
+ */
+void br_shake_inject(br_shake_context *sc, const void *data, size_t len);
+
+/**
+ * \brief SHAKE switch to production phase.
+ *
+ * This call terminates the input injection process, and starts the
+ * output production process.
+ *
+ * \param hc   SHAKE context.
+ */
+void br_shake_flip(br_shake_context *hc);
+
+/**
+ * \brief SHAKE output production.
+ *
+ * Produce more output bytes from the current state. This function may be
+ * called several times, but only after `br_shake_flip()`.
+ *
+ * There is no practical limit to the number of bytes that may be produced.
+ *
+ * \param sc    SHAKE context.
+ * \param out   destination buffer for the SHAKE output.
+ * \param len   the length of the requested output (in bytes).
+ */
+void br_shake_produce(br_shake_context *sc, void *out, size_t len);
+
 #ifdef __cplusplus
 }
 #endif
--- a/src/bearssl_rsa.h
+++ b/src/bearssl_rsa.h
@ -28,6 +28,7 @@
 #include <stddef.h>
 #include <stdint.h>

+#include "bearssl_hash.h"
 #include "bearssl_rand.h"

 #ifdef __cplusplus
@ -279,6 +280,55 @@ typedef uint32_t (*br_rsa_pkcs1_vrfy)(const unsigned char *x, size_t xlen,
 	const unsigned char *hash_oid, size_t hash_len,
 	const br_rsa_public_key *pk, unsigned char *hash_out);

+/**
+ * \brief Type for a RSA signature verification engine (PSS).
+ *
+ * Parameters are:
+ *
+ *   - The signature itself. The provided array is NOT modified.
+ *
+ *   - The hash function which was used to hash the message.
+ *
+ *   - The hash function to use with MGF1 within the PSS padding. This
+ *     is not necessarily the same hash function as the one which was
+ *     used to hash the signed message.
+ *
+ *   - The hashed message (as an array of bytes).
+ *
+ *   - The PSS salt length (in bytes).
+ *
+ *   - The public key.
+ *
+ * **Constraints:**
+ *
+ *   - Hash message length MUST be no more than 64 bytes.
+ *
+ * Note that, contrary to PKCS#1 v1.5 signature, the hash value of the
+ * signed data cannot be extracted from the signature; it must be
+ * provided to the verification function.
+ *
+ * This function verifies that the signature length (`xlen`) matches the
+ * modulus length (this function returns 0 on mismatch). If the modulus
+ * size exceeds the maximum supported RSA size, then the function also
+ * returns 0.
+ *
+ * Returned value is 1 on success, 0 on error.
+ *
+ * Implementations of this type need not be constant-time.
+ *
+ * \param x          signature buffer.
+ * \param xlen       signature length (in bytes).
+ * \param hf_data    hash function applied on the message.
+ * \param hf_mgf1    hash function to use with MGF1.
+ * \param hash       hash value of the signed message.
+ * \param salt_len   PSS salt length (in bytes).
+ * \param pk         RSA public key.
+ * \return  1 on success, 0 on error.
+ */
+typedef uint32_t (*br_rsa_pss_vrfy)(const unsigned char *x, size_t xlen,
+	const br_hash_class *hf_data, const br_hash_class *hf_mgf1, 
+	const void *hash, size_t salt_len, const br_rsa_public_key *pk);
+
 /**
 * \brief Type for a RSA encryption engine (OAEP).
 *
@ -385,6 +435,53 @@ typedef uint32_t (*br_rsa_pkcs1_sign)(const unsigned char *hash_oid,
 	const unsigned char *hash, size_t hash_len,
 	const br_rsa_private_key *sk, unsigned char *x);

+/**
+ * \brief Type for a RSA signature generation engine (PSS).
+ *
+ * Parameters are:
+ *
+ *   - An initialized PRNG for salt generation. If the salt length is
+ *     zero (`salt_len` parameter), then the PRNG is optional (this is
+ *     not the typical case, as the security proof of RSA/PSS is
+ *     tighter when a non-empty salt is used).
+ *
+ *   - The hash function which was used to hash the message.
+ *
+ *   - The hash function to use with MGF1 within the PSS padding. This
+ *     is not necessarily the same function as the one used to hash the
+ *     message.
+ *
+ *   - The hashed message.
+ *
+ *   - The salt length, in bytes.
+ *
+ *   - The RSA private key.
+ *
+ *   - The output buffer, that receives the signature.
+ *
+ * Returned value is 1 on success, 0 on error. Error conditions include
+ * a too small modulus for the provided hash and salt lengths, or some
+ * invalid key parameters. The signature length is exactly
+ * `(sk->n_bitlen+7)/8` bytes.
+ *
+ * This function is expected to be constant-time with regards to the
+ * private key bytes (lengths of the modulus and the individual factors
+ * may leak, though) and to the hashed data.
+ *
+ * \param rng        PRNG for salt generation (`NULL` if `salt_len` is zero).
+ * \param hf_data    hash function used to hash the signed data.
+ * \param hf_mgf1    hash function to use with MGF1.
+ * \param hash_value hashed message.
+ * \param salt_len   salt length (in bytes).
+ * \param sk         RSA private key.
+ * \param x          output buffer for the signature value.
+ * \return  1 on success, 0 on error.
+ */
+typedef uint32_t (*br_rsa_pss_sign)(const br_prng_class **rng,
+	const br_hash_class *hf_data, const br_hash_class *hf_mgf1,
+	const unsigned char *hash_value, size_t salt_len,
+	const br_rsa_private_key *sk, unsigned char *x);
+
 /**
 * \brief Encoded OID for SHA-1 (in RSA PKCS#1 signatures).
 */
@ -476,7 +573,7 @@ uint32_t br_rsa_i32_public(unsigned char *x, size_t xlen,
 	const br_rsa_public_key *pk);

 /**
- * \brief RSA signature verification engine "i32".
+ * \brief RSA signature verification engine "i32" (PKCS#1 v1.5 signatures).
 *
 * \see br_rsa_pkcs1_vrfy
 *
@ -492,6 +589,24 @@ uint32_t br_rsa_i32_pkcs1_vrfy(const unsigned char *x, size_t xlen,
 	const unsigned char *hash_oid, size_t hash_len,
 	const br_rsa_public_key *pk, unsigned char *hash_out);

+/**
+ * \brief RSA signature verification engine "i32" (PSS signatures).
+ *
+ * \see br_rsa_pss_vrfy
+ *
+ * \param x          signature buffer.
+ * \param xlen       signature length (in bytes).
+ * \param hf_data    hash function applied on the message.
+ * \param hf_mgf1    hash function to use with MGF1.
+ * \param hash       hash value of the signed message.
+ * \param salt_len   PSS salt length (in bytes).
+ * \param pk         RSA public key.
+ * \return  1 on success, 0 on error.
+ */
+uint32_t br_rsa_i32_pss_vrfy(const unsigned char *x, size_t xlen,
+	const br_hash_class *hf_data, const br_hash_class *hf_mgf1, 
+	const void *hash, size_t salt_len, const br_rsa_public_key *pk);
+
 /**
 * \brief RSA private key engine "i32".
 *
@ -505,7 +620,7 @@ uint32_t br_rsa_i32_private(unsigned char *x,
 	const br_rsa_private_key *sk);

 /**
- * \brief RSA signature generation engine "i32".
+ * \brief RSA signature generation engine "i32" (PKCS#1 v1.5 signatures).
 *
 * \see br_rsa_pkcs1_sign
 *
@ -520,6 +635,25 @@ uint32_t br_rsa_i32_pkcs1_sign(const unsigned char *hash_oid,
 	const unsigned char *hash, size_t hash_len,
 	const br_rsa_private_key *sk, unsigned char *x);

+/**
+ * \brief RSA signature generation engine "i32" (PSS signatures).
+ *
+ * \see br_rsa_pss_sign
+ *
+ * \param rng        PRNG for salt generation (`NULL` if `salt_len` is zero).
+ * \param hf_data    hash function used to hash the signed data.
+ * \param hf_mgf1    hash function to use with MGF1.
+ * \param hash_value hashed message.
+ * \param salt_len   salt length (in bytes).
+ * \param sk         RSA private key.
+ * \param x          output buffer for the signature value.
+ * \return  1 on success, 0 on error.
+ */
+uint32_t br_rsa_i32_pss_sign(const br_prng_class **rng,
+	const br_hash_class *hf_data, const br_hash_class *hf_mgf1,
+	const unsigned char *hash_value, size_t salt_len,
+	const br_rsa_private_key *sk, unsigned char *x);
+
 /*
 * RSA "i31" engine. Similar to i32, but only 31 bits are used per 32-bit
 * word. This uses slightly more stack space (about 4% more) and code
@ -540,7 +674,7 @@ uint32_t br_rsa_i31_public(unsigned char *x, size_t xlen,
 	const br_rsa_public_key *pk);

 /**
- * \brief RSA signature verification engine "i31".
+ * \brief RSA signature verification engine "i31" (PKCS#1 v1.5 signatures).
 *
 * \see br_rsa_pkcs1_vrfy
 *
@ -556,6 +690,24 @@ uint32_t br_rsa_i31_pkcs1_vrfy(const unsigned char *x, size_t xlen,
 	const unsigned char *hash_oid, size_t hash_len,
 	const br_rsa_public_key *pk, unsigned char *hash_out);

+/**
+ * \brief RSA signature verification engine "i31" (PSS signatures).
+ *
+ * \see br_rsa_pss_vrfy
+ *
+ * \param x          signature buffer.
+ * \param xlen       signature length (in bytes).
+ * \param hf_data    hash function applied on the message.
+ * \param hf_mgf1    hash function to use with MGF1.
+ * \param hash       hash value of the signed message.
+ * \param salt_len   PSS salt length (in bytes).
+ * \param pk         RSA public key.
+ * \return  1 on success, 0 on error.
+ */
+uint32_t br_rsa_i31_pss_vrfy(const unsigned char *x, size_t xlen,
+	const br_hash_class *hf_data, const br_hash_class *hf_mgf1, 
+	const void *hash, size_t salt_len, const br_rsa_public_key *pk);
+
 /**
 * \brief RSA private key engine "i31".
 *
@ -569,7 +721,7 @@ uint32_t br_rsa_i31_private(unsigned char *x,
 	const br_rsa_private_key *sk);

 /**
- * \brief RSA signature generation engine "i31".
+ * \brief RSA signature generation engine "i31" (PKCS#1 v1.5 signatures).
 *
 * \see br_rsa_pkcs1_sign
 *
@ -584,6 +736,25 @@ uint32_t br_rsa_i31_pkcs1_sign(const unsigned char *hash_oid,
 	const unsigned char *hash, size_t hash_len,
 	const br_rsa_private_key *sk, unsigned char *x);

+/**
+ * \brief RSA signature generation engine "i31" (PSS signatures).
+ *
+ * \see br_rsa_pss_sign
+ *
+ * \param rng        PRNG for salt generation (`NULL` if `salt_len` is zero).
+ * \param hf_data    hash function used to hash the signed data.
+ * \param hf_mgf1    hash function to use with MGF1.
+ * \param hash_value hashed message.
+ * \param salt_len   salt length (in bytes).
+ * \param sk         RSA private key.
+ * \param x          output buffer for the signature value.
+ * \return  1 on success, 0 on error.
+ */
+uint32_t br_rsa_i31_pss_sign(const br_prng_class **rng,
+	const br_hash_class *hf_data, const br_hash_class *hf_mgf1,
+	const unsigned char *hash_value, size_t salt_len,
+	const br_rsa_private_key *sk, unsigned char *x);
+
 /*
 * RSA "i62" engine. Similar to i31, but internal multiplication use
 * 64x64->128 multiplications. This is available only on architecture
@ -608,7 +779,7 @@ uint32_t br_rsa_i62_public(unsigned char *x, size_t xlen,
 	const br_rsa_public_key *pk);

 /**
- * \brief RSA signature verification engine "i62".
+ * \brief RSA signature verification engine "i62" (PKCS#1 v1.5 signatures).
 *
 * This function is defined only on architecture that offer a 64x64->128
 * opcode. Use `br_rsa_i62_pkcs1_vrfy_get()` to dynamically obtain a pointer
@ -628,6 +799,28 @@ uint32_t br_rsa_i62_pkcs1_vrfy(const unsigned char *x, size_t xlen,
 	const unsigned char *hash_oid, size_t hash_len,
 	const br_rsa_public_key *pk, unsigned char *hash_out);

+/**
+ * \brief RSA signature verification engine "i62" (PSS signatures).
+ *
+ * This function is defined only on architectures that offer a 64x64->128
+ * opcode. Use `br_rsa_i62_pss_vrfy_get()` to dynamically obtain a pointer
+ * to that function.
+ *
+ * \see br_rsa_pss_vrfy
+ *
+ * \param x          signature buffer.
+ * \param xlen       signature length (in bytes).
+ * \param hf_data    hash function applied on the message.
+ * \param hf_mgf1    hash function to use with MGF1.
+ * \param hash       hash value of the signed message.
+ * \param salt_len   PSS salt length (in bytes).
+ * \param pk         RSA public key.
+ * \return  1 on success, 0 on error.
+ */
+uint32_t br_rsa_i62_pss_vrfy(const unsigned char *x, size_t xlen,
+	const br_hash_class *hf_data, const br_hash_class *hf_mgf1, 
+	const void *hash, size_t salt_len, const br_rsa_public_key *pk);
+
 /**
 * \brief RSA private key engine "i62".
 *
@ -645,7 +838,7 @@ uint32_t br_rsa_i62_private(unsigned char *x,
 	const br_rsa_private_key *sk);

 /**
- * \brief RSA signature generation engine "i62".
+ * \brief RSA signature generation engine "i62" (PKCS#1 v1.5 signatures).
 *
 * This function is defined only on architecture that offer a 64x64->128
 * opcode. Use `br_rsa_i62_pkcs1_sign_get()` to dynamically obtain a pointer
@ -664,6 +857,29 @@ uint32_t br_rsa_i62_pkcs1_sign(const unsigned char *hash_oid,
 	const unsigned char *hash, size_t hash_len,
 	const br_rsa_private_key *sk, unsigned char *x);

+/**
+ * \brief RSA signature generation engine "i62" (PSS signatures).
+ *
+ * This function is defined only on architectures that offer a 64x64->128
+ * opcode. Use `br_rsa_i62_pss_sign_get()` to dynamically obtain a pointer
+ * to that function.
+ *
+ * \see br_rsa_pss_sign
+ *
+ * \param rng        PRNG for salt generation (`NULL` if `salt_len` is zero).
+ * \param hf_data    hash function used to hash the signed data.
+ * \param hf_mgf1    hash function to use with MGF1.
+ * \param hash_value hashed message.
+ * \param salt_len   salt length (in bytes).
+ * \param sk         RSA private key.
+ * \param x          output buffer for the signature value.
+ * \return  1 on success, 0 on error.
+ */
+uint32_t br_rsa_i62_pss_sign(const br_prng_class **rng,
+	const br_hash_class *hf_data, const br_hash_class *hf_mgf1,
+	const unsigned char *hash_value, size_t salt_len,
+	const br_rsa_private_key *sk, unsigned char *x);
+
 /**
 * \brief Get the RSA "i62" implementation (public key operations),
 * if available.
@ -673,13 +889,21 @@ uint32_t br_rsa_i62_pkcs1_sign(const unsigned char *hash_oid,
 br_rsa_public br_rsa_i62_public_get(void);

 /**
- * \brief Get the RSA "i62" implementation (PKCS#1 signature verification),
+ * \brief Get the RSA "i62" implementation (PKCS#1 v1.5 signature verification),
 * if available.
 *
 * \return  the implementation, or 0.
 */
 br_rsa_pkcs1_vrfy br_rsa_i62_pkcs1_vrfy_get(void);

+/**
+ * \brief Get the RSA "i62" implementation (PSS signature verification),
+ * if available.
+ *
+ * \return  the implementation, or 0.
+ */
+br_rsa_pss_vrfy br_rsa_i62_pss_vrfy_get(void);
+
 /**
 * \brief Get the RSA "i62" implementation (private key operations),
 * if available.
@ -689,13 +913,21 @@ br_rsa_pkcs1_vrfy br_rsa_i62_pkcs1_vrfy_get(void);
 br_rsa_private br_rsa_i62_private_get(void);

 /**
- * \brief Get the RSA "i62" implementation (PKCS#1 signature generation),
+ * \brief Get the RSA "i62" implementation (PKCS#1 v1.5 signature generation),
 * if available.
 *
 * \return  the implementation, or 0.
 */
 br_rsa_pkcs1_sign br_rsa_i62_pkcs1_sign_get(void);

+/**
+ * \brief Get the RSA "i62" implementation (PSS signature generation),
+ * if available.
+ *
+ * \return  the implementation, or 0.
+ */
+br_rsa_pss_sign br_rsa_i62_pss_sign_get(void);
+
 /**
 * \brief Get the RSA "i62" implementation (OAEP encryption),
 * if available.
@ -732,7 +964,7 @@ uint32_t br_rsa_i15_public(unsigned char *x, size_t xlen,
 	const br_rsa_public_key *pk);

 /**
- * \brief RSA signature verification engine "i15".
+ * \brief RSA signature verification engine "i15" (PKCS#1 v1.5 signatures).
 *
 * \see br_rsa_pkcs1_vrfy
 *
@ -748,6 +980,24 @@ uint32_t br_rsa_i15_pkcs1_vrfy(const unsigned char *x, size_t xlen,
 	const unsigned char *hash_oid, size_t hash_len,
 	const br_rsa_public_key *pk, unsigned char *hash_out);

+/**
+ * \brief RSA signature verification engine "i15" (PSS signatures).
+ *
+ * \see br_rsa_pss_vrfy
+ *
+ * \param x          signature buffer.
+ * \param xlen       signature length (in bytes).
+ * \param hf_data    hash function applied on the message.
+ * \param hf_mgf1    hash function to use with MGF1.
+ * \param hash       hash value of the signed message.
+ * \param salt_len   PSS salt length (in bytes).
+ * \param pk         RSA public key.
+ * \return  1 on success, 0 on error.
+ */
+uint32_t br_rsa_i15_pss_vrfy(const unsigned char *x, size_t xlen,
+	const br_hash_class *hf_data, const br_hash_class *hf_mgf1, 
+	const void *hash, size_t salt_len, const br_rsa_public_key *pk);
+
 /**
 * \brief RSA private key engine "i15".
 *
@ -761,7 +1011,7 @@ uint32_t br_rsa_i15_private(unsigned char *x,
 	const br_rsa_private_key *sk);

 /**
- * \brief RSA signature generation engine "i15".
+ * \brief RSA signature generation engine "i15" (PKCS#1 v1.5 signatures).
 *
 * \see br_rsa_pkcs1_sign
 *
@ -776,6 +1026,25 @@ uint32_t br_rsa_i15_pkcs1_sign(const unsigned char *hash_oid,
 	const unsigned char *hash, size_t hash_len,
 	const br_rsa_private_key *sk, unsigned char *x);

+/**
+ * \brief RSA signature generation engine "i15" (PSS signatures).
+ *
+ * \see br_rsa_pss_sign
+ *
+ * \param rng        PRNG for salt generation (`NULL` if `salt_len` is zero).
+ * \param hf_data    hash function used to hash the signed data.
+ * \param hf_mgf1    hash function to use with MGF1.
+ * \param hash       hashed message.
+ * \param salt_len   salt length (in bytes).
+ * \param sk         RSA private key.
+ * \param x          output buffer for the signature value.
+ * \return  1 on success, 0 on error.
+ */
+uint32_t br_rsa_i15_pss_sign(const br_prng_class **rng,
+	const br_hash_class *hf_data, const br_hash_class *hf_mgf1,
+	const unsigned char *hash_value, size_t salt_len,
+	const br_rsa_private_key *sk, unsigned char *x);
+
 /**
 * \brief Get "default" RSA implementation (public-key operations).
 *
@ -797,7 +1066,7 @@ br_rsa_public br_rsa_public_get_default(void);
 br_rsa_private br_rsa_private_get_default(void);

 /**
- * \brief Get "default" RSA implementation (PKCS#1 signature verification).
+ * \brief Get "default" RSA implementation (PKCS#1 v1.5 signature verification).
 *
 * This returns the preferred implementation of RSA (signature verification)
 * on the current system.
@ -807,7 +1076,17 @@ br_rsa_private br_rsa_private_get_default(void);
 br_rsa_pkcs1_vrfy br_rsa_pkcs1_vrfy_get_default(void);

 /**
- * \brief Get "default" RSA implementation (PKCS#1 signature generation).
+ * \brief Get "default" RSA implementation (PSS signature verification).
+ *
+ * This returns the preferred implementation of RSA (signature verification)
+ * on the current system.
+ *
+ * \return  the default implementation.
+ */
+br_rsa_pss_vrfy br_rsa_pss_vrfy_get_default(void);
+
+/**
+ * \brief Get "default" RSA implementation (PKCS#1 v1.5 signature generation).
 *
 * This returns the preferred implementation of RSA (signature generation)
 * on the current system.
@ -816,6 +1095,16 @@ br_rsa_pkcs1_vrfy br_rsa_pkcs1_vrfy_get_default(void);
 */
 br_rsa_pkcs1_sign br_rsa_pkcs1_sign_get_default(void);

+/**
+ * \brief Get "default" RSA implementation (PSS signature generation).
+ *
+ * This returns the preferred implementation of RSA (signature generation)
+ * on the current system.
+ *
+ * \return  the default implementation.
+ */
+br_rsa_pss_sign br_rsa_pss_sign_get_default(void);
+
 /**
 * \brief Get "default" RSA implementation (OAEP encryption).
 *
--- a/src/bearssl_ssl.h
+++ b/src/bearssl_ssl.h
@ -1250,8 +1250,8 @@ static inline void
 br_ssl_engine_set_versions(br_ssl_engine_context *cc,
 	unsigned version_min, unsigned version_max)
 {
-	cc->version_min = version_min;
-	cc->version_max = version_max;
+	cc->version_min = (uint16_t)version_min;
+	cc->version_max = (uint16_t)version_max;
 }

 /**
@ -1324,7 +1324,7 @@ br_ssl_engine_set_protocol_names(br_ssl_engine_context *ctx,
 	const char **names, size_t num)
 {
 	ctx->protocol_names = names;
-	ctx->protocol_names_num = num;
+	ctx->protocol_names_num = (uint16_t)num;
 }

 /**
@ -2102,7 +2102,7 @@ void br_ssl_engine_sendapp_ack(br_ssl_engine_context *cc, size_t len);
 /**
 * \brief Get buffer for received application data.
 *
- * If the engine has received application data from the peer, hen this
+ * If the engine has received application data from the peer, then this
 * call returns a pointer to the buffer from where such data shall be
 * read, and its length is written in `*len`. Otherwise, `*len` is set
 * to 0 and `NULL` is returned.
@ -4154,20 +4154,6 @@ int br_sslio_flush(br_sslio_context *cc);
 */
 int br_sslio_close(br_sslio_context *cc);

-/*
- * Run the engine, until the specified target state is achieved, or
- * an error occurs. The target state is SENDAPP, RECVAPP, or the
- * combination of both (the combination matches either). When a match is
- * achieved, this function returns 0. On error, it returns -1.
- * 
- * Static function made public since we would like to be able to
- * initialize the ssl socket in a single function
- * 
- * \return  0 on success, or -1 on error.
- */
-int
-br_run_until(br_sslio_context *ctx, unsigned target);
-
 /* ===================================================================== */

 /*
--- a/src/config.h
+++ b/src/config.h
@ -108,9 +108,27 @@
 #define BR_RDRAND   1
 */

+/*
+ * When BR_USE_GETENTROPY is enabled, the SSL engine will use the
+ * getentropy() function to obtain quality randomness for seeding its
+ * internal PRNG. On Linux and FreeBSD, getentropy() is implemented by
+ * the standard library with the system call getrandom(); on OpenBSD,
+ * getentropy() is the system call, and there is no getrandom() wrapper,
+ * hence the use of the getentropy() function for maximum portability.
+ *
+ * If the getentropy() call fails, and BR_USE_URANDOM is not explicitly
+ * disabled, then /dev/urandom will be used as a fallback mechanism. On
+ * FreeBSD and OpenBSD, this does not change much, since /dev/urandom
+ * will block if not enough entropy has been obtained since last boot.
+ * On Linux, /dev/urandom might not block, which can be troublesome in
+ * early boot stages, which is why getentropy() is preferred.
+ *
+#define BR_USE_GETENTROPY   1
+ */
+
 /*
 * When BR_USE_URANDOM is enabled, the SSL engine will use /dev/urandom
- * to automatically obtain quality randomness for seedings its internal
+ * to automatically obtain quality randomness for seeding its internal
 * PRNG.
 *
 #define BR_USE_URANDOM   1
@ -119,7 +137,7 @@
 /*
 * When BR_USE_WIN32_RAND is enabled, the SSL engine will use the Win32
 * (CryptoAPI) functions (CryptAcquireContext(), CryptGenRandom()...) to
- * automatically obtain quality randomness for seedings its internal PRNG.
+ * automatically obtain quality randomness for seeding its internal PRNG.
 *
 * Note: if both BR_USE_URANDOM and BR_USE_WIN32_RAND are defined, the
 * former takes precedence.
@ -132,10 +150,10 @@
 * the current time from the OS by calling time(), and assuming that the
 * returned value (a 'time_t') is an integer that counts time in seconds
 * since the Unix Epoch (Jan 1st, 1970, 00:00 UTC).
- *
 */
 #define BR_USE_UNIX_TIME   0

+
 /*
 * When BR_USE_WIN32_TIME is enabled, the X.509 validation engine obtains
 * the current time from the OS by calling the Win32 function
@ -143,8 +161,9 @@
 *
 * Note: if both BR_USE_UNIX_TIME and BR_USE_WIN32_TIME are defined, the
 * former takes precedence.
+ *
+#define BR_USE_WIN32_TIME   1
 */
-#define BR_USE_WIN32_TIME   0

 /*
 * When BR_ARMEL_CORTEXM_GCC is enabled, some operations are replaced with
@ -158,9 +177,7 @@
 * Note: if BR_LOMUL is not explicitly enabled or disabled, then
 * enabling BR_ARMEL_CORTEXM_GCC also enables BR_LOMUL.
 */
-#ifdef ARDUINO_ARCH_SAMD
 #define BR_ARMEL_CORTEXM_GCC   1
-#endif

 /*
 * When BR_AES_X86NI is enabled, the AES implementation using the x86 "NI"
--- a/src/inner.h
+++ b/src/inner.h
@ -114,6 +114,10 @@
 #define BR_64   1
 #elif defined(__x86_64__) || defined(_M_X64)
 #define BR_64   1
+#elif defined(__aarch64__) || defined(_M_ARM64)
+#define BR_64   1
+#elif defined(__mips64)
+#define BR_64   1
 #endif
 #endif

@ -305,9 +309,20 @@
 * values are documented on:
 *    https://sourceforge.net/p/predef/wiki/OperatingSystems/
 *
- * TODO: enrich the list of detected system. Also add detection for
- * alternate system calls like getentropy(), which are usually
- * preferable when available.
+ * Win32's CryptGenRandom() should be available on Windows systems.
+ *
+ * /dev/urandom should work on all Unix-like systems (including macOS).
+ *
+ * getentropy() is present on Linux (Glibc 2.25+), FreeBSD (12.0+) and
+ * OpenBSD (5.6+). For OpenBSD, there do not seem to be easy-to-use
+ * macros to test the minimum version, so we just assume that it is
+ * recent enough (last version without getentropy() has gone out of
+ * support in May 2015).
+ *
+ * Ideally we should use getentropy() on macOS (10.12+) too, but I don't
+ * know how to test the exact OS version with preprocessor macros.
+ *
+ * TODO: enrich the list of detected systems.
 */

 #ifndef BR_USE_URANDOM
@ -324,6 +339,15 @@
 #endif
 #endif

+#ifndef BR_USE_GETENTROPY
+#if (defined __linux__ \
+	&& (__GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 25))) \
+	|| (defined __FreeBSD__ && __FreeBSD__ >= 12) \
+	|| defined __OpenBSD__
+#define BR_USE_GETENTROPY   1
+#endif
+#endif
+
 #ifndef BR_USE_WIN32_RAND
 #if defined _WIN32 || defined _WIN64
 #define BR_USE_WIN32_RAND   1
@ -1943,6 +1967,27 @@ uint32_t br_rsa_pkcs1_sig_unpad(const unsigned char *sig, size_t sig_len,
 	const unsigned char *hash_oid, size_t hash_len,
 	unsigned char *hash_out);

+/*
+ * Apply proper PSS padding. The 'x' buffer is output only: it
+ * receives the value that is to be exponentiated.
+ */
+uint32_t br_rsa_pss_sig_pad(const br_prng_class **rng,
+	const br_hash_class *hf_data, const br_hash_class *hf_mgf1,
+	const unsigned char *hash, size_t salt_len,
+	uint32_t n_bitlen, unsigned char *x);
+
+/*
+ * Check PSS padding. The provided value is the one _after_
+ * the modular exponentiation; it is modified by this function.
+ * This function infers the signature length from the public key
+ * size, i.e. it assumes that this has already been verified (as
+ * part of the exponentiation).
+ */
+uint32_t br_rsa_pss_sig_unpad(
+	const br_hash_class *hf_data, const br_hash_class *hf_mgf1,
+	const unsigned char *hash, size_t salt_len,
+	const br_rsa_public_key *pk, unsigned char *x);
+
 /*
 * Apply OAEP padding. Returned value is the actual padded string length,
 * or zero on error.
@ -2448,8 +2493,8 @@ int br_ssl_choose_hash(unsigned bf);
 #else
 #define BR_TARGETS_X86_UP \
 	_Pragma("GCC target(\"sse2,ssse3,sse4.1,aes,pclmul\")")
-#endif
 #define BR_TARGETS_X86_DOWN
+#endif
 #pragma GCC diagnostic ignored "-Wpsabi"
 #endif