diff --git a/src/SSLClient.cpp b/src/SSLClient.cpp
index 1d62fea..d1314be 100644
--- a/src/SSLClient.cpp
+++ b/src/SSLClient.cpp
@@ -392,6 +392,13 @@ int SSLClient::m_run_until(const unsigned target) {
         unsigned state = m_update_engine();
         // error check
         if (state == BR_SSL_CLOSED || getWriteError() != SSL_OK) {
+            if (state == BR_SSL_CLOSED) {
+                m_warn("Terminating because the ssl engine closed", func_name);
+            }
+            else {
+                m_warn("Terminating with write error: ", func_name);
+                m_warn(getWriteError(), func_name);
+            }
             return -1;
         }
         // timeout check
@@ -406,7 +413,7 @@ int SSLClient::m_run_until(const unsigned target) {
             lastState = state;
             m_info("m_run changed state:", func_name);
             if(m_debug == DebugLevel::SSL_INFO) {
-                m_info("State: ", __func__);
+                m_info("State: ", func_name);
                 if(state == 0) Serial.println("    Invalid");
                 else if (state & BR_SSL_CLOSED) Serial.println("    Connection closed");
                 else {
@@ -728,6 +735,6 @@ void SSLClient::m_print_br_error(const unsigned br_error_code, const DebugLevel
         case BR_ERR_X509_FORBIDDEN_KEY_USAGE: Serial.println("Key Usage extension prohibits intended usage."); break;
         case BR_ERR_X509_WEAK_PUBLIC_KEY: Serial.println("Public key found in certificate is too small."); break;
         case BR_ERR_X509_NOT_TRUSTED: Serial.println("Chain could not be linked to a trust anchor."); break;
-        default: Serial.println("Unknown error code."); break;
+        default: Serial.print("Unknown error code: "); Serial.println(br_error_code); break;
     }
 }
diff --git a/src/bearssl/src/ec/ec_all_m31.c b/src/bearssl/src/ec/ec_all_m31.c
index 0552c4b..8fd8c3c 100644
--- a/src/bearssl/src/ec/ec_all_m31.c
+++ b/src/bearssl/src/ec/ec_all_m31.c
@@ -29,9 +29,17 @@ api_generator(int curve, size_t *len)
 {
 	switch (curve) {
 	case BR_EC_secp256r1:
+#if BR_INT128 || BR_UMUL128
+		return br_ec_p256_m64.generator(curve, len);
+#else
 		return br_ec_p256_m31.generator(curve, len);
+#endif
 	case BR_EC_curve25519:
+#if BR_INT128 || BR_UMUL128
+		return br_ec_c25519_m64.generator(curve, len);
+#else
 		return br_ec_c25519_m31.generator(curve, len);
+#endif
 	default:
 		return br_ec_prime_i31.generator(curve, len);
 	}
@@ -42,9 +50,17 @@ api_order(int curve, size_t *len)
 {
 	switch (curve) {
 	case BR_EC_secp256r1:
+#if BR_INT128 || BR_UMUL128
+		return br_ec_p256_m64.order(curve, len);
+#else
 		return br_ec_p256_m31.order(curve, len);
+#endif
 	case BR_EC_curve25519:
+#if BR_INT128 || BR_UMUL128
+		return br_ec_c25519_m64.order(curve, len);
+#else
 		return br_ec_c25519_m31.order(curve, len);
+#endif
 	default:
 		return br_ec_prime_i31.order(curve, len);
 	}
@@ -55,9 +71,17 @@ api_xoff(int curve, size_t *len)
 {
 	switch (curve) {
 	case BR_EC_secp256r1:
+#if BR_INT128 || BR_UMUL128
+		return br_ec_p256_m64.xoff(curve, len);
+#else
 		return br_ec_p256_m31.xoff(curve, len);
+#endif
 	case BR_EC_curve25519:
+#if BR_INT128 || BR_UMUL128
+		return br_ec_c25519_m64.xoff(curve, len);
+#else
 		return br_ec_c25519_m31.xoff(curve, len);
+#endif
 	default:
 		return br_ec_prime_i31.xoff(curve, len);
 	}
@@ -69,9 +93,17 @@ api_mul(unsigned char *G, size_t Glen,
 {
 	switch (curve) {
 	case BR_EC_secp256r1:
+#if BR_INT128 || BR_UMUL128
+		return br_ec_p256_m64.mul(G, Glen, kb, kblen, curve);
+#else
 		return br_ec_p256_m31.mul(G, Glen, kb, kblen, curve);
+#endif
 	case BR_EC_curve25519:
+#if BR_INT128 || BR_UMUL128
+		return br_ec_c25519_m64.mul(G, Glen, kb, kblen, curve);
+#else
 		return br_ec_c25519_m31.mul(G, Glen, kb, kblen, curve);
+#endif
 	default:
 		return br_ec_prime_i31.mul(G, Glen, kb, kblen, curve);
 	}
@@ -83,9 +115,17 @@ api_mulgen(unsigned char *R,
 {
 	switch (curve) {
 	case BR_EC_secp256r1:
+#if BR_INT128 || BR_UMUL128
+		return br_ec_p256_m64.mulgen(R, x, xlen, curve);
+#else
 		return br_ec_p256_m31.mulgen(R, x, xlen, curve);
+#endif
 	case BR_EC_curve25519:
+#if BR_INT128 || BR_UMUL128
+		return br_ec_c25519_m64.mulgen(R, x, xlen, curve);
+#else
 		return br_ec_c25519_m31.mulgen(R, x, xlen, curve);
+#endif
 	default:
 		return br_ec_prime_i31.mulgen(R, x, xlen, curve);
 	}
@@ -98,11 +138,21 @@ api_muladd(unsigned char *A, const unsigned char *B, size_t len,
 {
 	switch (curve) {
 	case BR_EC_secp256r1:
+#if BR_INT128 || BR_UMUL128
+		return br_ec_p256_m64.muladd(A, B, len,
+			x, xlen, y, ylen, curve);
+#else
 		return br_ec_p256_m31.muladd(A, B, len,
 			x, xlen, y, ylen, curve);
+#endif
 	case BR_EC_curve25519:
+#if BR_INT128 || BR_UMUL128
+		return br_ec_c25519_m64.muladd(A, B, len,
+			x, xlen, y, ylen, curve);
+#else
 		return br_ec_c25519_m31.muladd(A, B, len,
 			x, xlen, y, ylen, curve);
+#endif
 	default:
 		return br_ec_prime_i31.muladd(A, B, len,
 			x, xlen, y, ylen, curve);
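
Reviewer note on the dispatch above: BR_INT128 and BR_UMUL128 come from BearSSL's inner/config headers. The first is set when the compiler provides unsigned __int128 (GCC/Clang on 64-bit targets), the second when the MSVC _umul128() intrinsic is available. A minimal sketch (illustrative test program, not part of this patch) to see which Curve25519 back end a given build actually gets:

	#include <stdio.h>
	#include "bearssl_ec.h"

	int main(void)
	{
		/*
		 * br_ec_c25519_m64_get() returns 0 when the 64-bit code was
		 * compiled out (see the #else branch at the end of the new
		 * ec_c25519_m64.c below).
		 */
		if (br_ec_c25519_m64_get() != 0) {
			printf("64-bit (m64) Curve25519 code selected\n");
		} else {
			printf("32-bit (m31) fallback in use\n");
		}
		return 0;
	}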
diff --git a/src/bearssl/src/ec/ec_c25519_i15.c b/src/bearssl/src/ec/ec_c25519_i15.c
index 361e75f..8fadcf4 100644
--- a/src/bearssl/src/ec/ec_c25519_i15.c
+++ b/src/bearssl/src/ec/ec_c25519_i15.c
@@ -239,11 +239,11 @@ api_mul(unsigned char *G, size_t Glen,
 	x2[1] = 19;
 	memcpy(z3, x2, ILEN);
 
-	memcpy(k, kb, kblen);
-	memset(k + kblen, 0, (sizeof k) - kblen);
-	k[0] &= 0xF8;
-	k[31] &= 0x7F;
-	k[31] |= 0x40;
+	memset(k, 0, (sizeof k) - kblen);
+	memcpy(k + (sizeof k) - kblen, kb, kblen);
+	k[31] &= 0xF8;
+	k[0] &= 0x7F;
+	k[0] |= 0x40;
 
 	/* obsolete
 	print_int_mont("x1", x1);
@@ -253,7 +253,7 @@ api_mul(unsigned char *G, size_t Glen,
 	for (i = 254; i >= 0; i --) {
 		uint32_t kt;
 
-		kt = (k[i >> 3] >> (i & 7)) & 1;
+		kt = (k[31 - (i >> 3)] >> (i & 7)) & 1;
 		swap ^= kt;
 		cswap(x2, x3, swap);
 		cswap(z2, z3, swap);
diff --git a/src/bearssl/src/ec/ec_c25519_i31.c b/src/bearssl/src/ec/ec_c25519_i31.c
index aa88dd6..f8ffc2c 100644
--- a/src/bearssl/src/ec/ec_c25519_i31.c
+++ b/src/bearssl/src/ec/ec_c25519_i31.c
@@ -214,7 +214,7 @@ api_mul(unsigned char *G, size_t Glen,
 	 * br_i31_decode_reduce(a, G, 32, C255_P);
 	 */
 	br_i31_zero(b, 0x108);
-	b[9] = 0x0100;
+	b[9] = 0x0080;
 	br_i31_decode_mod(a, G, 32, b);
 	a[0] = 0x107;
 	br_i31_sub(a, C255_P, NOT(br_i31_sub(a, C255_P, 0)));
@@ -230,11 +230,14 @@ api_mul(unsigned char *G, size_t Glen,
 	x2[1] = 0x13000000;
 	memcpy(z3, x2, sizeof x2);
 
-	memcpy(k, kb, kblen);
-	memset(k + kblen, 0, (sizeof k) - kblen);
-	k[0] &= 0xF8;
-	k[31] &= 0x7F;
-	k[31] |= 0x40;
+	/*
+	 * kb[] is in big-endian notation, but possibly shorter than k[].
+	 */
+	memset(k, 0, (sizeof k) - kblen);
+	memcpy(k + (sizeof k) - kblen, kb, kblen);
+	k[31] &= 0xF8;
+	k[0] &= 0x7F;
+	k[0] |= 0x40;
 
 	/* obsolete
 	print_int_mont("x1", x1);
@@ -244,7 +247,7 @@ api_mul(unsigned char *G, size_t Glen,
 	for (i = 254; i >= 0; i --) {
 		uint32_t kt;
 
-		kt = (k[i >> 3] >> (i & 7)) & 1;
+		kt = (k[31 - (i >> 3)] >> (i & 7)) & 1;
 		swap ^= kt;
 		cswap(x2, x3, swap);
 		cswap(z2, z3, swap);
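
The five changed initialization lines in each file above are the RFC 7748 "clamping" step, adjusted for byte order: the multiplier kb[] is now treated as big-endian, so the three low bits to clear live in the last byte (k[31]) and the two top bits to adjust live in the first byte (k[0]). A standalone sketch of the same rule (hypothetical helper, for illustration; RFC 7748 states it for little-endian buffers):

	#include <stdint.h>

	/* Clamp a Curve25519 scalar held in big-endian order in k[32]. */
	static void clamp_scalar_be(uint8_t k[32])
	{
		k[31] &= 0xF8;	/* clear the 3 least significant bits */
		k[0]  &= 0x7F;	/* clear bit 255 */
		k[0]  |= 0x40;	/* set bit 254 */
	}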
diff --git a/src/bearssl/src/ec/ec_c25519_m15.c b/src/bearssl/src/ec/ec_c25519_m15.c
index 0373197..deff55b 100644
--- a/src/bearssl/src/ec/ec_c25519_m15.c
+++ b/src/bearssl/src/ec/ec_c25519_m15.c
@@ -1332,11 +1332,11 @@ api_mul(unsigned char *G, size_t Glen,
 	memset(z3, 0, sizeof z3);
 	z3[0] = 1;
 
-	memcpy(k, kb, kblen);
-	memset(k + kblen, 0, (sizeof k) - kblen);
-	k[0] &= 0xF8;
-	k[31] &= 0x7F;
-	k[31] |= 0x40;
+	memset(k, 0, (sizeof k) - kblen);
+	memcpy(k + (sizeof k) - kblen, kb, kblen);
+	k[31] &= 0xF8;
+	k[0] &= 0x7F;
+	k[0] |= 0x40;
 
 	/* obsolete
 	print_int("x1", x1);
@@ -1346,7 +1346,7 @@ api_mul(unsigned char *G, size_t Glen,
 	for (i = 254; i >= 0; i --) {
 		uint32_t kt;
 
-		kt = (k[i >> 3] >> (i & 7)) & 1;
+		kt = (k[31 - (i >> 3)] >> (i & 7)) & 1;
 		swap ^= kt;
 		cswap(x2, x3, swap);
 		cswap(z2, z3, swap);
diff --git a/src/bearssl/src/ec/ec_c25519_m31.c b/src/bearssl/src/ec/ec_c25519_m31.c
index b249634..1dd6d51 100644
--- a/src/bearssl/src/ec/ec_c25519_m31.c
+++ b/src/bearssl/src/ec/ec_c25519_m31.c
@@ -372,8 +372,7 @@ reduce_final_f255(uint32_t *d)
 static void
 f255_mul(uint32_t *d, const uint32_t *a, const uint32_t *b)
 {
-	uint32_t t[18];
-	uint64_t cc, w;
+	uint32_t t[18], cc;
 	int i;
 
 	/*
@@ -389,21 +388,42 @@ f255_mul(uint32_t *d, const uint32_t *a, const uint32_t *b)
 	 * offset 9*30 = 270, word 9+k must be added to word k with
 	 * a factor of 19*2^15 = 622592. The extra bits in word 8 are also
 	 * added that way.
+	 *
+	 * Keeping the carry on 32 bits helps with 32-bit architectures,
+	 * and does not noticeably impact performance on 64-bit systems.
 	 */
-	cc = MUL31(t[8] >> 15, 19);
+	cc = MUL15(t[8] >> 15, 19);  /* at most 19*(2^15-1) = 622573 */
 	t[8] &= 0x7FFF;
 	for (i = 0; i < 9; i ++) {
-		w = (uint64_t)t[i] + cc + MUL31(t[i + 9], 622592);
+		uint64_t w;
+
+		w = (uint64_t)t[i] + (uint64_t)cc + MUL31(t[i + 9], 622592);
 		t[i] = (uint32_t)w & 0x3FFFFFFF;
-		cc = w >> 30;
+		cc = (uint32_t)(w >> 30);  /* at most 622592 */
 	}
+
+	/*
+	 * Original product was up to (2^256-1)^2, i.e. a 512-bit integer.
+	 * This was split into two parts (upper of 257 bits, lower of 255
+	 * bits), and the upper was added to the lower with a factor 19,
+	 * which means that the intermediate value is less than 77*2^255
+	 * (19*2^257 + 2^255). Therefore, the extra bits "t[8] >> 15" are
+	 * less than 77, and the initial carry cc is at most 76*19 = 1444.
+	 */
+	cc = MUL15(t[8] >> 15, 19);
 	t[8] &= 0x7FFF;
 	for (i = 0; i < 9; i ++) {
-		w = t[i] + cc;
-		d[i] = (uint32_t)w & 0x3FFFFFFF;
-		cc = w >> 30;
+		uint32_t z;
+
+		z = t[i] + cc;
+		d[i] = z & 0x3FFFFFFF;
+		cc = z >> 30;
 	}
+
+	/*
+	 * Final result is at most 2^255 + 1443. In particular, the last
+	 * carry is necessarily 0, since t[8] was truncated to 15 bits.
+	 */
 }
 
 /*
@@ -415,8 +435,7 @@ f255_mul(uint32_t *d, const uint32_t *a, const uint32_t *b)
 static void
 f255_square(uint32_t *d, const uint32_t *a)
 {
-	uint32_t t[18];
-	uint64_t cc, w;
+	uint32_t t[18], cc;
 	int i;
 
 	/*
@@ -428,24 +447,25 @@ f255_square(uint32_t *d, const uint32_t *a)
 
 	/*
 	 * Modular reduction: each high word is added where necessary.
-	 * Since the modulus is 2^255-19 and word 9 corresponds to
-	 * offset 9*30 = 270, word 9+k must be added to word k with
-	 * a factor of 19*2^15 = 622592. The extra bits in word 8 are also
-	 * added that way.
+	 * See f255_mul() for details on the reduction and carry limits.
 	 */
-	cc = MUL31(t[8] >> 15, 19);
+	cc = MUL15(t[8] >> 15, 19);
 	t[8] &= 0x7FFF;
 	for (i = 0; i < 9; i ++) {
-		w = (uint64_t)t[i] + cc + MUL31(t[i + 9], 622592);
+		uint64_t w;
+
+		w = (uint64_t)t[i] + (uint64_t)cc + MUL31(t[i + 9], 622592);
 		t[i] = (uint32_t)w & 0x3FFFFFFF;
-		cc = w >> 30;
+		cc = (uint32_t)(w >> 30);
 	}
-	cc = MUL31(w >> 15, 19);
+	cc = MUL15(t[8] >> 15, 19);
 	t[8] &= 0x7FFF;
 	for (i = 0; i < 9; i ++) {
-		w = t[i] + cc;
-		d[i] = (uint32_t)w & 0x3FFFFFFF;
-		cc = w >> 30;
+		uint32_t z;
+
+		z = t[i] + cc;
+		d[i] = z & 0x3FFFFFFF;
+		cc = z >> 30;
 	}
 }
 
@@ -515,20 +535,31 @@ static void
 f255_mul_a24(uint32_t *d, const uint32_t *a)
 {
 	int i;
-	uint64_t cc, w;
+	uint64_t w;
+	uint32_t cc;
 
+	/*
+	 * a[] fits on 256 bits, thus a[8] is at most 16 bits long.
+	 * We single out the processing of the last word: intermediate
+	 * value w is up to 121665*2^16, yielding a carry for the next
+	 * loop of at most 19*(121665*2^16/2^15) = 4623270.
+	 */
 	cc = 0;
-	for (i = 0; i < 9; i ++) {
-		w = MUL31(a[i], 121665) + cc;
+	for (i = 0; i < 8; i ++) {
+		w = MUL31(a[i], 121665) + (uint64_t)cc;
 		d[i] = (uint32_t)w & 0x3FFFFFFF;
-		cc = w >> 30;
+		cc = (uint32_t)(w >> 30);
 	}
-	cc = MUL31((uint32_t)(w >> 15), 19);
-	d[8] &= 0x7FFF;
+	w = MUL31(a[8], 121665) + (uint64_t)cc;
+	d[8] = (uint32_t)w & 0x7FFF;
+	cc = MUL15((uint32_t)(w >> 15), 19);
+
 	for (i = 0; i < 9; i ++) {
-		w = (uint64_t)d[i] + cc;
-		d[i] = w & 0x3FFFFFFF;
-		cc = w >> 30;
+		uint32_t z;
+
+		z = d[i] + cc;
+		d[i] = z & 0x3FFFFFFF;
+		cc = z >> 30;
 	}
 }
 
@@ -623,11 +654,11 @@ api_mul(unsigned char *G, size_t Glen,
 	memset(z3, 0, sizeof z3);
 	z3[0] = 1;
 
-	memcpy(k, kb, kblen);
-	memset(k + kblen, 0, (sizeof k) - kblen);
-	k[0] &= 0xF8;
-	k[31] &= 0x7F;
-	k[31] |= 0x40;
+	memset(k, 0, (sizeof k) - kblen);
+	memcpy(k + (sizeof k) - kblen, kb, kblen);
+	k[31] &= 0xF8;
+	k[0] &= 0x7F;
+	k[0] |= 0x40;
 
 	/* obsolete
 	print_int("x1", x1);
@@ -637,7 +668,7 @@ api_mul(unsigned char *G, size_t Glen,
 	for (i = 254; i >= 0; i --) {
 		uint32_t kt;
 
-		kt = (k[i >> 3] >> (i & 7)) & 1;
+		kt = (k[31 - (i >> 3)] >> (i & 7)) & 1;
 		swap ^= kt;
 		cswap(x2, x3, swap);
 		cswap(z2, z3, swap);
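
Two patterns recur in all the Curve25519 files above. First, with the scalar stored big-endian, the ladder reads bit i (little-endian bit numbering) as k[31 - (i >> 3)] >> (i & 7). Second, for the base-2^30 code, the folding constant follows from 2^255 = 19 mod p: word 9 sits at bit offset 9*30 = 270 = 255 + 15, so 2^270 = 19*2^15 = 622592 mod p, which is exactly the factor in the MUL31(t[i + 9], 622592) folds. A standalone sketch of the bit accessor (hypothetical helper, for illustration):

	#include <stdint.h>

	/* Bit i (0 = least significant) of a scalar stored big-endian in k[32]. */
	static int scalar_bit_be(const uint8_t k[32], int i)
	{
		return (k[31 - (i >> 3)] >> (i & 7)) & 1;
	}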
diff --git a/src/bearssl/src/ec/ec_c25519_m62.c b/src/bearssl/src/ec/ec_c25519_m62.c
new file mode 100644
index 0000000..6b058eb
--- /dev/null
+++ b/src/bearssl/src/ec/ec_c25519_m62.c
@@ -0,0 +1,605 @@
+/*
+ * Copyright (c) 2018 Thomas Pornin
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+#if BR_INT128 || BR_UMUL128
+
+#if BR_UMUL128
+#include <intrin.h>
+#endif
+
+static const unsigned char GEN[] = {
+	0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static const unsigned char ORDER[] = {
+	0x7F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
+};
+
+static const unsigned char *
+api_generator(int curve, size_t *len)
+{
+	(void)curve;
+	*len = 32;
+	return GEN;
+}
+
+static const unsigned char *
+api_order(int curve, size_t *len)
+{
+	(void)curve;
+	*len = 32;
+	return ORDER;
+}
+
+static size_t
+api_xoff(int curve, size_t *len)
+{
+	(void)curve;
+	*len = 32;
+	return 0;
+}
+
+/*
+ * A field element is encoded as five 64-bit integers, in basis 2^51.
+ * Limbs may be occasionally larger than 2^51, to save on carry
+ * propagation costs.
+ */
+
+#define MASK51   (((uint64_t)1 << 51) - (uint64_t)1)
+
+/*
+ * Swap two field elements, conditionally on a flag.
+ */
+static inline void
+f255_cswap(uint64_t *a, uint64_t *b, uint32_t ctl)
+{
+	uint64_t m, w;
+
+	m = -(uint64_t)ctl;
+	w = m & (a[0] ^ b[0]); a[0] ^= w; b[0] ^= w;
+	w = m & (a[1] ^ b[1]); a[1] ^= w; b[1] ^= w;
+	w = m & (a[2] ^ b[2]); a[2] ^= w; b[2] ^= w;
+	w = m & (a[3] ^ b[3]); a[3] ^= w; b[3] ^= w;
+	w = m & (a[4] ^ b[4]); a[4] ^= w; b[4] ^= w;
+}
+
+/*
+ * Addition with no carry propagation. Limbs double in size.
+ */
+static inline void
+f255_add(uint64_t *d, const uint64_t *a, const uint64_t *b)
+{
+	d[0] = a[0] + b[0];
+	d[1] = a[1] + b[1];
+	d[2] = a[2] + b[2];
+	d[3] = a[3] + b[3];
+	d[4] = a[4] + b[4];
+}
+
+/*
+ * Subtraction.
+ * On input, limbs must fit on 60 bits each. On output, result is
+ * partially reduced, with max value 2^255+19456; moreover, all
+ * limbs will fit on 51 bits, except the low limb, which may have
+ * value up to 2^51+19455.
+ */
+static inline void
+f255_sub(uint64_t *d, const uint64_t *a, const uint64_t *b)
+{
+	uint64_t cc, w;
+
+	/*
+	 * We compute d = (2^255-19)*1024 + a - b. Since the limbs
+	 * fit on 60 bits, the maximum value of the operands is slightly
+	 * more than 2^264, but much less than 2^265-19456. This
+	 * ensures that the result is positive.
+	 */
+
+	/*
+	 * Initial carry is 19456, since we add 2^265-19456. Each
+	 * individual subtraction may yield a carry up to 513.
+	 */
+	w = a[0] - b[0] - 19456;
+	d[0] = w & MASK51;
+	cc = -(w >> 51) & 0x3FF;
+	w = a[1] - b[1] - cc;
+	d[1] = w & MASK51;
+	cc = -(w >> 51) & 0x3FF;
+	w = a[2] - b[2] - cc;
+	d[2] = w & MASK51;
+	cc = -(w >> 51) & 0x3FF;
+	w = a[3] - b[3] - cc;
+	d[3] = w & MASK51;
+	cc = -(w >> 51) & 0x3FF;
+	d[4] = ((uint64_t)1 << 61) + a[4] - b[4] - cc;
+
+	/*
+	 * Partial reduction. The intermediate result may be up to
+	 * slightly above 2^265, but less than 2^265+2^255. When we
+	 * truncate to 255 bits, the upper bits will be at most 1024.
+	 */
+	d[0] += 19 * (d[4] >> 51);
+	d[4] &= MASK51;
+}
+
+/*
+ * UMUL51(hi, lo, x, y) computes:
+ *
+ *   hi = floor((x * y) / (2^51))
+ *   lo = x * y mod 2^51
+ *
+ * Note that lo < 2^51, but "hi" may be larger, if the input operands are
+ * larger.
+ */
+#if BR_INT128
+
+#define UMUL51(hi, lo, x, y)   do { \
+		unsigned __int128 umul_tmp; \
+		umul_tmp = (unsigned __int128)(x) * (unsigned __int128)(y); \
+		(hi) = (uint64_t)(umul_tmp >> 51); \
+		(lo) = (uint64_t)umul_tmp & MASK51; \
+	} while (0)
+
+#elif BR_UMUL128
+
+#define UMUL51(hi, lo, x, y)   do { \
+		uint64_t umul_hi, umul_lo; \
+		umul_lo = _umul128((x), (y), &umul_hi); \
+		(hi) = (umul_hi << 13) | (umul_lo >> 51); \
+		(lo) = umul_lo & MASK51; \
+	} while (0)
+
+#endif
+
+/*
+ * Multiplication.
+ * On input, limbs must fit on 54 bits each.
+ * On output, limb 0 is at most 2^51 + 155647, and other limbs fit
+ * on 51 bits each.
+ */
+static inline void
+f255_mul(uint64_t *d, uint64_t *a, uint64_t *b)
+{
+	uint64_t t[10], hi, lo, w, cc;
+
+	/*
+	 * Perform cross products, accumulating values without carry
+	 * propagation.
+	 *
+	 * Since input limbs fit on 54 bits each, each individual
+	 * UMUL51 will produce a "hi" of less than 2^57. The maximum
+	 * sum will be at most 5*(2^57-1) + 4*(2^51-1) (for t[5]),
+	 * i.e. less than 324*2^51.
+	 */
+
+	UMUL51(t[1], t[0], a[0], b[0]);
+
+	UMUL51(t[2], lo, a[1], b[0]); t[1] += lo;
+	UMUL51(hi, lo, a[0], b[1]); t[1] += lo; t[2] += hi;
+
+	UMUL51(t[3], lo, a[2], b[0]); t[2] += lo;
+	UMUL51(hi, lo, a[1], b[1]); t[2] += lo; t[3] += hi;
+	UMUL51(hi, lo, a[0], b[2]); t[2] += lo; t[3] += hi;
+
+	UMUL51(t[4], lo, a[3], b[0]); t[3] += lo;
+	UMUL51(hi, lo, a[2], b[1]); t[3] += lo; t[4] += hi;
+	UMUL51(hi, lo, a[1], b[2]); t[3] += lo; t[4] += hi;
+	UMUL51(hi, lo, a[0], b[3]); t[3] += lo; t[4] += hi;
+
+	UMUL51(t[5], lo, a[4], b[0]); t[4] += lo;
+	UMUL51(hi, lo, a[3], b[1]); t[4] += lo; t[5] += hi;
+	UMUL51(hi, lo, a[2], b[2]); t[4] += lo; t[5] += hi;
+	UMUL51(hi, lo, a[1], b[3]); t[4] += lo; t[5] += hi;
+	UMUL51(hi, lo, a[0], b[4]); t[4] += lo; t[5] += hi;
+
+	UMUL51(t[6], lo, a[4], b[1]); t[5] += lo;
+	UMUL51(hi, lo, a[3], b[2]); t[5] += lo; t[6] += hi;
+	UMUL51(hi, lo, a[2], b[3]); t[5] += lo; t[6] += hi;
+	UMUL51(hi, lo, a[1], b[4]); t[5] += lo; t[6] += hi;
+
+	UMUL51(t[7], lo, a[4], b[2]); t[6] += lo;
+	UMUL51(hi, lo, a[3], b[3]); t[6] += lo; t[7] += hi;
+	UMUL51(hi, lo, a[2], b[4]); t[6] += lo; t[7] += hi;
+
+	UMUL51(t[8], lo, a[4], b[3]); t[7] += lo;
+	UMUL51(hi, lo, a[3], b[4]); t[7] += lo; t[8] += hi;
+
+	UMUL51(t[9], lo, a[4], b[4]); t[8] += lo;
+
+	/*
+	 * The upper words t[5]..t[9] are folded back into the lower
+	 * words, using the rule that 2^255 = 19 in the field.
+	 *
+	 * Since each t[i] is less than 324*2^51, the additions below
+	 * will yield less than 6480*2^51 in each limb; this fits in
+	 * 64 bits (6480*2^51 < 8192*2^51 = 2^64), hence there is
+	 * no overflow.
+	 */
+	t[0] += 19 * t[5];
+	t[1] += 19 * t[6];
+	t[2] += 19 * t[7];
+	t[3] += 19 * t[8];
+	t[4] += 19 * t[9];
+
+	/*
+	 * Propagate carries.
+	 */
+	w = t[0];
+	d[0] = w & MASK51;
+	cc = w >> 51;
+	w = t[1] + cc;
+	d[1] = w & MASK51;
+	cc = w >> 51;
+	w = t[2] + cc;
+	d[2] = w & MASK51;
+	cc = w >> 51;
+	w = t[3] + cc;
+	d[3] = w & MASK51;
+	cc = w >> 51;
+	w = t[4] + cc;
+	d[4] = w & MASK51;
+	cc = w >> 51;
+
+	/*
+	 * Since the limbs were 64-bit values, the top carry is at
+	 * most 8192 (in practice, that cannot be reached). We simply
+	 * performed a partial reduction.
+	 */
+	d[0] += 19 * cc;
+}
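
Side note on UMUL51: it is the only place where the two 64-bit back ends differ in this file; everything around it is plain uint64_t arithmetic. A quick sketch of the split invariant it maintains (illustrative test assuming a GCC/Clang 64-bit target with unsigned __int128):

	#include <stdint.h>
	#include <assert.h>

	#define MASK51 (((uint64_t)1 << 51) - 1)

	int main(void)
	{
		uint64_t x = 0x000F123456789ABCull;	/* 52-bit operands */
		uint64_t y = 0x000ACAFEBABE1234ull;
		unsigned __int128 z = (unsigned __int128)x * y;
		uint64_t hi = (uint64_t)(z >> 51);
		uint64_t lo = (uint64_t)z & MASK51;

		/* Recombining hi*2^51 + lo must give back the full product. */
		assert(((unsigned __int128)hi << 51) + lo == z);
		return 0;
	}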
+
+/*
+ * Multiplication by A24 = 121665.
+ * Input must have limbs of 60 bits at most.
+ */
+static inline void
+f255_mul_a24(uint64_t *d, const uint64_t *a)
+{
+	uint64_t t[5], cc, w;
+
+	/*
+	 * 121665 = 15 * 8111. We first multiply by 15, with carry
+	 * propagation and partial reduction.
+	 */
+	w = a[0] * 15;
+	t[0] = w & MASK51;
+	cc = w >> 51;
+	w = a[1] * 15 + cc;
+	t[1] = w & MASK51;
+	cc = w >> 51;
+	w = a[2] * 15 + cc;
+	t[2] = w & MASK51;
+	cc = w >> 51;
+	w = a[3] * 15 + cc;
+	t[3] = w & MASK51;
+	cc = w >> 51;
+	w = a[4] * 15 + cc;
+	t[4] = w & MASK51;
+	t[0] += 19 * (w >> 51);
+
+	/*
+	 * Then multiplication by 8111. At that point, we know that
+	 * t[0] is less than 2^51 + 19*8192, and other limbs are less
+	 * than 2^51; thus, there will be no overflow.
+	 */
+	w = t[0] * 8111;
+	d[0] = w & MASK51;
+	cc = w >> 51;
+	w = t[1] * 8111 + cc;
+	d[1] = w & MASK51;
+	cc = w >> 51;
+	w = t[2] * 8111 + cc;
+	d[2] = w & MASK51;
+	cc = w >> 51;
+	w = t[3] * 8111 + cc;
+	d[3] = w & MASK51;
+	cc = w >> 51;
+	w = t[4] * 8111 + cc;
+	d[4] = w & MASK51;
+	d[0] += 19 * (w >> 51);
+}
+
+/*
+ * Finalize reduction.
+ * On input, limbs must fit on 51 bits, except possibly the low limb,
+ * which may be slightly above 2^51.
+ */
+static inline void
+f255_final_reduce(uint64_t *a)
+{
+	uint64_t t[5], cc, w;
+
+	/*
+	 * We add 19. If the result (in t[]) is below 2^255, then a[]
+	 * is already less than 2^255-19, thus already reduced.
+	 * Otherwise, we subtract 2^255 from t[], in which case we
+	 * have t = a - (2^255-19), and that's our result.
+	 */
+	w = a[0] + 19;
+	t[0] = w & MASK51;
+	cc = w >> 51;
+	w = a[1] + cc;
+	t[1] = w & MASK51;
+	cc = w >> 51;
+	w = a[2] + cc;
+	t[2] = w & MASK51;
+	cc = w >> 51;
+	w = a[3] + cc;
+	t[3] = w & MASK51;
+	cc = w >> 51;
+	w = a[4] + cc;
+	t[4] = w & MASK51;
+	cc = w >> 51;
+
+	/*
+	 * The bit 255 of t is in cc. If that bit is 0, then a[] must
+	 * be unchanged; otherwise, it must be replaced with t[].
+	 */
+	cc = -cc;
+	a[0] ^= cc & (a[0] ^ t[0]);
+	a[1] ^= cc & (a[1] ^ t[1]);
+	a[2] ^= cc & (a[2] ^ t[2]);
+	a[3] ^= cc & (a[3] ^ t[3]);
+	a[4] ^= cc & (a[4] ^ t[4]);
+}
+
+static uint32_t
+api_mul(unsigned char *G, size_t Glen,
+	const unsigned char *kb, size_t kblen, int curve)
+{
+	unsigned char k[32];
+	uint64_t x1[5], x2[5], z2[5], x3[5], z3[5];
+	uint32_t swap;
+	int i;
+
+	(void)curve;
+
+	/*
+	 * Points are encoded over exactly 32 bytes. Multipliers must fit
+	 * in 32 bytes as well.
+	 */
+	if (Glen != 32 || kblen > 32) {
+		return 0;
+	}
+
+	/*
+	 * RFC 7748 mandates that the high bit of the last point byte must
+	 * be ignored/cleared; the "& MASK51" in the initialization for
+	 * x1[4] clears that bit.
+	 */
+	x1[0] = br_dec64le(&G[0]) & MASK51;
+	x1[1] = (br_dec64le(&G[6]) >> 3) & MASK51;
+	x1[2] = (br_dec64le(&G[12]) >> 6) & MASK51;
+	x1[3] = (br_dec64le(&G[19]) >> 1) & MASK51;
+	x1[4] = (br_dec64le(&G[24]) >> 12) & MASK51;
+
+	/*
+	 * We can use memset() to clear values, because exact-width types
+	 * like uint64_t are guaranteed to have no padding bits or
+	 * trap representations.
+	 */
+	memset(x2, 0, sizeof x2);
+	x2[0] = 1;
+	memset(z2, 0, sizeof z2);
+	memcpy(x3, x1, sizeof x1);
+	memcpy(z3, x2, sizeof x2);
+
+	/*
+	 * The multiplier is provided in big-endian notation, and
+	 * possibly shorter than 32 bytes.
+	 */
+	memset(k, 0, (sizeof k) - kblen);
+	memcpy(k + (sizeof k) - kblen, kb, kblen);
+	k[31] &= 0xF8;
+	k[0] &= 0x7F;
+	k[0] |= 0x40;
+
+	swap = 0;
+
+	for (i = 254; i >= 0; i --) {
+		uint64_t a[5], aa[5], b[5], bb[5], e[5];
+		uint64_t c[5], d[5], da[5], cb[5];
+		uint32_t kt;
+
+		kt = (k[31 - (i >> 3)] >> (i & 7)) & 1;
+		swap ^= kt;
+		f255_cswap(x2, x3, swap);
+		f255_cswap(z2, z3, swap);
+		swap = kt;
+
+		/*
+		 * At that point, limbs of x_2 and z_2 are assumed to fit
+		 * on at most 52 bits each.
+		 *
+		 * Each f255_add() adds one bit to the maximum range of
+		 * the values, but f255_sub() and f255_mul() bring back
+		 * the limbs into 52 bits. All f255_add() outputs are
+		 * used only as inputs for f255_mul(), which ensures
+		 * that limbs remain in the proper range.
+		 */
+
+		/* A = x_2 + z_2 -- limbs fit on 53 bits each */
+		f255_add(a, x2, z2);
+
+		/* AA = A^2 */
+		f255_mul(aa, a, a);
+
+		/* B = x_2 - z_2 */
+		f255_sub(b, x2, z2);
+
+		/* BB = B^2 */
+		f255_mul(bb, b, b);
+
+		/* E = AA - BB */
+		f255_sub(e, aa, bb);
+
+		/* C = x_3 + z_3 -- limbs fit on 53 bits each */
+		f255_add(c, x3, z3);
+
+		/* D = x_3 - z_3 */
+		f255_sub(d, x3, z3);
+
+		/* DA = D * A */
+		f255_mul(da, d, a);
+
+		/* CB = C * B */
+		f255_mul(cb, c, b);
+
+		/* x_3 = (DA + CB)^2 */
+		f255_add(x3, da, cb);
+		f255_mul(x3, x3, x3);
+
+		/* z_3 = x_1 * (DA - CB)^2 */
+		f255_sub(z3, da, cb);
+		f255_mul(z3, z3, z3);
+		f255_mul(z3, x1, z3);
+
+		/* x_2 = AA * BB */
+		f255_mul(x2, aa, bb);
+
+		/* z_2 = E * (AA + a24 * E) */
+		f255_mul_a24(z2, e);
+		f255_add(z2, aa, z2);
+		f255_mul(z2, e, z2);
+	}
+
+	f255_cswap(x2, x3, swap);
+	f255_cswap(z2, z3, swap);
+
+	/*
+	 * Compute 1/z2 = z2^(p-2). Since p = 2^255-19, we can mutualize
+	 * most non-squarings. We use x1 and x3, now useless, as temporaries.
+	 */
+	memcpy(x1, z2, sizeof z2);
+	for (i = 0; i < 15; i ++) {
+		f255_mul(x1, x1, x1);
+		f255_mul(x1, x1, z2);
+	}
+	memcpy(x3, x1, sizeof x1);
+	for (i = 0; i < 14; i ++) {
+		int j;
+
+		for (j = 0; j < 16; j ++) {
+			f255_mul(x3, x3, x3);
+		}
+		f255_mul(x3, x3, x1);
+	}
+	for (i = 14; i >= 0; i --) {
+		f255_mul(x3, x3, x3);
+		if ((0xFFEB >> i) & 1) {
+			f255_mul(x3, z2, x3);
+		}
+	}
+
+	/*
+	 * Compute x2/z2. We have 1/z2 in x3.
+	 */
+	f255_mul(x2, x2, x3);
+	f255_final_reduce(x2);
+
+	/*
+	 * Encode the final x2 value in little-endian. We first assemble
+	 * the limbs into 64-bit values.
+	 */
+	x2[0] |= x2[1] << 51;
+	x2[1] = (x2[1] >> 13) | (x2[2] << 38);
+	x2[2] = (x2[2] >> 26) | (x2[3] << 25);
+	x2[3] = (x2[3] >> 39) | (x2[4] << 12);
+	br_enc64le(G, x2[0]);
+	br_enc64le(G + 8, x2[1]);
+	br_enc64le(G + 16, x2[2]);
+	br_enc64le(G + 24, x2[3]);
+	return 1;
+}
+
+static size_t
+api_mulgen(unsigned char *R,
+	const unsigned char *x, size_t xlen, int curve)
+{
+	const unsigned char *G;
+	size_t Glen;
+
+	G = api_generator(curve, &Glen);
+	memcpy(R, G, Glen);
+	api_mul(R, Glen, x, xlen, curve);
+	return Glen;
+}
+
+static uint32_t
+api_muladd(unsigned char *A, const unsigned char *B, size_t len,
+	const unsigned char *x, size_t xlen,
+	const unsigned char *y, size_t ylen, int curve)
+{
+	/*
+	 * We don't implement this method, since it is used for ECDSA
+	 * only, and there is no ECDSA over Curve25519 (which instead
+	 * uses EdDSA).
+	 */
+	(void)A;
+	(void)B;
+	(void)len;
+	(void)x;
+	(void)xlen;
+	(void)y;
+	(void)ylen;
+	(void)curve;
+	return 0;
+}
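
Quick functional sketch for the file above (illustrative test, not part of the patch): since X25519 is a Diffie-Hellman primitive, (a*G)*b and (b*G)*a must produce the same 32-byte output even with clamping applied on both sides. The scalars here are arbitrary test bytes, not real keys.

	#include <stdio.h>
	#include <string.h>
	#include "bearssl_ec.h"

	int main(void)
	{
		const br_ec_impl *ec = br_ec_c25519_m62_get();
		unsigned char a[32], b[32], p1[32], p2[32];

		if (ec == 0) {
			return 0;	/* m62 code compiled out on this target */
		}
		memset(a, 0x11, sizeof a);
		memset(b, 0x22, sizeof b);
		ec->mulgen(p1, a, sizeof a, BR_EC_curve25519);	/* a*G */
		ec->mul(p1, 32, b, sizeof b, BR_EC_curve25519);	/* (a*G)*b */
		ec->mulgen(p2, b, sizeof b, BR_EC_curve25519);	/* b*G */
		ec->mul(p2, 32, a, sizeof a, BR_EC_curve25519);	/* (b*G)*a */
		printf("%s\n", memcmp(p1, p2, 32) == 0 ? "match" : "MISMATCH");
		return 0;
	}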
+
+/* see bearssl_ec.h */
+const br_ec_impl br_ec_c25519_m62 = {
+	(uint32_t)0x20000000,
+	&api_generator,
+	&api_order,
+	&api_xoff,
+	&api_mul,
+	&api_mulgen,
+	&api_muladd
+};
+
+/* see bearssl_ec.h */
+const br_ec_impl *
+br_ec_c25519_m62_get(void)
+{
+	return &br_ec_c25519_m62;
+}
+
+#else
+
+/* see bearssl_ec.h */
+const br_ec_impl *
+br_ec_c25519_m62_get(void)
+{
+	return 0;
+}
+
+#endif
diff --git a/src/bearssl/src/ec/ec_c25519_m64.c b/src/bearssl/src/ec/ec_c25519_m64.c
new file mode 100644
index 0000000..df48834
--- /dev/null
+++ b/src/bearssl/src/ec/ec_c25519_m64.c
@@ -0,0 +1,831 @@
+/*
+ * Copyright (c) 2018 Thomas Pornin
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+#if BR_INT128 || BR_UMUL128
+
+#if BR_UMUL128
+#include <intrin.h>
+#endif
+
+static const unsigned char GEN[] = {
+	0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static const unsigned char ORDER[] = {
+	0x7F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
+};
+
+static const unsigned char *
+api_generator(int curve, size_t *len)
+{
+	(void)curve;
+	*len = 32;
+	return GEN;
+}
+
+static const unsigned char *
+api_order(int curve, size_t *len)
+{
+	(void)curve;
+	*len = 32;
+	return ORDER;
+}
+
+static size_t
+api_xoff(int curve, size_t *len)
+{
+	(void)curve;
+	*len = 32;
+	return 0;
+}
+
+/*
+ * A field element is encoded as four 64-bit integers, in basis 2^63.
+ * Operations return partially reduced values, which may range up to
+ * 2^255+37.
+ */
+
+#define MASK63   (((uint64_t)1 << 63) - (uint64_t)1)
+
+/*
+ * Swap two field elements, conditionally on a flag.
+ */
+static inline void
+f255_cswap(uint64_t *a, uint64_t *b, uint32_t ctl)
+{
+	uint64_t m, w;
+
+	m = -(uint64_t)ctl;
+	w = m & (a[0] ^ b[0]); a[0] ^= w; b[0] ^= w;
+	w = m & (a[1] ^ b[1]); a[1] ^= w; b[1] ^= w;
+	w = m & (a[2] ^ b[2]); a[2] ^= w; b[2] ^= w;
+	w = m & (a[3] ^ b[3]); a[3] ^= w; b[3] ^= w;
+}
+
+/*
+ * Addition in the field.
+ */
+static inline void
+f255_add(uint64_t *d, const uint64_t *a, const uint64_t *b)
+{
+#if BR_INT128
+
+	uint64_t t0, t1, t2, t3, cc;
+	unsigned __int128 z;
+
+	z = (unsigned __int128)a[0] + (unsigned __int128)b[0];
+	t0 = (uint64_t)z;
+	z = (unsigned __int128)a[1] + (unsigned __int128)b[1] + (z >> 64);
+	t1 = (uint64_t)z;
+	z = (unsigned __int128)a[2] + (unsigned __int128)b[2] + (z >> 64);
+	t2 = (uint64_t)z;
+	z = (unsigned __int128)a[3] + (unsigned __int128)b[3] + (z >> 64);
+	t3 = (uint64_t)z & MASK63;
+	cc = (uint64_t)(z >> 63);
+
+	/*
+	 * Since operands are at most 2^255+37, the sum is at most
+	 * 2^256+74; thus, the carry cc is equal to 0, 1 or 2.
+	 *
+	 * We use: 2^255 = 19 mod p.
+	 * Since we add 0, 19 or 38 to a value that fits on 255 bits,
+	 * the result is at most 2^255+37.
+	 */
+	z = (unsigned __int128)t0 + (unsigned __int128)(19 * cc);
+	d[0] = (uint64_t)z;
+	z = (unsigned __int128)t1 + (z >> 64);
+	d[1] = (uint64_t)z;
+	z = (unsigned __int128)t2 + (z >> 64);
+	d[2] = (uint64_t)z;
+	d[3] = t3 + (uint64_t)(z >> 64);
+
+#elif BR_UMUL128
+
+	uint64_t t0, t1, t2, t3, cc;
+	unsigned char k;
+
+	k = _addcarry_u64(0, a[0], b[0], &t0);
+	k = _addcarry_u64(k, a[1], b[1], &t1);
+	k = _addcarry_u64(k, a[2], b[2], &t2);
+	k = _addcarry_u64(k, a[3], b[3], &t3);
+	cc = (k << 1) + (t3 >> 63);
+	t3 &= MASK63;
+
+	/*
+	 * Since operands are at most 2^255+37, the sum is at most
+	 * 2^256+74; thus, the carry cc is equal to 0, 1 or 2.
+	 *
+	 * We use: 2^255 = 19 mod p.
+	 * Since we add 0, 19 or 38 to a value that fits on 255 bits,
+	 * the result is at most 2^255+37.
+	 */
+	k = _addcarry_u64(0, t0, 19 * cc, &d[0]);
+	k = _addcarry_u64(k, t1, 0, &d[1]);
+	k = _addcarry_u64(k, t2, 0, &d[2]);
+	(void)_addcarry_u64(k, t3, 0, &d[3]);
+
+#endif
+}
+
+/*
+ * Subtraction.
+ */
+static inline void
+f255_sub(uint64_t *d, const uint64_t *a, const uint64_t *b)
+{
+#if BR_INT128
+
+	/*
+	 * We compute t = 2^256 - 38 + a - b, which is necessarily
+	 * positive but lower than 2^256 + 2^255, since a <= 2^255 + 37
+	 * and b <= 2^255 + 37. We then subtract 0, p or 2*p, depending
+	 * on the two upper bits of t (bits 255 and 256).
+	 */
+
+	uint64_t t0, t1, t2, t3, t4, cc;
+	unsigned __int128 z;
+
+	z = (unsigned __int128)a[0] - (unsigned __int128)b[0] - 38;
+	t0 = (uint64_t)z;
+	cc = -(uint64_t)(z >> 64);
+	z = (unsigned __int128)a[1] - (unsigned __int128)b[1]
+		- (unsigned __int128)cc;
+	t1 = (uint64_t)z;
+	cc = -(uint64_t)(z >> 64);
+	z = (unsigned __int128)a[2] - (unsigned __int128)b[2]
+		- (unsigned __int128)cc;
+	t2 = (uint64_t)z;
+	cc = -(uint64_t)(z >> 64);
+	z = (unsigned __int128)a[3] - (unsigned __int128)b[3]
+		- (unsigned __int128)cc;
+	t3 = (uint64_t)z;
+	t4 = 1 + (uint64_t)(z >> 64);
+
+	/*
+	 * We have a 257-bit result. The two top bits can be 00, 01 or 10,
+	 * but not 11 (value t <= 2^256 - 38 + 2^255 + 37 = 2^256 + 2^255 - 1).
+	 * Therefore, we can truncate to 255 bits, and add 0, 19 or 38.
+	 * This guarantees that the result is at most 2^255+37.
+	 */
+	cc = (38 & -t4) + (19 & -(t3 >> 63));
+	t3 &= MASK63;
+	z = (unsigned __int128)t0 + (unsigned __int128)cc;
+	d[0] = (uint64_t)z;
+	z = (unsigned __int128)t1 + (z >> 64);
+	d[1] = (uint64_t)z;
+	z = (unsigned __int128)t2 + (z >> 64);
+	d[2] = (uint64_t)z;
+	d[3] = t3 + (uint64_t)(z >> 64);
+
+#elif BR_UMUL128
+
+	/*
+	 * We compute t = 2^256 - 38 + a - b, which is necessarily
+	 * positive but lower than 2^256 + 2^255, since a <= 2^255 + 37
+	 * and b <= 2^255 + 37. We then subtract 0, p or 2*p, depending
+	 * on the two upper bits of t (bits 255 and 256).
+	 */
+
+	uint64_t t0, t1, t2, t3, t4;
+	unsigned char k;
+
+	k = _subborrow_u64(0, a[0], b[0], &t0);
+	k = _subborrow_u64(k, a[1], b[1], &t1);
+	k = _subborrow_u64(k, a[2], b[2], &t2);
+	k = _subborrow_u64(k, a[3], b[3], &t3);
+	(void)_subborrow_u64(k, 1, 0, &t4);
+
+	k = _subborrow_u64(0, t0, 38, &t0);
+	k = _subborrow_u64(k, t1, 0, &t1);
+	k = _subborrow_u64(k, t2, 0, &t2);
+	k = _subborrow_u64(k, t3, 0, &t3);
+	(void)_subborrow_u64(k, t4, 0, &t4);
+
+	/*
+	 * We have a 257-bit result. The two top bits can be 00, 01 or 10,
+	 * but not 11 (value t <= 2^256 - 38 + 2^255 + 37 = 2^256 + 2^255 - 1).
+	 * Therefore, we can truncate to 255 bits, and add 0, 19 or 38.
+	 * This guarantees that the result is at most 2^255+37.
+	 */
+	t4 = (38 & -t4) + (19 & -(t3 >> 63));
+	t3 &= MASK63;
+	k = _addcarry_u64(0, t0, t4, &d[0]);
+	k = _addcarry_u64(k, t1, 0, &d[1]);
+	k = _addcarry_u64(k, t2, 0, &d[2]);
+	(void)_addcarry_u64(k, t3, 0, &d[3]);
+
+#endif
+}
+
+/*
+ * Multiplication.
+ */
+static inline void
+f255_mul(uint64_t *d, uint64_t *a, uint64_t *b)
+{
+#if BR_INT128
+
+	unsigned __int128 z;
+	uint64_t t0, t1, t2, t3, t4, t5, t6, t7, th;
+
+	/*
+	 * Compute the product a*b over plain integers.
+	 */
+	z = (unsigned __int128)a[0] * (unsigned __int128)b[0];
+	t0 = (uint64_t)z;
+	z = (unsigned __int128)a[0] * (unsigned __int128)b[1] + (z >> 64);
+	t1 = (uint64_t)z;
+	z = (unsigned __int128)a[0] * (unsigned __int128)b[2] + (z >> 64);
+	t2 = (uint64_t)z;
+	z = (unsigned __int128)a[0] * (unsigned __int128)b[3] + (z >> 64);
+	t3 = (uint64_t)z;
+	t4 = (uint64_t)(z >> 64);
+
+	z = (unsigned __int128)a[1] * (unsigned __int128)b[0]
+		+ (unsigned __int128)t1;
+	t1 = (uint64_t)z;
+	z = (unsigned __int128)a[1] * (unsigned __int128)b[1]
+		+ (unsigned __int128)t2 + (z >> 64);
+	t2 = (uint64_t)z;
+	z = (unsigned __int128)a[1] * (unsigned __int128)b[2]
+		+ (unsigned __int128)t3 + (z >> 64);
+	t3 = (uint64_t)z;
+	z = (unsigned __int128)a[1] * (unsigned __int128)b[3]
+		+ (unsigned __int128)t4 + (z >> 64);
+	t4 = (uint64_t)z;
+	t5 = (uint64_t)(z >> 64);
+
+	z = (unsigned __int128)a[2] * (unsigned __int128)b[0]
+		+ (unsigned __int128)t2;
+	t2 = (uint64_t)z;
+	z = (unsigned __int128)a[2] * (unsigned __int128)b[1]
+		+ (unsigned __int128)t3 + (z >> 64);
+	t3 = (uint64_t)z;
+	z = (unsigned __int128)a[2] * (unsigned __int128)b[2]
+		+ (unsigned __int128)t4 + (z >> 64);
+	t4 = (uint64_t)z;
+	z = (unsigned __int128)a[2] * (unsigned __int128)b[3]
+		+ (unsigned __int128)t5 + (z >> 64);
+	t5 = (uint64_t)z;
+	t6 = (uint64_t)(z >> 64);
+
+	z = (unsigned __int128)a[3] * (unsigned __int128)b[0]
+		+ (unsigned __int128)t3;
+	t3 = (uint64_t)z;
+	z = (unsigned __int128)a[3] * (unsigned __int128)b[1]
+		+ (unsigned __int128)t4 + (z >> 64);
+	t4 = (uint64_t)z;
+	z = (unsigned __int128)a[3] * (unsigned __int128)b[2]
+		+ (unsigned __int128)t5 + (z >> 64);
+	t5 = (uint64_t)z;
+	z = (unsigned __int128)a[3] * (unsigned __int128)b[3]
+		+ (unsigned __int128)t6 + (z >> 64);
+	t6 = (uint64_t)z;
+	t7 = (uint64_t)(z >> 64);
+
+	/*
+	 * Modulo p, we have:
+	 *
+	 *   2^255 = 19
+	 *   2^510 = 19*19 = 361
+	 *
+	 * We split the intermediate t into three parts, in basis
+	 * 2^255. The low one will be in t0..t3; the middle one in t4..t7.
+	 * The upper one can only be a single bit (th), since the
+	 * multiplication operands are at most 2^255+37 each.
+	 */
+	th = t7 >> 62;
+	t7 = ((t7 << 1) | (t6 >> 63)) & MASK63;
+	t6 = (t6 << 1) | (t5 >> 63);
+	t5 = (t5 << 1) | (t4 >> 63);
+	t4 = (t4 << 1) | (t3 >> 63);
+	t3 &= MASK63;
+
+	/*
+	 * Multiply the middle part (t4..t7) by 19. We truncate it to
+	 * 255 bits; the extra bits will go along with th.
+	 */
+	z = (unsigned __int128)t4 * 19;
+	t4 = (uint64_t)z;
+	z = (unsigned __int128)t5 * 19 + (z >> 64);
+	t5 = (uint64_t)z;
+	z = (unsigned __int128)t6 * 19 + (z >> 64);
+	t6 = (uint64_t)z;
+	z = (unsigned __int128)t7 * 19 + (z >> 64);
+	t7 = (uint64_t)z & MASK63;
+
+	th = (361 & -th) + (19 * (uint64_t)(z >> 63));
+
+	/*
+	 * Add elements together.
+	 * At this point:
+	 *   t0..t3 fits on 255 bits.
+	 *   t4..t7 fits on 255 bits.
+	 *   th <= 361 + 342 = 703.
+	 */
+	z = (unsigned __int128)t0 + (unsigned __int128)t4
+		+ (unsigned __int128)th;
+	t0 = (uint64_t)z;
+	z = (unsigned __int128)t1 + (unsigned __int128)t5 + (z >> 64);
+	t1 = (uint64_t)z;
+	z = (unsigned __int128)t2 + (unsigned __int128)t6 + (z >> 64);
+	t2 = (uint64_t)z;
+	z = (unsigned __int128)t3 + (unsigned __int128)t7 + (z >> 64);
+	t3 = (uint64_t)z & MASK63;
+	th = (uint64_t)(z >> 63);
+
+	/*
+	 * Since the sum is at most 2^256 + 703, the two upper bits, in th,
+	 * can only have value 0, 1 or 2. We just add th*19, which
+	 * guarantees a result of at most 2^255+37.
+	 */
+	z = (unsigned __int128)t0 + (19 * th);
+	d[0] = (uint64_t)z;
+	z = (unsigned __int128)t1 + (z >> 64);
+	d[1] = (uint64_t)z;
+	z = (unsigned __int128)t2 + (z >> 64);
+	d[2] = (uint64_t)z;
+	d[3] = t3 + (uint64_t)(z >> 64);
+
+#elif BR_UMUL128
+
+	uint64_t t0, t1, t2, t3, t4, t5, t6, t7, th;
+	uint64_t h0, h1, h2, h3;
+	unsigned char k;
+
+	/*
+	 * Compute the product a*b over plain integers.
+	 */
+	t0 = _umul128(a[0], b[0], &h0);
+	t1 = _umul128(a[0], b[1], &h1);
+	k = _addcarry_u64(0, t1, h0, &t1);
+	t2 = _umul128(a[0], b[2], &h2);
+	k = _addcarry_u64(k, t2, h1, &t2);
+	t3 = _umul128(a[0], b[3], &h3);
+	k = _addcarry_u64(k, t3, h2, &t3);
+	(void)_addcarry_u64(k, h3, 0, &t4);
+
+	k = _addcarry_u64(0, _umul128(a[1], b[0], &h0), t1, &t1);
+	k = _addcarry_u64(k, _umul128(a[1], b[1], &h1), t2, &t2);
+	k = _addcarry_u64(k, _umul128(a[1], b[2], &h2), t3, &t3);
+	k = _addcarry_u64(k, _umul128(a[1], b[3], &h3), t4, &t4);
+	t5 = k;
+	k = _addcarry_u64(0, t2, h0, &t2);
+	k = _addcarry_u64(k, t3, h1, &t3);
+	k = _addcarry_u64(k, t4, h2, &t4);
+	(void)_addcarry_u64(k, t5, h3, &t5);
+
+	k = _addcarry_u64(0, _umul128(a[2], b[0], &h0), t2, &t2);
+	k = _addcarry_u64(k, _umul128(a[2], b[1], &h1), t3, &t3);
+	k = _addcarry_u64(k, _umul128(a[2], b[2], &h2), t4, &t4);
+	k = _addcarry_u64(k, _umul128(a[2], b[3], &h3), t5, &t5);
+	t6 = k;
+	k = _addcarry_u64(0, t3, h0, &t3);
+	k = _addcarry_u64(k, t4, h1, &t4);
+	k = _addcarry_u64(k, t5, h2, &t5);
+	(void)_addcarry_u64(k, t6, h3, &t6);
+
+	k = _addcarry_u64(0, _umul128(a[3], b[0], &h0), t3, &t3);
+	k = _addcarry_u64(k, _umul128(a[3], b[1], &h1), t4, &t4);
+	k = _addcarry_u64(k, _umul128(a[3], b[2], &h2), t5, &t5);
+	k = _addcarry_u64(k, _umul128(a[3], b[3], &h3), t6, &t6);
+	t7 = k;
+	k = _addcarry_u64(0, t4, h0, &t4);
+	k = _addcarry_u64(k, t5, h1, &t5);
+	k = _addcarry_u64(k, t6, h2, &t6);
+	(void)_addcarry_u64(k, t7, h3, &t7);
+
+	/*
+	 * Modulo p, we have:
+	 *
+	 *   2^255 = 19
+	 *   2^510 = 19*19 = 361
+	 *
+	 * We split the intermediate t into three parts, in basis
+	 * 2^255. The low one will be in t0..t3; the middle one in t4..t7.
+	 * The upper one can only be a single bit (th), since the
+	 * multiplication operands are at most 2^255+37 each.
+	 */
+	th = t7 >> 62;
+	t7 = ((t7 << 1) | (t6 >> 63)) & MASK63;
+	t6 = (t6 << 1) | (t5 >> 63);
+	t5 = (t5 << 1) | (t4 >> 63);
+	t4 = (t4 << 1) | (t3 >> 63);
+	t3 &= MASK63;
+
+	/*
+	 * Multiply the middle part (t4..t7) by 19. We truncate it to
+	 * 255 bits; the extra bits will go along with th.
+	 */
+	t4 = _umul128(t4, 19, &h0);
+	t5 = _umul128(t5, 19, &h1);
+	t6 = _umul128(t6, 19, &h2);
+	t7 = _umul128(t7, 19, &h3);
+	k = _addcarry_u64(0, t5, h0, &t5);
+	k = _addcarry_u64(k, t6, h1, &t6);
+	k = _addcarry_u64(k, t7, h2, &t7);
+	(void)_addcarry_u64(k, h3, 0, &h3);
+	th = (361 & -th) + (19 * ((h3 << 1) + (t7 >> 63)));
+	t7 &= MASK63;
+
+	/*
+	 * Add elements together.
+	 * At this point:
+	 *   t0..t3 fits on 255 bits.
+	 *   t4..t7 fits on 255 bits.
+	 *   th <= 361 + 342 = 703.
+	 */
+	k = _addcarry_u64(0, t0, t4, &t0);
+	k = _addcarry_u64(k, t1, t5, &t1);
+	k = _addcarry_u64(k, t2, t6, &t2);
+	k = _addcarry_u64(k, t3, t7, &t3);
+	t4 = k;
+	k = _addcarry_u64(0, t0, th, &t0);
+	k = _addcarry_u64(k, t1, 0, &t1);
+	k = _addcarry_u64(k, t2, 0, &t2);
+	k = _addcarry_u64(k, t3, 0, &t3);
+	(void)_addcarry_u64(k, t4, 0, &t4);
+
+	th = (t4 << 1) + (t3 >> 63);
+	t3 &= MASK63;
+
+	/*
+	 * Since the sum is at most 2^256 + 703, the two upper bits, in th,
+	 * can only have value 0, 1 or 2. We just add th*19, which
+	 * guarantees a result of at most 2^255+37.
+	 */
+	k = _addcarry_u64(0, t0, 19 * th, &d[0]);
+	k = _addcarry_u64(k, t1, 0, &d[1]);
+	k = _addcarry_u64(k, t2, 0, &d[2]);
+	(void)_addcarry_u64(k, t3, 0, &d[3]);
+
+#endif
+}
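
Worked bound for the three-way split in f255_mul() above: both operands are at most 2^255+37, so the 512-bit product is at most (2^255+37)^2 = 2^510 + 74*2^255 + 1369, which is below 2^511. Bits 0..254 form the low part (t0..t3), bits 255..509 the middle part (t4..t7 after the one-bit left shift), and bit 510 the single-bit top part th; folding then uses 2^255 = 19 and 2^510 = 19^2 = 361 modulo p.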
+
+/*
+ * Multiplication by A24 = 121665.
+ */
+static inline void
+f255_mul_a24(uint64_t *d, const uint64_t *a)
+{
+#if BR_INT128
+
+	uint64_t t0, t1, t2, t3;
+	unsigned __int128 z;
+
+	z = (unsigned __int128)a[0] * 121665;
+	t0 = (uint64_t)z;
+	z = (unsigned __int128)a[1] * 121665 + (z >> 64);
+	t1 = (uint64_t)z;
+	z = (unsigned __int128)a[2] * 121665 + (z >> 64);
+	t2 = (uint64_t)z;
+	z = (unsigned __int128)a[3] * 121665 + (z >> 64);
+	t3 = (uint64_t)z & MASK63;
+
+	z = (unsigned __int128)t0 + (19 * (uint64_t)(z >> 63));
+	t0 = (uint64_t)z;
+	z = (unsigned __int128)t1 + (z >> 64);
+	t1 = (uint64_t)z;
+	z = (unsigned __int128)t2 + (z >> 64);
+	t2 = (uint64_t)z;
+	t3 = t3 + (uint64_t)(z >> 64);
+
+	z = (unsigned __int128)t0 + (19 & -(t3 >> 63));
+	d[0] = (uint64_t)z;
+	z = (unsigned __int128)t1 + (z >> 64);
+	d[1] = (uint64_t)z;
+	z = (unsigned __int128)t2 + (z >> 64);
+	d[2] = (uint64_t)z;
+	d[3] = (t3 & MASK63) + (uint64_t)(z >> 64);
+
+#elif BR_UMUL128
+
+	uint64_t t0, t1, t2, t3, t4, h0, h1, h2, h3;
+	unsigned char k;
+
+	t0 = _umul128(a[0], 121665, &h0);
+	t1 = _umul128(a[1], 121665, &h1);
+	k = _addcarry_u64(0, t1, h0, &t1);
+	t2 = _umul128(a[2], 121665, &h2);
+	k = _addcarry_u64(k, t2, h1, &t2);
+	t3 = _umul128(a[3], 121665, &h3);
+	k = _addcarry_u64(k, t3, h2, &t3);
+	(void)_addcarry_u64(k, h3, 0, &t4);
+
+	t4 = (t4 << 1) + (t3 >> 63);
+	t3 &= MASK63;
+	k = _addcarry_u64(0, t0, 19 * t4, &t0);
+	k = _addcarry_u64(k, t1, 0, &t1);
+	k = _addcarry_u64(k, t2, 0, &t2);
+	(void)_addcarry_u64(k, t3, 0, &t3);
+
+	t4 = 19 & -(t3 >> 63);
+	t3 &= MASK63;
+	k = _addcarry_u64(0, t0, t4, &d[0]);
+	k = _addcarry_u64(k, t1, 0, &d[1]);
+	k = _addcarry_u64(k, t2, 0, &d[2]);
+	(void)_addcarry_u64(k, t3, 0, &d[3]);
+
+#endif
+}
+
+/*
+ * Finalize reduction.
+ */
+static inline void
+f255_final_reduce(uint64_t *a)
+{
+#if BR_INT128
+
+	uint64_t t0, t1, t2, t3, m;
+	unsigned __int128 z;
+
+	/*
+	 * We add 19. If the result (in t) is below 2^255, then a[]
+	 * is already less than 2^255-19, thus already reduced.
+	 * Otherwise, we subtract 2^255 from t[], in which case we
+	 * have t = a - (2^255-19), and that's our result.
+	 */
+	z = (unsigned __int128)a[0] + 19;
+	t0 = (uint64_t)z;
+	z = (unsigned __int128)a[1] + (z >> 64);
+	t1 = (uint64_t)z;
+	z = (unsigned __int128)a[2] + (z >> 64);
+	t2 = (uint64_t)z;
+	t3 = a[3] + (uint64_t)(z >> 64);
+
+	m = -(t3 >> 63);
+	t3 &= MASK63;
+	a[0] ^= m & (a[0] ^ t0);
+	a[1] ^= m & (a[1] ^ t1);
+	a[2] ^= m & (a[2] ^ t2);
+	a[3] ^= m & (a[3] ^ t3);
+
+#elif BR_UMUL128
+
+	uint64_t t0, t1, t2, t3, m;
+	unsigned char k;
+
+	/*
+	 * We add 19. If the result (in t) is below 2^255, then a[]
+	 * is already less than 2^255-19, thus already reduced.
+	 * Otherwise, we subtract 2^255 from t[], in which case we
+	 * have t = a - (2^255-19), and that's our result.
+	 */
+	k = _addcarry_u64(0, a[0], 19, &t0);
+	k = _addcarry_u64(k, a[1], 0, &t1);
+	k = _addcarry_u64(k, a[2], 0, &t2);
+	(void)_addcarry_u64(k, a[3], 0, &t3);
+
+	m = -(t3 >> 63);
+	t3 &= MASK63;
+	a[0] ^= m & (a[0] ^ t0);
+	a[1] ^= m & (a[1] ^ t1);
+	a[2] ^= m & (a[2] ^ t2);
+	a[3] ^= m & (a[3] ^ t3);
+
+#endif
+}
+
+static uint32_t
+api_mul(unsigned char *G, size_t Glen,
+	const unsigned char *kb, size_t kblen, int curve)
+{
+	unsigned char k[32];
+	uint64_t x1[4], x2[4], z2[4], x3[4], z3[4];
+	uint32_t swap;
+	int i;
+
+	(void)curve;
+
+	/*
+	 * Points are encoded over exactly 32 bytes. Multipliers must fit
+	 * in 32 bytes as well.
+	 */
+	if (Glen != 32 || kblen > 32) {
+		return 0;
+	}
+
+	/*
+	 * RFC 7748 mandates that the high bit of the last point byte must
+	 * be ignored/cleared.
+	 */
+	x1[0] = br_dec64le(&G[ 0]);
+	x1[1] = br_dec64le(&G[ 8]);
+	x1[2] = br_dec64le(&G[16]);
+	x1[3] = br_dec64le(&G[24]) & MASK63;
+
+	/*
+	 * We can use memset() to clear values, because exact-width types
+	 * like uint64_t are guaranteed to have no padding bits or
+	 * trap representations.
+	 */
+	memset(x2, 0, sizeof x2);
+	x2[0] = 1;
+	memset(z2, 0, sizeof z2);
+	memcpy(x3, x1, sizeof x1);
+	memcpy(z3, x2, sizeof x2);
+
+	/*
+	 * The multiplier is provided in big-endian notation, and
+	 * possibly shorter than 32 bytes.
+	 */
+	memset(k, 0, (sizeof k) - kblen);
+	memcpy(k + (sizeof k) - kblen, kb, kblen);
+	k[31] &= 0xF8;
+	k[0] &= 0x7F;
+	k[0] |= 0x40;
+
+	swap = 0;
+
+	for (i = 254; i >= 0; i --) {
+		uint64_t a[4], aa[4], b[4], bb[4], e[4];
+		uint64_t c[4], d[4], da[4], cb[4];
+		uint32_t kt;
+
+		kt = (k[31 - (i >> 3)] >> (i & 7)) & 1;
+		swap ^= kt;
+		f255_cswap(x2, x3, swap);
+		f255_cswap(z2, z3, swap);
+		swap = kt;
+
+		/* A = x_2 + z_2 */
+		f255_add(a, x2, z2);
+
+		/* AA = A^2 */
+		f255_mul(aa, a, a);
+
+		/* B = x_2 - z_2 */
+		f255_sub(b, x2, z2);
+
+		/* BB = B^2 */
+		f255_mul(bb, b, b);
+
+		/* E = AA - BB */
+		f255_sub(e, aa, bb);
+
+		/* C = x_3 + z_3 */
+		f255_add(c, x3, z3);
+
+		/* D = x_3 - z_3 */
+		f255_sub(d, x3, z3);
+
+		/* DA = D * A */
+		f255_mul(da, d, a);
+
+		/* CB = C * B */
+		f255_mul(cb, c, b);
+
+		/* x_3 = (DA + CB)^2 */
+		f255_add(x3, da, cb);
+		f255_mul(x3, x3, x3);
+
+		/* z_3 = x_1 * (DA - CB)^2 */
+		f255_sub(z3, da, cb);
+		f255_mul(z3, z3, z3);
+		f255_mul(z3, x1, z3);
+
+		/* x_2 = AA * BB */
+		f255_mul(x2, aa, bb);
+
+		/* z_2 = E * (AA + a24 * E) */
+		f255_mul_a24(z2, e);
+		f255_add(z2, aa, z2);
+		f255_mul(z2, e, z2);
+	}
+
+	f255_cswap(x2, x3, swap);
+	f255_cswap(z2, z3, swap);
+
+	/*
+	 * Compute 1/z2 = z2^(p-2). Since p = 2^255-19, we can mutualize
+	 * most non-squarings. We use x1 and x3, now useless, as temporaries.
+	 */
+	memcpy(x1, z2, sizeof z2);
+	for (i = 0; i < 15; i ++) {
+		f255_mul(x1, x1, x1);
+		f255_mul(x1, x1, z2);
+	}
+	memcpy(x3, x1, sizeof x1);
+	for (i = 0; i < 14; i ++) {
+		int j;
+
+		for (j = 0; j < 16; j ++) {
+			f255_mul(x3, x3, x3);
+		}
+		f255_mul(x3, x3, x1);
+	}
+	for (i = 14; i >= 0; i --) {
+		f255_mul(x3, x3, x3);
+		if ((0xFFEB >> i) & 1) {
+			f255_mul(x3, z2, x3);
+		}
+	}
+
+	/*
+	 * Compute x2/z2. We have 1/z2 in x3.
+	 */
+	f255_mul(x2, x2, x3);
+	f255_final_reduce(x2);
+
+	/*
+	 * Encode the final x2 value in little-endian.
+	 */
+	br_enc64le(G, x2[0]);
+	br_enc64le(G + 8, x2[1]);
+	br_enc64le(G + 16, x2[2]);
+	br_enc64le(G + 24, x2[3]);
+	return 1;
+}
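
Worked check of the exponent chain in the inversion above (the same code appears in both new Curve25519 files): the first loop computes x1 = z2^(2^16-1), since the exponent starts at 1 and each square-and-multiply step maps e to 2e+1, fifteen times. The second loop maps e to 2^16*e + (2^16-1), fourteen times, giving x3 = z2^(2^240-1). The final 15 squarings append the low bits of 0xFFEB (only bits 14..0, i.e. 0x7FEB = 32747, are consumed), for a total exponent of (2^240-1)*2^15 + 32747 = 2^255 - 32768 + 32747 = 2^255 - 21 = p - 2, as Fermat's little theorem requires.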
+
+static size_t
+api_mulgen(unsigned char *R,
+	const unsigned char *x, size_t xlen, int curve)
+{
+	const unsigned char *G;
+	size_t Glen;
+
+	G = api_generator(curve, &Glen);
+	memcpy(R, G, Glen);
+	api_mul(R, Glen, x, xlen, curve);
+	return Glen;
+}
+
+static uint32_t
+api_muladd(unsigned char *A, const unsigned char *B, size_t len,
+	const unsigned char *x, size_t xlen,
+	const unsigned char *y, size_t ylen, int curve)
+{
+	/*
+	 * We don't implement this method, since it is used for ECDSA
+	 * only, and there is no ECDSA over Curve25519 (which instead
+	 * uses EdDSA).
+	 */
+	(void)A;
+	(void)B;
+	(void)len;
+	(void)x;
+	(void)xlen;
+	(void)y;
+	(void)ylen;
+	(void)curve;
+	return 0;
+}
+
+/* see bearssl_ec.h */
+const br_ec_impl br_ec_c25519_m64 = {
+	(uint32_t)0x20000000,
+	&api_generator,
+	&api_order,
+	&api_xoff,
+	&api_mul,
+	&api_mulgen,
+	&api_muladd
+};
+
+/* see bearssl_ec.h */
+const br_ec_impl *
+br_ec_c25519_m64_get(void)
+{
+	return &br_ec_c25519_m64;
+}
+
+#else
+
+/* see bearssl_ec.h */
+const br_ec_impl *
+br_ec_c25519_m64_get(void)
+{
+	return 0;
+}
+
+#endif
diff --git a/src/bearssl/src/ec/ec_p256_m15.c b/src/bearssl/src/ec/ec_p256_m15.c
index 6ce57e0..8d68d1d 100644
--- a/src/bearssl/src/ec/ec_p256_m15.c
+++ b/src/bearssl/src/ec/ec_p256_m15.c
@@ -1739,7 +1739,7 @@ p256_decode(p256_jacobian *P, const void *src, size_t len)
 	memcpy(P->y, ty, sizeof ty);
 	memset(P->z, 0, sizeof P->z);
 	P->z[0] = 1;
-	return NEQ(bad, 0) ^ 1;
+	return EQ(bad, 0);
 }
 
 /*
diff --git a/src/bearssl/src/ec/ec_p256_m31.c b/src/bearssl/src/ec/ec_p256_m31.c
index ec22c3e..d57ef7b 100644
--- a/src/bearssl/src/ec/ec_p256_m31.c
+++ b/src/bearssl/src/ec/ec_p256_m31.c
@@ -1089,7 +1089,7 @@ p256_decode(p256_jacobian *P, const void *src, size_t len)
 	memcpy(P->y, ty, sizeof ty);
 	memset(P->z, 0, sizeof P->z);
 	P->z[0] = 1;
-	return NEQ(bad, 0) ^ 1;
+	return EQ(bad, 0);
 }
 
 /*
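
The two p256_decode() changes replace NEQ(bad, 0) ^ 1 with the equivalent but clearer EQ(bad, 0); both are BearSSL's constant-time predicates, returning 0 or 1 without data-dependent branches. A sketch of the usual construction behind such predicates (illustrative, not BearSSL's exact macro expansion):

	#include <stdint.h>

	/* Returns 1 if x == 0, else 0, with no data-dependent branch:
	   (x | -x) has its top bit set exactly when x is nonzero. */
	static uint32_t eq0(uint32_t x)
	{
		return (uint32_t)1 - ((x | (uint32_t)(0 - x)) >> 31);
	}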
diff --git a/src/bearssl/src/ec/ec_p256_m62.c b/src/bearssl/src/ec/ec_p256_m62.c
new file mode 100644
index 0000000..a431790
--- /dev/null
+++ b/src/bearssl/src/ec/ec_p256_m62.c
@@ -0,0 +1,1765 @@
+/*
+ * Copyright (c) 2018 Thomas Pornin
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+#if BR_INT128 || BR_UMUL128
+
+#if BR_UMUL128
+#include <intrin.h>
+#endif
+
+static const unsigned char P256_G[] = {
+	0x04, 0x6B, 0x17, 0xD1, 0xF2, 0xE1, 0x2C, 0x42, 0x47, 0xF8,
+	0xBC, 0xE6, 0xE5, 0x63, 0xA4, 0x40, 0xF2, 0x77, 0x03, 0x7D,
+	0x81, 0x2D, 0xEB, 0x33, 0xA0, 0xF4, 0xA1, 0x39, 0x45, 0xD8,
+	0x98, 0xC2, 0x96, 0x4F, 0xE3, 0x42, 0xE2, 0xFE, 0x1A, 0x7F,
+	0x9B, 0x8E, 0xE7, 0xEB, 0x4A, 0x7C, 0x0F, 0x9E, 0x16, 0x2B,
+	0xCE, 0x33, 0x57, 0x6B, 0x31, 0x5E, 0xCE, 0xCB, 0xB6, 0x40,
+	0x68, 0x37, 0xBF, 0x51, 0xF5
+};
+
+static const unsigned char P256_N[] = {
+	0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF,
+	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xBC, 0xE6, 0xFA, 0xAD,
+	0xA7, 0x17, 0x9E, 0x84, 0xF3, 0xB9, 0xCA, 0xC2, 0xFC, 0x63,
+	0x25, 0x51
+};
+
+static const unsigned char *
+api_generator(int curve, size_t *len)
+{
+	(void)curve;
+	*len = sizeof P256_G;
+	return P256_G;
+}
+
+static const unsigned char *
+api_order(int curve, size_t *len)
+{
+	(void)curve;
+	*len = sizeof P256_N;
+	return P256_N;
+}
+
+static size_t
+api_xoff(int curve, size_t *len)
+{
+	(void)curve;
+	*len = 32;
+	return 1;
+}
+
+/*
+ * A field element is encoded as five 64-bit integers, in basis 2^52.
+ * Limbs may occasionally exceed 2^52.
+ *
+ * A _partially reduced_ value is such that the following hold:
+ *   - top limb is less than 2^48 + 2^30
+ *   - the other limbs fit on 53 bits each
+ * In particular, such a value is less than twice the modulus p.
+ */
+
+#define BIT(n)    ((uint64_t)1 << (n))
+#define MASK48    (BIT(48) - BIT(0))
+#define MASK52    (BIT(52) - BIT(0))
+
+/* R = 2^260 mod p */
+static const uint64_t F256_R[] = {
+	0x0000000000010, 0xF000000000000, 0xFFFFFFFFFFFFF,
+	0xFFEFFFFFFFFFF, 0x00000000FFFFF
+};
+
+/* Curve equation is y^2 = x^3 - 3*x + B. This constant is B*R mod p
+   (Montgomery representation of B). */
+static const uint64_t P256_B_MONTY[] = {
+	0xDF6229C4BDDFD, 0xCA8843090D89C, 0x212ED6ACF005C,
+	0x83415A220ABF7, 0x0C30061DD4874
+};
+
+/*
+ * Addition in the field. Carry propagation is not performed.
+ * On input, limbs may be up to 63 bits each; on output, they will
+ * be up to one bit more than on input.
+ */
+static inline void
+f256_add(uint64_t *d, const uint64_t *a, const uint64_t *b)
+{
+	d[0] = a[0] + b[0];
+	d[1] = a[1] + b[1];
+	d[2] = a[2] + b[2];
+	d[3] = a[3] + b[3];
+	d[4] = a[4] + b[4];
+}
+
+/*
+ * Partially reduce the provided value.
+ * Input: limbs can go up to 61 bits each.
+ * Output: partially reduced.
+ */
+static inline void
+f256_partial_reduce(uint64_t *a)
+{
+	uint64_t w, cc, s;
+
+	/*
+	 * Propagate carries.
+	 */
+	w = a[0];
+	a[0] = w & MASK52;
+	cc = w >> 52;
+	w = a[1] + cc;
+	a[1] = w & MASK52;
+	cc = w >> 52;
+	w = a[2] + cc;
+	a[2] = w & MASK52;
+	cc = w >> 52;
+	w = a[3] + cc;
+	a[3] = w & MASK52;
+	cc = w >> 52;
+	a[4] += cc;
+
+	s = a[4] >> 48;             /* s < 2^14 */
+	a[0] += s;                  /* a[0] < 2^52 + 2^14 */
+	w = a[1] - (s << 44);
+	a[1] = w & MASK52;          /* a[1] < 2^52 */
+	cc = -(w >> 52) & 0xFFF;    /* cc < 16 */
+	w = a[2] - cc;
+	a[2] = w & MASK52;          /* a[2] < 2^52 */
+	cc = w >> 63;               /* cc = 0 or 1 */
+	w = a[3] - cc - (s << 36);
+	a[3] = w & MASK52;          /* a[3] < 2^52 */
+	cc = w >> 63;               /* cc = 0 or 1 */
+	w = a[4] & MASK48;
+	a[4] = w + (s << 16) - cc;  /* a[4] < 2^48 + 2^30 */
+}
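
Worked mapping for the fold above: p = 2^256 - 2^224 + 2^192 + 2^96 - 1, so 2^256 = 2^224 - 2^192 - 2^96 + 1 mod p. With limbs of weight 2^(52*i), the excess s (of weight 2^256) therefore lands as +(s << 16) on limb 4 (2^224 = 2^(4*52+16)), as -(s << 36) on limb 3 (2^192 = 2^(3*52+36)), as -(s << 44) on limb 1 (2^96 = 2^(52+44)), and as +s on limb 0. Those are exactly the five adjustments performed by f256_partial_reduce().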
+
+/*
+ * Subtraction in the field.
+ * Input: limbs must fit on 60 bits each; in particular, the complete
+ * integer will be less than 2^268 + 2^217.
+ * Output: partially reduced.
+ */
+static inline void
+f256_sub(uint64_t *d, const uint64_t *a, const uint64_t *b)
+{
+	uint64_t t[5], w, s, cc;
+
+	/*
+	 * We compute d = 2^13*p + a - b; this ensures a positive
+	 * intermediate value.
+	 *
+	 * Each individual addition/subtraction may yield a positive or
+	 * negative result; thus, we need to handle a signed carry, thus
+	 * with sign extension. We prefer not to use signed types (int64_t)
+	 * because conversion from unsigned to signed is cumbersome (a
+	 * direct cast with the top bit set is undefined behavior; instead,
+	 * we have to use pointer aliasing, using the guaranteed properties
+	 * of exact-width types, but this requires the compiler to optimize
+	 * away the writes and reads from RAM), and right-shifting a
+	 * signed negative value is implementation-defined. Therefore,
+	 * we use a custom sign extension.
+	 */
+
+	w = a[0] - b[0] - BIT(13);
+	t[0] = w & MASK52;
+	cc = w >> 52;
+	cc |= -(cc & BIT(11));
+	w = a[1] - b[1] + cc;
+	t[1] = w & MASK52;
+	cc = w >> 52;
+	cc |= -(cc & BIT(11));
+	w = a[2] - b[2] + cc;
+	t[2] = (w & MASK52) + BIT(5);
+	cc = w >> 52;
+	cc |= -(cc & BIT(11));
+	w = a[3] - b[3] + cc;
+	t[3] = (w & MASK52) + BIT(49);
+	cc = w >> 52;
+	cc |= -(cc & BIT(11));
+	t[4] = (BIT(61) - BIT(29)) + a[4] - b[4] + cc;
+
+	/*
+	 * Perform partial reduction. Rule is:
+	 *   2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
+	 *
+	 * At that point:
+	 *   0 <= t[0] <= 2^52 - 1
+	 *   0 <= t[1] <= 2^52 - 1
+	 *   2^5 <= t[2] <= 2^52 + 2^5 - 1
+	 *   2^49 <= t[3] <= 2^52 + 2^49 - 1
+	 *   2^59 < t[4] <= 2^61 + 2^60 - 2^29
+	 *
+	 * Thus, the value 's' (t[4] / 2^48) will be necessarily
+	 * greater than 2048, and less than 12288.
+	 */
+	s = t[4] >> 48;
+
+	d[0] = t[0] + s;             /* d[0] <= 2^52 + 12287 */
+	w = t[1] - (s << 44);
+	d[1] = w & MASK52;           /* d[1] <= 2^52 - 1 */
+	cc = -(w >> 52) & 0xFFF;     /* cc <= 48 */
+	w = t[2] - cc;
+	cc = w >> 63;                /* cc = 0 or 1 */
+	d[2] = w + (cc << 52);       /* d[2] <= 2^52 + 31 */
+	w = t[3] - cc - (s << 36);
+	cc = w >> 63;                /* cc = 0 or 1 */
+	d[3] = w + (cc << 52);       /* d[3] <= 2^52 + 2^49 - 1 */
+	d[4] = (t[4] & MASK48) + (s << 16) - cc;  /* d[4] < 2^48 + 2^30 */
+
+	/*
+	 * If s = 0, then none of the limbs is modified, and there cannot
+	 * be an overflow; if s != 0, then (s << 16) > cc, and there is
+	 * no overflow either.
+	 */
+}
+ */
+static void
+f256_montymul(uint64_t *d, const uint64_t *a, const uint64_t *b)
+{
+#if BR_INT128
+
+ int i;
+ uint64_t t[5];
+
+ t[0] = 0;
+ t[1] = 0;
+ t[2] = 0;
+ t[3] = 0;
+ t[4] = 0;
+ for (i = 0; i < 5; i ++) {
+ uint64_t x, f, cc, w, s;
+ unsigned __int128 z;
+
+ /*
+ * Since limbs of a[] and b[] fit on 56 bits each,
+ * each individual product fits on 112 bits. Also,
+ * the factor f fits on 52 bits, so f<<48 fits on
+ * 112 bits too. This guarantees that carries (cc)
+ * will fit on 62 bits, thus no overflow.
+ *
+ * The operations below compute:
+ * t <- (t + x*b + f*p) / 2^52
+ */
+ x = a[i];
+ z = (unsigned __int128)b[0] * (unsigned __int128)x
+ + (unsigned __int128)t[0];
+ f = (uint64_t)z & MASK52;
+ cc = (uint64_t)(z >> 52);
+ z = (unsigned __int128)b[1] * (unsigned __int128)x
+ + (unsigned __int128)t[1] + cc
+ + ((unsigned __int128)f << 44);
+ t[0] = (uint64_t)z & MASK52;
+ cc = (uint64_t)(z >> 52);
+ z = (unsigned __int128)b[2] * (unsigned __int128)x
+ + (unsigned __int128)t[2] + cc;
+ t[1] = (uint64_t)z & MASK52;
+ cc = (uint64_t)(z >> 52);
+ z = (unsigned __int128)b[3] * (unsigned __int128)x
+ + (unsigned __int128)t[3] + cc
+ + ((unsigned __int128)f << 36);
+ t[2] = (uint64_t)z & MASK52;
+ cc = (uint64_t)(z >> 52);
+ z = (unsigned __int128)b[4] * (unsigned __int128)x
+ + (unsigned __int128)t[4] + cc
+ + ((unsigned __int128)f << 48)
+ - ((unsigned __int128)f << 16);
+ t[3] = (uint64_t)z & MASK52;
+ t[4] = (uint64_t)(z >> 52);
+
+ /*
+ * t[4] may be up to 62 bits here; we need to do a
+ * partial reduction. Note that limbs t[0] to t[3]
+ * fit on 52 bits each.
+ */
+ s = t[4] >> 48; /* s < 2^14 */
+ t[0] += s; /* t[0] < 2^52 + 2^14 */
+ w = t[1] - (s << 44);
+ t[1] = w & MASK52; /* t[1] < 2^52 */
+ cc = -(w >> 52) & 0xFFF; /* cc < 16 */
+ w = t[2] - cc;
+ t[2] = w & MASK52; /* t[2] < 2^52 */
+ cc = w >> 63; /* cc = 0 or 1 */
+ w = t[3] - cc - (s << 36);
+ t[3] = w & MASK52; /* t[3] < 2^52 */
+ cc = w >> 63; /* cc = 0 or 1 */
+ w = t[4] & MASK48;
+ t[4] = w + (s << 16) - cc; /* t[4] < 2^48 + 2^30 */
+
+ /*
+ * The final t[4] cannot overflow because cc is 0 or 1,
+ * and cc can be 1 only if s != 0.
+ */
+ }
+
+ d[0] = t[0];
+ d[1] = t[1];
+ d[2] = t[2];
+ d[3] = t[3];
+ d[4] = t[4];
+
+#elif BR_UMUL128
+
+ int i;
+ uint64_t t[5];
+
+ t[0] = 0;
+ t[1] = 0;
+ t[2] = 0;
+ t[3] = 0;
+ t[4] = 0;
+ for (i = 0; i < 5; i ++) {
+ uint64_t x, f, cc, w, s, zh, zl;
+ unsigned char k;
+
+ /*
+ * Since limbs of a[] and b[] fit on 56 bits each,
+ * each individual product fits on 112 bits. Also,
+ * the factor f fits on 52 bits, so f<<48 fits on
+ * 112 bits too. This guarantees that carries (cc)
+ * will fit on 62 bits, thus no overflow.
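+ *
+ * Since p = -1 mod 2^52, taking f = (t[0] + x*b[0]) mod 2^52
+ * makes the low limb of t + x*b + f*p vanish, so the division
+ * below is exact.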
+ *
+ * The operations below compute:
+ * t <- (t + x*b + f*p) / 2^52
+ */
+ x = a[i];
+ zl = _umul128(b[0], x, &zh);
+ k = _addcarry_u64(0, t[0], zl, &zl);
+ (void)_addcarry_u64(k, 0, zh, &zh);
+ f = zl & MASK52;
+ cc = (zl >> 52) | (zh << 12);
+
+ zl = _umul128(b[1], x, &zh);
+ k = _addcarry_u64(0, t[1], zl, &zl);
+ (void)_addcarry_u64(k, 0, zh, &zh);
+ k = _addcarry_u64(0, cc, zl, &zl);
+ (void)_addcarry_u64(k, 0, zh, &zh);
+ k = _addcarry_u64(0, f << 44, zl, &zl);
+ (void)_addcarry_u64(k, f >> 20, zh, &zh);
+ t[0] = zl & MASK52;
+ cc = (zl >> 52) | (zh << 12);
+
+ zl = _umul128(b[2], x, &zh);
+ k = _addcarry_u64(0, t[2], zl, &zl);
+ (void)_addcarry_u64(k, 0, zh, &zh);
+ k = _addcarry_u64(0, cc, zl, &zl);
+ (void)_addcarry_u64(k, 0, zh, &zh);
+ t[1] = zl & MASK52;
+ cc = (zl >> 52) | (zh << 12);
+
+ zl = _umul128(b[3], x, &zh);
+ k = _addcarry_u64(0, t[3], zl, &zl);
+ (void)_addcarry_u64(k, 0, zh, &zh);
+ k = _addcarry_u64(0, cc, zl, &zl);
+ (void)_addcarry_u64(k, 0, zh, &zh);
+ k = _addcarry_u64(0, f << 36, zl, &zl);
+ (void)_addcarry_u64(k, f >> 28, zh, &zh);
+ t[2] = zl & MASK52;
+ cc = (zl >> 52) | (zh << 12);
+
+ zl = _umul128(b[4], x, &zh);
+ k = _addcarry_u64(0, t[4], zl, &zl);
+ (void)_addcarry_u64(k, 0, zh, &zh);
+ k = _addcarry_u64(0, cc, zl, &zl);
+ (void)_addcarry_u64(k, 0, zh, &zh);
+ k = _addcarry_u64(0, f << 48, zl, &zl);
+ (void)_addcarry_u64(k, f >> 16, zh, &zh);
+ k = _subborrow_u64(0, zl, f << 16, &zl);
+ (void)_subborrow_u64(k, zh, f >> 48, &zh);
+ t[3] = zl & MASK52;
+ t[4] = (zl >> 52) | (zh << 12);
+
+ /*
+ * t[4] may be up to 62 bits here; we need to do a
+ * partial reduction. Note that limbs t[0] to t[3]
+ * fit on 52 bits each.
+ */
+ s = t[4] >> 48; /* s < 2^14 */
+ t[0] += s; /* t[0] < 2^52 + 2^14 */
+ w = t[1] - (s << 44);
+ t[1] = w & MASK52; /* t[1] < 2^52 */
+ cc = -(w >> 52) & 0xFFF; /* cc < 16 */
+ w = t[2] - cc;
+ t[2] = w & MASK52; /* t[2] < 2^52 */
+ cc = w >> 63; /* cc = 0 or 1 */
+ w = t[3] - cc - (s << 36);
+ t[3] = w & MASK52; /* t[3] < 2^52 */
+ cc = w >> 63; /* cc = 0 or 1 */
+ w = t[4] & MASK48;
+ t[4] = w + (s << 16) - cc; /* t[4] < 2^48 + 2^30 */
+
+ /*
+ * The final t[4] cannot overflow because cc is 0 or 1,
+ * and cc can be 1 only if s != 0.
+ */
+ }
+
+ d[0] = t[0];
+ d[1] = t[1];
+ d[2] = t[2];
+ d[3] = t[3];
+ d[4] = t[4];
+
+#endif
+}
+
+/*
+ * Montgomery squaring in the field; currently a basic wrapper around
+ * multiplication (inline, should be optimized away).
+ * TODO: see if some extra speed can be gained here.
+ */
+static inline void
+f256_montysquare(uint64_t *d, const uint64_t *a)
+{
+ f256_montymul(d, a, a);
+}
+
+/*
+ * Convert to Montgomery representation.
+ */
+static void
+f256_tomonty(uint64_t *d, const uint64_t *a)
+{
+ /*
+ * R2 = 2^520 mod p.
+ * If R = 2^260 mod p, then R2 = R^2 mod p; and the Montgomery
+ * multiplication of a by R2 is: a*R2/R = a*R mod p, i.e. the
+ * conversion to Montgomery representation.
+ */
+ static const uint64_t R2[] = {
+ 0x0000000000300, 0xFFFFFFFF00000, 0xFFFFEFFFFFFFB,
+ 0xFDFFFFFFFFFFF, 0x0000004FFFFFF
+ };
+
+ f256_montymul(d, a, R2);
+}
+
+/*
+ * Convert from Montgomery representation.
+ */
+static void
+f256_frommonty(uint64_t *d, const uint64_t *a)
+{
+ /*
+ * Montgomery multiplication by 1 is division by 2^260 modulo p.
+ */
+ static const uint64_t one[] = { 1, 0, 0, 0, 0 };
+
+ f256_montymul(d, a, one);
+}
+
+/*
+ * Inversion in the field. If the source value is 0 modulo p, then this
+ * returns 0 or p. This function uses Montgomery representation.
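+ * Inversion uses Fermat's little theorem: a^(p-2) = 1/a mod p for
+ * a != 0. The exponent is fixed and public, so the
+ * square-and-multiply sequence below is inherently constant-time.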
+ */ +static void +f256_invert(uint64_t *d, const uint64_t *a) +{ + /* + * We compute a^(p-2) mod p. The exponent pattern (from high to + * low) is: + * - 32 bits of value 1 + * - 31 bits of value 0 + * - 1 bit of value 1 + * - 96 bits of value 0 + * - 94 bits of value 1 + * - 1 bit of value 0 + * - 1 bit of value 1 + * To speed up the square-and-multiply algorithm, we precompute + * a^(2^31-1). + */ + + uint64_t r[5], t[5]; + int i; + + memcpy(t, a, sizeof t); + for (i = 0; i < 30; i ++) { + f256_montysquare(t, t); + f256_montymul(t, t, a); + } + + memcpy(r, t, sizeof t); + for (i = 224; i >= 0; i --) { + f256_montysquare(r, r); + switch (i) { + case 0: + case 2: + case 192: + case 224: + f256_montymul(r, r, a); + break; + case 3: + case 34: + case 65: + f256_montymul(r, r, t); + break; + } + } + memcpy(d, r, sizeof r); +} + +/* + * Finalize reduction. + * Input value should be partially reduced. + * On output, limbs a[0] to a[3] fit on 52 bits each, limb a[4] fits + * on 48 bits, and the integer is less than p. + */ +static inline void +f256_final_reduce(uint64_t *a) +{ + uint64_t r[5], t[5], w, cc; + int i; + + /* + * Propagate carries to ensure that limbs 0 to 3 fit on 52 bits. + */ + cc = 0; + for (i = 0; i < 5; i ++) { + w = a[i] + cc; + r[i] = w & MASK52; + cc = w >> 52; + } + + /* + * We compute t = r + (2^256 - p) = r + 2^224 - 2^192 - 2^96 + 1. + * If t < 2^256, then r < p, and we return r. Otherwise, we + * want to return r - p = t - 2^256. + */ + + /* + * Add 2^224 + 1, and propagate carries to ensure that limbs + * t[0] to t[3] fit in 52 bits each. + */ + w = r[0] + 1; + t[0] = w & MASK52; + cc = w >> 52; + w = r[1] + cc; + t[1] = w & MASK52; + cc = w >> 52; + w = r[2] + cc; + t[2] = w & MASK52; + cc = w >> 52; + w = r[3] + cc; + t[3] = w & MASK52; + cc = w >> 52; + t[4] = r[4] + cc + BIT(16); + + /* + * Subtract 2^192 + 2^96. Since we just added 2^224 + 1, the + * result cannot be negative. + */ + w = t[1] - BIT(44); + t[1] = w & MASK52; + cc = w >> 63; + w = t[2] - cc; + t[2] = w & MASK52; + cc = w >> 63; + w = t[3] - BIT(36) - cc; + t[3] = w & MASK52; + cc = w >> 63; + t[4] -= cc; + + /* + * If the top limb t[4] fits on 48 bits, then r[] is already + * in the proper range. Otherwise, t[] is the value to return + * (truncated to 256 bits). + */ + cc = -(t[4] >> 48); + t[4] &= MASK48; + for (i = 0; i < 5; i ++) { + a[i] = r[i] ^ (cc & (r[i] ^ t[i])); + } +} + +/* + * Points in affine and Jacobian coordinates. + * + * - In affine coordinates, the point-at-infinity cannot be encoded. + * - Jacobian coordinates (X,Y,Z) correspond to affine (X/Z^2,Y/Z^3); + * if Z = 0 then this is the point-at-infinity. + */ +typedef struct { + uint64_t x[5]; + uint64_t y[5]; +} p256_affine; + +typedef struct { + uint64_t x[5]; + uint64_t y[5]; + uint64_t z[5]; +} p256_jacobian; + +/* + * Decode a field element (unsigned big endian notation). + */ +static void +f256_decode(uint64_t *a, const unsigned char *buf) +{ + uint64_t w0, w1, w2, w3; + + w3 = br_dec64be(buf + 0); + w2 = br_dec64be(buf + 8); + w1 = br_dec64be(buf + 16); + w0 = br_dec64be(buf + 24); + a[0] = w0 & MASK52; + a[1] = ((w0 >> 52) | (w1 << 12)) & MASK52; + a[2] = ((w1 >> 40) | (w2 << 24)) & MASK52; + a[3] = ((w2 >> 28) | (w3 << 36)) & MASK52; + a[4] = w3 >> 16; +} + +/* + * Encode a field element (unsigned big endian notation). The field + * element MUST be fully reduced. 
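+ * This is the exact inverse of f256_decode(): limb i holds bits
+ * 52*i to 52*i+51 of the integer (the top limb holds bits 208 to
+ * 255), and the shifts below repack the limbs into four 64-bit
+ * words, written out in big-endian order.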
+ */ +static void +f256_encode(unsigned char *buf, const uint64_t *a) +{ + uint64_t w0, w1, w2, w3; + + w0 = a[0] | (a[1] << 52); + w1 = (a[1] >> 12) | (a[2] << 40); + w2 = (a[2] >> 24) | (a[3] << 28); + w3 = (a[3] >> 36) | (a[4] << 16); + br_enc64be(buf + 0, w3); + br_enc64be(buf + 8, w2); + br_enc64be(buf + 16, w1); + br_enc64be(buf + 24, w0); +} + +/* + * Decode a point. The returned point is in Jacobian coordinates, but + * with z = 1. If the encoding is invalid, or encodes a point which is + * not on the curve, or encodes the point at infinity, then this function + * returns 0. Otherwise, 1 is returned. + * + * The buffer is assumed to have length exactly 65 bytes. + */ +static uint32_t +point_decode(p256_jacobian *P, const unsigned char *buf) +{ + uint64_t x[5], y[5], t[5], x3[5], tt; + uint32_t r; + + /* + * Header byte shall be 0x04. + */ + r = EQ(buf[0], 0x04); + + /* + * Decode X and Y coordinates, and convert them into + * Montgomery representation. + */ + f256_decode(x, buf + 1); + f256_decode(y, buf + 33); + f256_tomonty(x, x); + f256_tomonty(y, y); + + /* + * Verify y^2 = x^3 + A*x + B. In curve P-256, A = -3. + * Note that the Montgomery representation of 0 is 0. We must + * take care to apply the final reduction to make sure we have + * 0 and not p. + */ + f256_montysquare(t, y); + f256_montysquare(x3, x); + f256_montymul(x3, x3, x); + f256_sub(t, t, x3); + f256_add(t, t, x); + f256_add(t, t, x); + f256_add(t, t, x); + f256_sub(t, t, P256_B_MONTY); + f256_final_reduce(t); + tt = t[0] | t[1] | t[2] | t[3] | t[4]; + r &= EQ((uint32_t)(tt | (tt >> 32)), 0); + + /* + * Return the point in Jacobian coordinates (and Montgomery + * representation). + */ + memcpy(P->x, x, sizeof x); + memcpy(P->y, y, sizeof y); + memcpy(P->z, F256_R, sizeof F256_R); + return r; +} + +/* + * Final conversion for a point: + * - The point is converted back to affine coordinates. + * - Final reduction is performed. + * - The point is encoded into the provided buffer. + * + * If the point is the point-at-infinity, all operations are performed, + * but the buffer contents are indeterminate, and 0 is returned. Otherwise, + * the encoded point is written in the buffer, and 1 is returned. + */ +static uint32_t +point_encode(unsigned char *buf, const p256_jacobian *P) +{ + uint64_t t1[5], t2[5], z; + + /* Set t1 = 1/z^2 and t2 = 1/z^3. */ + f256_invert(t2, P->z); + f256_montysquare(t1, t2); + f256_montymul(t2, t2, t1); + + /* Compute affine coordinates x (in t1) and y (in t2). */ + f256_montymul(t1, P->x, t1); + f256_montymul(t2, P->y, t2); + + /* Convert back from Montgomery representation, and finalize + reductions. */ + f256_frommonty(t1, t1); + f256_frommonty(t2, t2); + f256_final_reduce(t1); + f256_final_reduce(t2); + + /* Encode. */ + buf[0] = 0x04; + f256_encode(buf + 1, t1); + f256_encode(buf + 33, t2); + + /* Return success if and only if P->z != 0. */ + z = P->z[0] | P->z[1] | P->z[2] | P->z[3] | P->z[4]; + return NEQ((uint32_t)(z | z >> 32), 0); +} + +/* + * Point doubling in Jacobian coordinates: point P is doubled. + * Note: if the source point is the point-at-infinity, then the result is + * still the point-at-infinity, which is correct. Moreover, if the three + * coordinates were zero, then they still are zero in the returned value. 
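+ *
+ * This sequence uses 4 field multiplications and 4 squarings.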
+ */
+static void
+p256_double(p256_jacobian *P)
+{
+ /*
+ * Doubling formulas are:
+ *
+ * s = 4*x*y^2
+ * m = 3*(x + z^2)*(x - z^2)
+ * x' = m^2 - 2*s
+ * y' = m*(s - x') - 8*y^4
+ * z' = 2*y*z
+ *
+ * These formulas work for all points, including points of order 2
+ * and points at infinity:
+ * - If y = 0 then z' = 0. But there is no such point in P-256
+ * anyway.
+ * - If z = 0 then z' = 0.
+ */
+ uint64_t t1[5], t2[5], t3[5], t4[5];
+
+ /*
+ * Compute z^2 in t1.
+ */
+ f256_montysquare(t1, P->z);
+
+ /*
+ * Compute x+z^2 in t2 and x-z^2 in t1.
+ */
+ f256_add(t2, P->x, t1);
+ f256_sub(t1, P->x, t1);
+
+ /*
+ * Compute 3*(x+z^2)*(x-z^2) in t1.
+ */
+ f256_montymul(t3, t1, t2);
+ f256_add(t1, t3, t3);
+ f256_add(t1, t3, t1);
+
+ /*
+ * Compute 4*x*y^2 (in t2) and 2*y^2 (in t3).
+ */
+ f256_montysquare(t3, P->y);
+ f256_add(t3, t3, t3);
+ f256_montymul(t2, P->x, t3);
+ f256_add(t2, t2, t2);
+
+ /*
+ * Compute x' = m^2 - 2*s.
+ */
+ f256_montysquare(P->x, t1);
+ f256_sub(P->x, P->x, t2);
+ f256_sub(P->x, P->x, t2);
+
+ /*
+ * Compute z' = 2*y*z.
+ */
+ f256_montymul(t4, P->y, P->z);
+ f256_add(P->z, t4, t4);
+ f256_partial_reduce(P->z);
+
+ /*
+ * Compute y' = m*(s - x') - 8*y^4. Note that we already have
+ * 2*y^2 in t3.
+ */
+ f256_sub(t2, t2, P->x);
+ f256_montymul(P->y, t1, t2);
+ f256_montysquare(t4, t3);
+ f256_add(t4, t4, t4);
+ f256_sub(P->y, P->y, t4);
+}
+
+/*
+ * Point addition (Jacobian coordinates): P1 is replaced with P1+P2.
+ * This function computes the wrong result in the following cases:
+ *
+ * - If P1 == 0 but P2 != 0
+ * - If P1 != 0 but P2 == 0
+ * - If P1 == P2
+ *
+ * In all three cases, P1 is set to the point at infinity.
+ *
+ * Returned value is 0 if one of the following occurs:
+ *
+ * - P1 and P2 have the same Y coordinate.
+ * - P1 == 0 and P2 == 0.
+ * - The Y coordinate of one of the points is 0 and the other point is
+ * the point at infinity.
+ *
+ * The third case cannot actually happen with valid points, since a point
+ * with Y == 0 is a point of order 2, and there is no point of order 2 on
+ * curve P-256.
+ *
+ * Therefore, assuming that P1 != 0 and P2 != 0 on input, then the caller
+ * can apply the following:
+ *
+ * - If the result is not the point at infinity, then it is correct.
+ * - Otherwise, if the returned value is 1, then this is a case of
+ * P1+P2 == 0, so the result is indeed the point at infinity.
+ * - Otherwise, P1 == P2, so a "double" operation should have been
+ * performed.
+ *
+ * Note that you can get a returned value of 0 with a correct result,
+ * e.g. if P1 and P2 have the same Y coordinate, but distinct X coordinates.
+ */
+static uint32_t
+p256_add(p256_jacobian *P1, const p256_jacobian *P2)
+{
+ /*
+ * Addition formulas are:
+ *
+ * u1 = x1 * z2^2
+ * u2 = x2 * z1^2
+ * s1 = y1 * z2^3
+ * s2 = y2 * z1^3
+ * h = u2 - u1
+ * r = s2 - s1
+ * x3 = r^2 - h^3 - 2 * u1 * h^2
+ * y3 = r * (u1 * h^2 - x3) - s1 * h^3
+ * z3 = h * z1 * z2
+ */
+ uint64_t t1[5], t2[5], t3[5], t4[5], t5[5], t6[5], t7[5], tt;
+ uint32_t ret;
+
+ /*
+ * Compute u1 = x1*z2^2 (in t1) and s1 = y1*z2^3 (in t3).
+ */
+ f256_montysquare(t3, P2->z);
+ f256_montymul(t1, P1->x, t3);
+ f256_montymul(t4, P2->z, t3);
+ f256_montymul(t3, P1->y, t4);
+
+ /*
+ * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
+ */
+ f256_montysquare(t4, P1->z);
+ f256_montymul(t2, P2->x, t4);
+ f256_montymul(t5, P1->z, t4);
+ f256_montymul(t4, P2->y, t5);
+
+ /*
+ * Compute h = u2 - u1 (in t2) and r = s2 - s1 (in t4).
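+ * (h = 0 with r != 0 means P1 = -P2; h = 0 and r = 0 together
+ * mean P1 = P2, the "double" case that these formulas mishandle.)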
+ * We need to test whether r is zero, so we will do some extra
+ * reduction.
+ */
+ f256_sub(t2, t2, t1);
+ f256_sub(t4, t4, t3);
+ f256_final_reduce(t4);
+ tt = t4[0] | t4[1] | t4[2] | t4[3] | t4[4];
+ ret = (uint32_t)(tt | (tt >> 32));
+ ret = (ret | -ret) >> 31;
+
+ /*
+ * Compute u1*h^2 (in t6) and h^3 (in t5);
+ */
+ f256_montysquare(t7, t2);
+ f256_montymul(t6, t1, t7);
+ f256_montymul(t5, t7, t2);
+
+ /*
+ * Compute x3 = r^2 - h^3 - 2*u1*h^2.
+ */
+ f256_montysquare(P1->x, t4);
+ f256_sub(P1->x, P1->x, t5);
+ f256_sub(P1->x, P1->x, t6);
+ f256_sub(P1->x, P1->x, t6);
+
+ /*
+ * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
+ */
+ f256_sub(t6, t6, P1->x);
+ f256_montymul(P1->y, t4, t6);
+ f256_montymul(t1, t5, t3);
+ f256_sub(P1->y, P1->y, t1);
+
+ /*
+ * Compute z3 = h*z1*z2.
+ */
+ f256_montymul(t1, P1->z, P2->z);
+ f256_montymul(P1->z, t1, t2);
+
+ return ret;
+}
+
+/*
+ * Point addition (mixed coordinates): P1 is replaced with P1+P2.
+ * This is a specialised function for the case when P2 is a non-zero point
+ * in affine coordinates.
+ *
+ * This function computes the wrong result in the following cases:
+ *
+ * - If P1 == 0
+ * - If P1 == P2
+ *
+ * In both cases, P1 is set to the point at infinity.
+ *
+ * Returned value is 0 if one of the following occurs:
+ *
+ * - P1 and P2 have the same Y (affine) coordinate.
+ * - The Y coordinate of P2 is 0 and P1 is the point at infinity.
+ *
+ * The second case cannot actually happen with valid points, since a point
+ * with Y == 0 is a point of order 2, and there is no point of order 2 on
+ * curve P-256.
+ *
+ * Therefore, assuming that P1 != 0 on input, then the caller
+ * can apply the following:
+ *
+ * - If the result is not the point at infinity, then it is correct.
+ * - Otherwise, if the returned value is 1, then this is a case of
+ * P1+P2 == 0, so the result is indeed the point at infinity.
+ * - Otherwise, P1 == P2, so a "double" operation should have been
+ * performed.
+ *
+ * Again, a value of 0 may be returned in some cases where the addition
+ * result is correct.
+ */
+static uint32_t
+p256_add_mixed(p256_jacobian *P1, const p256_affine *P2)
+{
+ /*
+ * Addition formulas are:
+ *
+ * u1 = x1
+ * u2 = x2 * z1^2
+ * s1 = y1
+ * s2 = y2 * z1^3
+ * h = u2 - u1
+ * r = s2 - s1
+ * x3 = r^2 - h^3 - 2 * u1 * h^2
+ * y3 = r * (u1 * h^2 - x3) - s1 * h^3
+ * z3 = h * z1
+ */
+ uint64_t t1[5], t2[5], t3[5], t4[5], t5[5], t6[5], t7[5], tt;
+ uint32_t ret;
+
+ /*
+ * Compute u1 = x1 (in t1) and s1 = y1 (in t3).
+ */
+ memcpy(t1, P1->x, sizeof t1);
+ memcpy(t3, P1->y, sizeof t3);
+
+ /*
+ * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
+ */
+ f256_montysquare(t4, P1->z);
+ f256_montymul(t2, P2->x, t4);
+ f256_montymul(t5, P1->z, t4);
+ f256_montymul(t4, P2->y, t5);
+
+ /*
+ * Compute h = u2 - u1 (in t2) and r = s2 - s1 (in t4).
+ * We need to test whether r is zero, so we will do some extra
+ * reduction.
+ */
+ f256_sub(t2, t2, t1);
+ f256_sub(t4, t4, t3);
+ f256_final_reduce(t4);
+ tt = t4[0] | t4[1] | t4[2] | t4[3] | t4[4];
+ ret = (uint32_t)(tt | (tt >> 32));
+ ret = (ret | -ret) >> 31;
+
+ /*
+ * Compute u1*h^2 (in t6) and h^3 (in t5);
+ */
+ f256_montysquare(t7, t2);
+ f256_montymul(t6, t1, t7);
+ f256_montymul(t5, t7, t2);
+
+ /*
+ * Compute x3 = r^2 - h^3 - 2*u1*h^2.
+ */
+ f256_montysquare(P1->x, t4);
+ f256_sub(P1->x, P1->x, t5);
+ f256_sub(P1->x, P1->x, t6);
+ f256_sub(P1->x, P1->x, t6);
+
+ /*
+ * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
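+ * (t6 still holds u1*h^2 and t5 holds h^3 from the step above.)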
+ */
+ f256_sub(t6, t6, P1->x);
+ f256_montymul(P1->y, t4, t6);
+ f256_montymul(t1, t5, t3);
+ f256_sub(P1->y, P1->y, t1);
+
+ /*
+ * Compute z3 = h*z1.
+ */
+ f256_montymul(P1->z, P1->z, t2);
+
+ return ret;
+}
+
+#if 0
+/* unused */
+/*
+ * Point addition (mixed coordinates, complete): P1 is replaced with P1+P2.
+ * This is a specialised function for the case when P2 is a non-zero point
+ * in affine coordinates.
+ *
+ * This function returns the correct result in all cases.
+ */
+static void
+p256_add_complete_mixed(p256_jacobian *P1, const p256_affine *P2)
+{
+ /*
+ * Addition formulas, in the general case, are:
+ *
+ * u1 = x1
+ * u2 = x2 * z1^2
+ * s1 = y1
+ * s2 = y2 * z1^3
+ * h = u2 - u1
+ * r = s2 - s1
+ * x3 = r^2 - h^3 - 2 * u1 * h^2
+ * y3 = r * (u1 * h^2 - x3) - s1 * h^3
+ * z3 = h * z1
+ *
+ * These formulas mishandle the two following cases:
+ *
+ * - If P1 is the point-at-infinity (z1 = 0), then z3 is
+ * incorrectly set to 0.
+ *
+ * - If P1 = P2, then u1 = u2 and s1 = s2, and x3, y3 and z3
+ * are all set to 0.
+ *
+ * However, if P1 + P2 = 0, then u1 = u2 but s1 != s2, and then
+ * we correctly get z3 = 0 (the point-at-infinity).
+ *
+ * To fix the case P1 = 0, we perform at the end a copy of P2
+ * over P1, conditional to z1 = 0.
+ *
+ * For P1 = P2: in that case, both h and r are set to 0, and
+ * we get x3, y3 and z3 equal to 0. We can test for that
+ * occurrence to make a mask which will be all-one if P1 = P2,
+ * or all-zero otherwise; then we can compute the double of P2
+ * and add it, combined with the mask, to (x3,y3,z3).
+ *
+ * Using the doubling formulas in p256_double() on (x2,y2),
+ * simplifying since P2 is affine (i.e. z2 = 1, implicitly),
+ * we get:
+ * s = 4*x2*y2^2
+ * m = 3*(x2 + 1)*(x2 - 1)
+ * x' = m^2 - 2*s
+ * y' = m*(s - x') - 8*y2^4
+ * z' = 2*y2
+ * which requires only 6 multiplications. Added to the 11
+ * multiplications of the normal mixed addition in Jacobian
+ * coordinates, we get a cost of 17 multiplications in total.
+ */
+ uint64_t t1[5], t2[5], t3[5], t4[5], t5[5], t6[5], t7[5], tt, zz;
+ int i;
+
+ /*
+ * Set zz to -1 if P1 is the point at infinity, 0 otherwise.
+ */
+ zz = P1->z[0] | P1->z[1] | P1->z[2] | P1->z[3] | P1->z[4];
+ zz = ((zz | -zz) >> 63) - (uint64_t)1;
+
+ /*
+ * Compute u1 = x1 (in t1) and s1 = y1 (in t3).
+ */
+ memcpy(t1, P1->x, sizeof t1);
+ memcpy(t3, P1->y, sizeof t3);
+
+ /*
+ * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
+ */
+ f256_montysquare(t4, P1->z);
+ f256_montymul(t2, P2->x, t4);
+ f256_montymul(t5, P1->z, t4);
+ f256_montymul(t4, P2->y, t5);
+
+ /*
+ * Compute h = u2 - u1 (in t2) and r = s2 - s1 (in t4).
+ */
+ f256_sub(t2, t2, t1);
+ f256_sub(t4, t4, t3);
+
+ /*
+ * If both h = 0 and r = 0, then P1 = P2, and we want to set
+ * the mask tt to -1; otherwise, the mask will be 0.
+ */
+ f256_final_reduce(t2);
+ f256_final_reduce(t4);
+ tt = t2[0] | t2[1] | t2[2] | t2[3] | t2[4]
+ | t4[0] | t4[1] | t4[2] | t4[3] | t4[4];
+ tt = ((tt | -tt) >> 63) - (uint64_t)1;
+
+ /*
+ * Compute u1*h^2 (in t6) and h^3 (in t5);
+ */
+ f256_montysquare(t7, t2);
+ f256_montymul(t6, t1, t7);
+ f256_montymul(t5, t7, t2);
+
+ /*
+ * Compute x3 = r^2 - h^3 - 2*u1*h^2.
+ */
+ f256_montysquare(P1->x, t4);
+ f256_sub(P1->x, P1->x, t5);
+ f256_sub(P1->x, P1->x, t6);
+ f256_sub(P1->x, P1->x, t6);
+
+ /*
+ * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
+ */
+ f256_sub(t6, t6, P1->x);
+ f256_montymul(P1->y, t4, t6);
+ f256_montymul(t1, t5, t3);
+ f256_sub(P1->y, P1->y, t1);
+
+ /*
+ * Compute z3 = h*z1.
+ */
+ f256_montymul(P1->z, P1->z, t2);
+
+ /*
+ * The "double" result, in case P1 = P2.
+ */
+
+ /*
+ * Compute z' = 2*y2 (in t1).
+ */
+ f256_add(t1, P2->y, P2->y);
+ f256_partial_reduce(t1);
+
+ /*
+ * Compute 2*(y2^2) (in t2) and s = 4*x2*(y2^2) (in t3).
+ */
+ f256_montysquare(t2, P2->y);
+ f256_add(t2, t2, t2);
+ f256_add(t3, t2, t2);
+ f256_montymul(t3, P2->x, t3);
+
+ /*
+ * Compute m = 3*(x2^2 - 1) (in t4).
+ */
+ f256_montysquare(t4, P2->x);
+ f256_sub(t4, t4, F256_R);
+ f256_add(t5, t4, t4);
+ f256_add(t4, t4, t5);
+
+ /*
+ * Compute x' = m^2 - 2*s (in t5).
+ */
+ f256_montysquare(t5, t4);
+ f256_sub(t5, t5, t3);
+ f256_sub(t5, t5, t3);
+
+ /*
+ * Compute y' = m*(s - x') - 8*y2^4 (in t6).
+ */
+ f256_sub(t6, t3, t5);
+ f256_montymul(t6, t6, t4);
+ f256_montysquare(t7, t2);
+ f256_sub(t6, t6, t7);
+ f256_sub(t6, t6, t7);
+
+ /*
+ * We now have the alternate (doubling) coordinates in (t5,t6,t1).
+ * We combine them with (x3,y3,z3).
+ */
+ for (i = 0; i < 5; i ++) {
+ P1->x[i] |= tt & t5[i];
+ P1->y[i] |= tt & t6[i];
+ P1->z[i] |= tt & t1[i];
+ }
+
+ /*
+ * If P1 = 0, then we get z3 = 0 (which is invalid); if z1 is 0,
+ * then we want to replace the result with a copy of P2. The
+ * test on z1 was done at the start, in the zz mask.
+ */
+ for (i = 0; i < 5; i ++) {
+ P1->x[i] ^= zz & (P1->x[i] ^ P2->x[i]);
+ P1->y[i] ^= zz & (P1->y[i] ^ P2->y[i]);
+ P1->z[i] ^= zz & (P1->z[i] ^ F256_R[i]);
+ }
+}
+#endif
+
+/*
+ * Inner function for computing a point multiplication. A window is
+ * provided, with points 1*P to 15*P in affine coordinates.
+ *
+ * Assumptions:
+ * - All provided points are valid points on the curve.
+ * - Multiplier is non-zero, and smaller than the curve order.
+ * - Everything is in Montgomery representation.
+ */
+static void
+point_mul_inner(p256_jacobian *R, const p256_affine *W,
+ const unsigned char *k, size_t klen)
+{
+ p256_jacobian Q;
+ uint32_t qz;
+
+ memset(&Q, 0, sizeof Q);
+ qz = 1;
+ while (klen -- > 0) {
+ int i;
+ unsigned bk;
+
+ bk = *k ++;
+ for (i = 0; i < 2; i ++) {
+ uint32_t bits;
+ uint32_t bnz;
+ p256_affine T;
+ p256_jacobian U;
+ uint32_t n;
+ int j;
+ uint64_t m;
+
+ p256_double(&Q);
+ p256_double(&Q);
+ p256_double(&Q);
+ p256_double(&Q);
+ bits = (bk >> 4) & 0x0F;
+ bnz = NEQ(bits, 0);
+
+ /*
+ * Lookup point in window. If the bits are 0,
+ * we get something invalid, which is not a
+ * problem because we will use it only if the
+ * bits are non-zero.
+ */
+ memset(&T, 0, sizeof T);
+ for (n = 0; n < 15; n ++) {
+ m = -(uint64_t)EQ(bits, n + 1);
+ T.x[0] |= m & W[n].x[0];
+ T.x[1] |= m & W[n].x[1];
+ T.x[2] |= m & W[n].x[2];
+ T.x[3] |= m & W[n].x[3];
+ T.x[4] |= m & W[n].x[4];
+ T.y[0] |= m & W[n].y[0];
+ T.y[1] |= m & W[n].y[1];
+ T.y[2] |= m & W[n].y[2];
+ T.y[3] |= m & W[n].y[3];
+ T.y[4] |= m & W[n].y[4];
+ }
+
+ U = Q;
+ p256_add_mixed(&U, &T);
+
+ /*
+ * If qz is still 1, then Q was all-zeros, and this
+ * is conserved through p256_double().
+ */
+ m = -(uint64_t)(bnz & qz);
+ for (j = 0; j < 5; j ++) {
+ Q.x[j] ^= m & (Q.x[j] ^ T.x[j]);
+ Q.y[j] ^= m & (Q.y[j] ^ T.y[j]);
+ Q.z[j] ^= m & (Q.z[j] ^ F256_R[j]);
+ }
+ CCOPY(bnz & ~qz, &Q, &U, sizeof Q);
+ qz &= ~bnz;
+ bk <<= 4;
+ }
+ }
+ *R = Q;
+}
+
+/*
+ * Convert a window from Jacobian to affine coordinates. A single
+ * field inversion is used. This function works for windows up to
+ * 32 elements.
+ *
+ * The destination array (aff[]) and the source array (jac[]) may
+ * overlap, provided that the start of aff[] is not after the start of
+ * jac[]. 
Even if the arrays do _not_ overlap, the source array is
+ * modified.
+ */
+static void
+window_to_affine(p256_affine *aff, p256_jacobian *jac, int num)
+{
+ /*
+ * Convert the window points to affine coordinates. We use the
+ * following trick to mutualize the inversion computation: if
+ * we have z1, z2, z3, and z4, and want to invert all of them,
+ * we compute u = 1/(z1*z2*z3*z4), and then we have:
+ * 1/z1 = u*z2*z3*z4
+ * 1/z2 = u*z1*z3*z4
+ * 1/z3 = u*z1*z2*z4
+ * 1/z4 = u*z1*z2*z3
+ *
+ * The partial products are computed recursively:
+ *
+ * - on input (z_1,z_2), return (z_2,z_1) and z_1*z_2
+ * - on input (z_1,z_2,... z_n):
+ * recurse on (z_1,z_2,... z_(n/2)) -> r1 and m1
+ * recurse on (z_(n/2+1),z_(n/2+2)... z_n) -> r2 and m2
+ * multiply elements of r1 by m2 -> s1
+ * multiply elements of r2 by m1 -> s2
+ * return r1||r2 and m1*m2
+ *
+ * In the example below, we suppose that we have 14 elements.
+ * Let z1, z2,... zE be the 14 values to invert (index noted in
+ * hexadecimal, starting at 1).
+ *
+ * - Depth 1:
+ * swap(z1, z2); z12 = z1*z2
+ * swap(z3, z4); z34 = z3*z4
+ * swap(z5, z6); z56 = z5*z6
+ * swap(z7, z8); z78 = z7*z8
+ * swap(z9, zA); z9A = z9*zA
+ * swap(zB, zC); zBC = zB*zC
+ * swap(zD, zE); zDE = zD*zE
+ *
+ * - Depth 2:
+ * z1 <- z1*z34, z2 <- z2*z34, z3 <- z3*z12, z4 <- z4*z12
+ * z1234 = z12*z34
+ * z5 <- z5*z78, z6 <- z6*z78, z7 <- z7*z56, z8 <- z8*z56
+ * z5678 = z56*z78
+ * z9 <- z9*zBC, zA <- zA*zBC, zB <- zB*z9A, zC <- zC*z9A
+ * z9ABC = z9A*zBC
+ *
+ * - Depth 3:
+ * z1 <- z1*z5678, z2 <- z2*z5678, z3 <- z3*z5678, z4 <- z4*z5678
+ * z5 <- z5*z1234, z6 <- z6*z1234, z7 <- z7*z1234, z8 <- z8*z1234
+ * z12345678 = z1234*z5678
+ * z9 <- z9*zDE, zA <- zA*zDE, zB <- zB*zDE, zC <- zC*zDE
+ * zD <- zD*z9ABC, zE <- zE*z9ABC
+ * z9ABCDE = z9ABC*zDE
+ *
+ * - Depth 4:
+ * multiply z1..z8 by z9ABCDE
+ * multiply z9..zE by z12345678
+ * final z = z12345678*z9ABCDE
+ */
+
+ uint64_t z[16][5];
+ int i, k, s;
+#define zt (z[15])
+#define zu (z[14])
+#define zv (z[13])
+
+ /*
+ * First recursion step (pairwise swapping and multiplication).
+ * If there is an odd number of elements, then we "invent" an
+ * extra one with coordinate Z = 1 (in Montgomery representation).
+ */
+ for (i = 0; (i + 1) < num; i += 2) {
+ memcpy(zt, jac[i].z, sizeof zt);
+ memcpy(jac[i].z, jac[i + 1].z, sizeof zt);
+ memcpy(jac[i + 1].z, zt, sizeof zt);
+ f256_montymul(z[i >> 1], jac[i].z, jac[i + 1].z);
+ }
+ if ((num & 1) != 0) {
+ memcpy(z[num >> 1], jac[num - 1].z, sizeof zt);
+ memcpy(jac[num - 1].z, F256_R, sizeof F256_R);
+ }
+
+ /*
+ * Perform further recursion steps. At the entry of each step,
+ * the process has been done for groups of 's' points. The
+ * integer k is the log2 of s.
+ */
+ for (k = 1, s = 2; s < num; k ++, s <<= 1) {
+ int n;
+
+ for (i = 0; i < num; i ++) {
+ f256_montymul(jac[i].z, jac[i].z, z[(i >> k) ^ 1]);
+ }
+ n = (num + s - 1) >> k;
+ for (i = 0; i < (n >> 1); i ++) {
+ f256_montymul(z[i], z[i << 1], z[(i << 1) + 1]);
+ }
+ if ((n & 1) != 0) {
+ memmove(z[n >> 1], z[n - 1], sizeof zt);
+ }
+ }
+
+ /*
+ * Invert the final result, and convert all points.
+ */
+ f256_invert(zt, z[0]);
+ for (i = 0; i < num; i ++) {
+ f256_montymul(zv, jac[i].z, zt);
+ f256_montysquare(zu, zv);
+ f256_montymul(zv, zv, zu);
+ f256_montymul(aff[i].x, jac[i].x, zu);
+ f256_montymul(aff[i].y, jac[i].y, zv);
+ }
+}
+
+/*
+ * Multiply the provided point by an integer.
+ * Assumptions:
+ * - Source point is a valid curve point.
+ * - Source point is not the point-at-infinity.
+ * - Integer is not 0, and is lower than the curve order. + * If these conditions are not met, then the result is indeterminate + * (but the process is still constant-time). + */ +static void +p256_mul(p256_jacobian *P, const unsigned char *k, size_t klen) +{ + union { + p256_affine aff[15]; + p256_jacobian jac[15]; + } window; + int i; + + /* + * Compute window, in Jacobian coordinates. + */ + window.jac[0] = *P; + for (i = 2; i < 16; i ++) { + window.jac[i - 1] = window.jac[(i >> 1) - 1]; + if ((i & 1) == 0) { + p256_double(&window.jac[i - 1]); + } else { + p256_add(&window.jac[i - 1], &window.jac[i >> 1]); + } + } + + /* + * Convert the window points to affine coordinates. Point + * window[0] is the source point, already in affine coordinates. + */ + window_to_affine(window.aff, window.jac, 15); + + /* + * Perform point multiplication. + */ + point_mul_inner(P, window.aff, k, klen); +} + +/* + * Precomputed window for the conventional generator: P256_Gwin[n] + * contains (n+1)*G (affine coordinates, in Montgomery representation). + */ +static const p256_affine P256_Gwin[] = { + { + { 0x30D418A9143C1, 0xC4FEDB60179E7, 0x62251075BA95F, + 0x5C669FB732B77, 0x08905F76B5375 }, + { 0x5357CE95560A8, 0x43A19E45CDDF2, 0x21F3258B4AB8E, + 0xD8552E88688DD, 0x0571FF18A5885 } + }, + { + { 0x46D410DDD64DF, 0x0B433827D8500, 0x1490D9AA6AE3C, + 0xA3A832205038D, 0x06BB32E52DCF3 }, + { 0x48D361BEE1A57, 0xB7B236FF82F36, 0x042DBE152CD7C, + 0xA3AA9A8FB0E92, 0x08C577517A5B8 } + }, + { + { 0x3F904EEBC1272, 0x9E87D81FBFFAC, 0xCBBC98B027F84, + 0x47E46AD77DD87, 0x06936A3FD6FF7 }, + { 0x5C1FC983A7EBD, 0xC3861FE1AB04C, 0x2EE98E583E47A, + 0xC06A88208311A, 0x05F06A2AB587C } + }, + { + { 0xB50D46918DCC5, 0xD7623C17374B0, 0x100AF24650A6E, + 0x76ABCDAACACE8, 0x077362F591B01 }, + { 0xF24CE4CBABA68, 0x17AD6F4472D96, 0xDDD22E1762847, + 0x862EB6C36DEE5, 0x04B14C39CC5AB } + }, + { + { 0x8AAEC45C61F5C, 0x9D4B9537DBE1B, 0x76C20C90EC649, + 0x3C7D41CB5AAD0, 0x0907960649052 }, + { 0x9B4AE7BA4F107, 0xF75EB882BEB30, 0x7A1F6873C568E, + 0x915C540A9877E, 0x03A076BB9DD1E } + }, + { + { 0x47373E77664A1, 0xF246CEE3E4039, 0x17A3AD55AE744, + 0x673C50A961A5B, 0x03074B5964213 }, + { 0x6220D377E44BA, 0x30DFF14B593D3, 0x639F11299C2B5, + 0x75F5424D44CEF, 0x04C9916DEA07F } + }, + { + { 0x354EA0173B4F1, 0x3C23C00F70746, 0x23BB082BD2021, + 0xE03E43EAAB50C, 0x03BA5119D3123 }, + { 0xD0303F5B9D4DE, 0x17DA67BDD2847, 0xC941956742F2F, + 0x8670F933BDC77, 0x0AEDD9164E240 } + }, + { + { 0x4CD19499A78FB, 0x4BF9B345527F1, 0x2CFC6B462AB5C, + 0x30CDF90F02AF0, 0x0763891F62652 }, + { 0xA3A9532D49775, 0xD7F9EBA15F59D, 0x60BBF021E3327, + 0xF75C23C7B84BE, 0x06EC12F2C706D } + }, + { + { 0x6E8F264E20E8E, 0xC79A7A84175C9, 0xC8EB00ABE6BFE, + 0x16A4CC09C0444, 0x005B3081D0C4E }, + { 0x777AA45F33140, 0xDCE5D45E31EB7, 0xB12F1A56AF7BE, + 0xF9B2B6E019A88, 0x086659CDFD835 } + }, + { + { 0xDBD19DC21EC8C, 0x94FCF81392C18, 0x250B4998F9868, + 0x28EB37D2CD648, 0x0C61C947E4B34 }, + { 0x407880DD9E767, 0x0C83FBE080C2B, 0x9BE5D2C43A899, + 0xAB4EF7D2D6577, 0x08719A555B3B4 } + }, + { + { 0x260A6245E4043, 0x53E7FDFE0EA7D, 0xAC1AB59DE4079, + 0x072EFF3A4158D, 0x0E7090F1949C9 }, + { 0x85612B944E886, 0xE857F61C81A76, 0xAD643D250F939, + 0x88DAC0DAA891E, 0x089300244125B } + }, + { + { 0x1AA7D26977684, 0x58A345A3304B7, 0x37385EABDEDEF, + 0x155E409D29DEE, 0x0EE1DF780B83E }, + { 0x12D91CBB5B437, 0x65A8956370CAC, 0xDE6D66170ED2F, + 0xAC9B8228CFA8A, 0x0FF57C95C3238 } + }, + { + { 0x25634B2ED7097, 0x9156FD30DCCC4, 0x9E98110E35676, + 0x7594CBCD43F55, 0x038477ACC395B }, + { 0x2B90C00EE17FF, 
0xF842ED2E33575, 0x1F5BC16874838,
+ 0x7968CD06422BD, 0x0BC0876AB9E7B }
+ },
+ {
+ { 0xA35BB0CF664AF, 0x68F9707E3A242, 0x832660126E48F,
+ 0x72D2717BF54C6, 0x0AAE7333ED12C },
+ { 0x2DB7995D586B1, 0xE732237C227B5, 0x65E7DBBE29569,
+ 0xBBBD8E4193E2A, 0x052706DC3EAA1 }
+ },
+ {
+ { 0xD8B7BC60055BE, 0xD76E27E4B72BC, 0x81937003CC23E,
+ 0xA090E337424E4, 0x02AA0E43EAD3D },
+ { 0x524F6383C45D2, 0x422A41B2540B8, 0x8A4797D766355,
+ 0xDF444EFA6DE77, 0x0042170A9079A }
+ },
+};
+
+/*
+ * Multiply the conventional generator of the curve by the provided
+ * integer. Return is written in *P.
+ *
+ * Assumptions:
+ * - Integer is not 0, and is lower than the curve order.
+ * If this condition is not met, then the result is indeterminate
+ * (but the process is still constant-time).
+ */
+static void
+p256_mulgen(p256_jacobian *P, const unsigned char *k, size_t klen)
+{
+ point_mul_inner(P, P256_Gwin, k, klen);
+}
+
+/*
+ * Return 1 if all of the following hold:
+ * - klen <= 32
+ * - k != 0
+ * - k is lower than the curve order
+ * Otherwise, return 0.
+ *
+ * Constant-time behaviour: only klen may be observable.
+ */
+static uint32_t
+check_scalar(const unsigned char *k, size_t klen)
+{
+ uint32_t z;
+ int32_t c;
+ size_t u;
+
+ if (klen > 32) {
+ return 0;
+ }
+ z = 0;
+ for (u = 0; u < klen; u ++) {
+ z |= k[u];
+ }
+ if (klen == 32) {
+ c = 0;
+ for (u = 0; u < klen; u ++) {
+ c |= -(int32_t)EQ0(c) & CMP(k[u], P256_N[u]);
+ }
+ } else {
+ c = -1;
+ }
+ return NEQ(z, 0) & LT0(c);
+}
+
+static uint32_t
+api_mul(unsigned char *G, size_t Glen,
+ const unsigned char *k, size_t klen, int curve)
+{
+ uint32_t r;
+ p256_jacobian P;
+
+ (void)curve;
+ if (Glen != 65) {
+ return 0;
+ }
+ r = check_scalar(k, klen);
+ r &= point_decode(&P, G);
+ p256_mul(&P, k, klen);
+ r &= point_encode(G, &P);
+ return r;
+}
+
+static size_t
+api_mulgen(unsigned char *R,
+ const unsigned char *k, size_t klen, int curve)
+{
+ p256_jacobian P;
+
+ (void)curve;
+ p256_mulgen(&P, k, klen);
+ point_encode(R, &P);
+ return 65;
+}
+
+static uint32_t
+api_muladd(unsigned char *A, const unsigned char *B, size_t len,
+ const unsigned char *x, size_t xlen,
+ const unsigned char *y, size_t ylen, int curve)
+{
+ /*
+ * We might want to use Shamir's trick here: make a composite
+ * window of u*P+v*Q points, to merge the two doubling-ladders
+ * into one. This, however, has some complications:
+ *
+ * - During the computation, we may hit the point-at-infinity.
+ * Thus, we would need p256_add_complete_mixed() (complete
+ * formulas for point addition), with a higher cost (17 muls
+ * instead of 11).
+ *
+ * - A 4-bit window would be too large, since it would involve
+ * 16*16-1 = 255 points. For the same window size as in the
+ * p256_mul() case, we would need to reduce the window size
+ * to 2 bits, and thus perform twice as many non-doubling
+ * point additions.
+ *
+ * - The window may itself contain the point-at-infinity, and
+ * thus cannot, in all generality, be made of affine points.
+ * Instead, we would need to make it a window of points in
+ * Jacobian coordinates. Even p256_add_complete_mixed() would
+ * be inappropriate.
+ *
+ * For these reasons, the code below performs two separate
+ * point multiplications, then computes the final point addition
+ * (which is both a "normal" addition, and a doubling, to handle
+ * all cases).
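+ *
+ * This is the combination needed for ECDSA signature verification
+ * (u1*G + u2*Q); a NULL B selects the conventional generator.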
+ */ + + p256_jacobian P, Q; + uint32_t r, t, s; + uint64_t z; + + (void)curve; + if (len != 65) { + return 0; + } + r = point_decode(&P, A); + p256_mul(&P, x, xlen); + if (B == NULL) { + p256_mulgen(&Q, y, ylen); + } else { + r &= point_decode(&Q, B); + p256_mul(&Q, y, ylen); + } + + /* + * The final addition may fail in case both points are equal. + */ + t = p256_add(&P, &Q); + f256_final_reduce(P.z); + z = P.z[0] | P.z[1] | P.z[2] | P.z[3] | P.z[4]; + s = EQ((uint32_t)(z | (z >> 32)), 0); + p256_double(&Q); + + /* + * If s is 1 then either P+Q = 0 (t = 1) or P = Q (t = 0). So we + * have the following: + * + * s = 0, t = 0 return P (normal addition) + * s = 0, t = 1 return P (normal addition) + * s = 1, t = 0 return Q (a 'double' case) + * s = 1, t = 1 report an error (P+Q = 0) + */ + CCOPY(s & ~t, &P, &Q, sizeof Q); + point_encode(A, &P); + r &= ~(s & t); + return r; +} + +/* see bearssl_ec.h */ +const br_ec_impl br_ec_p256_m62 = { + (uint32_t)0x00800000, + &api_generator, + &api_order, + &api_xoff, + &api_mul, + &api_mulgen, + &api_muladd +}; + +/* see bearssl_ec.h */ +const br_ec_impl * +br_ec_p256_m62_get(void) +{ + return &br_ec_p256_m62; +} + +#else + +/* see bearssl_ec.h */ +const br_ec_impl * +br_ec_p256_m62_get(void) +{ + return 0; +} + +#endif diff --git a/src/bearssl/src/ec/ec_p256_m64.c b/src/bearssl/src/ec/ec_p256_m64.c new file mode 100644 index 0000000..5a7ea17 --- /dev/null +++ b/src/bearssl/src/ec/ec_p256_m64.c @@ -0,0 +1,1730 @@ +/* + * Copyright (c) 2018 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+#include "inner.h"
+
+#if BR_INT128 || BR_UMUL128
+
+#if BR_UMUL128
+#include <intrin.h>
+#endif
+
+static const unsigned char P256_G[] = {
+ 0x04, 0x6B, 0x17, 0xD1, 0xF2, 0xE1, 0x2C, 0x42, 0x47, 0xF8,
+ 0xBC, 0xE6, 0xE5, 0x63, 0xA4, 0x40, 0xF2, 0x77, 0x03, 0x7D,
+ 0x81, 0x2D, 0xEB, 0x33, 0xA0, 0xF4, 0xA1, 0x39, 0x45, 0xD8,
+ 0x98, 0xC2, 0x96, 0x4F, 0xE3, 0x42, 0xE2, 0xFE, 0x1A, 0x7F,
+ 0x9B, 0x8E, 0xE7, 0xEB, 0x4A, 0x7C, 0x0F, 0x9E, 0x16, 0x2B,
+ 0xCE, 0x33, 0x57, 0x6B, 0x31, 0x5E, 0xCE, 0xCB, 0xB6, 0x40,
+ 0x68, 0x37, 0xBF, 0x51, 0xF5
+};
+
+static const unsigned char P256_N[] = {
+ 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xBC, 0xE6, 0xFA, 0xAD,
+ 0xA7, 0x17, 0x9E, 0x84, 0xF3, 0xB9, 0xCA, 0xC2, 0xFC, 0x63,
+ 0x25, 0x51
+};
+
+static const unsigned char *
+api_generator(int curve, size_t *len)
+{
+ (void)curve;
+ *len = sizeof P256_G;
+ return P256_G;
+}
+
+static const unsigned char *
+api_order(int curve, size_t *len)
+{
+ (void)curve;
+ *len = sizeof P256_N;
+ return P256_N;
+}
+
+static size_t
+api_xoff(int curve, size_t *len)
+{
+ (void)curve;
+ *len = 32;
+ return 1;
+}
+
+/*
+ * A field element is encoded as four 64-bit integers, in basis 2^64.
+ * Values may reach up to 2^256-1. Montgomery multiplication is used.
+ */
+
+/* R = 2^256 mod p */
+static const uint64_t F256_R[] = {
+ 0x0000000000000001, 0xFFFFFFFF00000000,
+ 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFE
+};
+
+/* Curve equation is y^2 = x^3 - 3*x + B. This constant is B*R mod p
+ (Montgomery representation of B). */
+static const uint64_t P256_B_MONTY[] = {
+ 0xD89CDF6229C4BDDF, 0xACF005CD78843090,
+ 0xE5A220ABF7212ED6, 0xDC30061D04874834
+};
+
+/*
+ * Addition in the field.
+ */
+static inline void
+f256_add(uint64_t *d, const uint64_t *a, const uint64_t *b)
+{
+#if BR_INT128
+ unsigned __int128 w;
+ uint64_t t;
+
+ w = (unsigned __int128)a[0] + b[0];
+ d[0] = (uint64_t)w;
+ w = (unsigned __int128)a[1] + b[1] + (w >> 64);
+ d[1] = (uint64_t)w;
+ w = (unsigned __int128)a[2] + b[2] + (w >> 64);
+ d[2] = (uint64_t)w;
+ w = (unsigned __int128)a[3] + b[3] + (w >> 64);
+ d[3] = (uint64_t)w;
+ t = (uint64_t)(w >> 64);
+
+ /*
+ * 2^256 = 2^224 - 2^192 - 2^96 + 1 in the field.
+ */
+ w = (unsigned __int128)d[0] + t;
+ d[0] = (uint64_t)w;
+ w = (unsigned __int128)d[1] + (w >> 64) - (t << 32);
+ d[1] = (uint64_t)w;
+ /* Here, carry "w >> 64" can only be 0 or -1 */
+ w = (unsigned __int128)d[2] - ((w >> 64) & 1);
+ d[2] = (uint64_t)w;
+ /* Again, carry is 0 or -1 */
+ d[3] += (uint64_t)(w >> 64) + (t << 32) - t;
+
+#elif BR_UMUL128
+
+ unsigned char cc;
+ uint64_t t;
+
+ cc = _addcarry_u64(0, a[0], b[0], &d[0]);
+ cc = _addcarry_u64(cc, a[1], b[1], &d[1]);
+ cc = _addcarry_u64(cc, a[2], b[2], &d[2]);
+ cc = _addcarry_u64(cc, a[3], b[3], &d[3]);
+
+ /*
+ * If there is a carry, then we want to subtract p, which we
+ * do by adding 2^256 - p.
+ */
+ t = cc;
+ cc = _addcarry_u64(cc, d[0], 0, &d[0]);
+ cc = _addcarry_u64(cc, d[1], -(t << 32), &d[1]);
+ cc = _addcarry_u64(cc, d[2], -t, &d[2]);
+ (void)_addcarry_u64(cc, d[3], (t << 32) - (t << 1), &d[3]);
+
+#endif
+}
+
+/*
+ * Subtraction in the field.
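+ * The subtraction is done limb by limb with borrow propagation;
+ * if the final borrow is set, the result is corrected by adding p
+ * back.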
+ */ +static inline void +f256_sub(uint64_t *d, const uint64_t *a, const uint64_t *b) +{ +#if BR_INT128 + + unsigned __int128 w; + uint64_t t; + + w = (unsigned __int128)a[0] - b[0]; + d[0] = (uint64_t)w; + w = (unsigned __int128)a[1] - b[1] - ((w >> 64) & 1); + d[1] = (uint64_t)w; + w = (unsigned __int128)a[2] - b[2] - ((w >> 64) & 1); + d[2] = (uint64_t)w; + w = (unsigned __int128)a[3] - b[3] - ((w >> 64) & 1); + d[3] = (uint64_t)w; + t = (uint64_t)(w >> 64) & 1; + + /* + * p = 2^256 - 2^224 + 2^192 + 2^96 - 1. + */ + w = (unsigned __int128)d[0] - t; + d[0] = (uint64_t)w; + w = (unsigned __int128)d[1] + (t << 32) - ((w >> 64) & 1); + d[1] = (uint64_t)w; + /* Here, carry "w >> 64" can only be 0 or +1 */ + w = (unsigned __int128)d[2] + (w >> 64); + d[2] = (uint64_t)w; + /* Again, carry is 0 or +1 */ + d[3] += (uint64_t)(w >> 64) - (t << 32) + t; + +#elif BR_UMUL128 + + unsigned char cc; + uint64_t t; + + cc = _subborrow_u64(0, a[0], b[0], &d[0]); + cc = _subborrow_u64(cc, a[1], b[1], &d[1]); + cc = _subborrow_u64(cc, a[2], b[2], &d[2]); + cc = _subborrow_u64(cc, a[3], b[3], &d[3]); + + /* + * If there is a carry, then we need to add p. + */ + t = cc; + cc = _addcarry_u64(0, d[0], -t, &d[0]); + cc = _addcarry_u64(cc, d[1], (-t) >> 32, &d[1]); + cc = _addcarry_u64(cc, d[2], 0, &d[2]); + (void)_addcarry_u64(cc, d[3], t - (t << 32), &d[3]); + +#endif +} + +/* + * Montgomery multiplication in the field. + */ +static void +f256_montymul(uint64_t *d, const uint64_t *a, const uint64_t *b) +{ +#if BR_INT128 + + uint64_t x, f, t0, t1, t2, t3, t4; + unsigned __int128 z, ff; + int i; + + /* + * When computing d <- d + a[u]*b, we also add f*p such + * that d + a[u]*b + f*p is a multiple of 2^64. Since + * p = -1 mod 2^64, we can compute f = d[0] + a[u]*b[0] mod 2^64. + */ + + /* + * Step 1: t <- (a[0]*b + f*p) / 2^64 + * We have f = a[0]*b[0] mod 2^64. Since p = -1 mod 2^64, this + * ensures that (a[0]*b + f*p) is a multiple of 2^64. + * + * We also have: f*p = f*2^256 - f*2^224 + f*2^192 + f*2^96 - f. 
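+ * In 64-bit limbs: -f cancels the low limb, f*2^96 adds (f << 32)
+ * to limb 1 and (f >> 32) to limb 2, f*2^192 adds f to limb 3, and
+ * f*2^256 - f*2^224 is the 128-bit value ff below, spread over
+ * limbs 3 and 4.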
+ */ + x = a[0]; + z = (unsigned __int128)b[0] * x; + f = (uint64_t)z; + z = (unsigned __int128)b[1] * x + (z >> 64) + (uint64_t)(f << 32); + t0 = (uint64_t)z; + z = (unsigned __int128)b[2] * x + (z >> 64) + (uint64_t)(f >> 32); + t1 = (uint64_t)z; + z = (unsigned __int128)b[3] * x + (z >> 64) + f; + t2 = (uint64_t)z; + t3 = (uint64_t)(z >> 64); + ff = ((unsigned __int128)f << 64) - ((unsigned __int128)f << 32); + z = (unsigned __int128)t2 + (uint64_t)ff; + t2 = (uint64_t)z; + z = (unsigned __int128)t3 + (z >> 64) + (ff >> 64); + t3 = (uint64_t)z; + t4 = (uint64_t)(z >> 64); + + /* + * Steps 2 to 4: t <- (t + a[i]*b + f*p) / 2^64 + */ + for (i = 1; i < 4; i ++) { + x = a[i]; + + /* t <- (t + x*b - f) / 2^64 */ + z = (unsigned __int128)b[0] * x + t0; + f = (uint64_t)z; + z = (unsigned __int128)b[1] * x + t1 + (z >> 64); + t0 = (uint64_t)z; + z = (unsigned __int128)b[2] * x + t2 + (z >> 64); + t1 = (uint64_t)z; + z = (unsigned __int128)b[3] * x + t3 + (z >> 64); + t2 = (uint64_t)z; + z = t4 + (z >> 64); + t3 = (uint64_t)z; + t4 = (uint64_t)(z >> 64); + + /* t <- t + f*2^32, carry in the upper half of z */ + z = (unsigned __int128)t0 + (uint64_t)(f << 32); + t0 = (uint64_t)z; + z = (z >> 64) + (unsigned __int128)t1 + (uint64_t)(f >> 32); + t1 = (uint64_t)z; + + /* t <- t + f*2^192 - f*2^160 + f*2^128 */ + ff = ((unsigned __int128)f << 64) + - ((unsigned __int128)f << 32) + f; + z = (z >> 64) + (unsigned __int128)t2 + (uint64_t)ff; + t2 = (uint64_t)z; + z = (unsigned __int128)t3 + (z >> 64) + (ff >> 64); + t3 = (uint64_t)z; + t4 += (uint64_t)(z >> 64); + } + + /* + * At that point, we have computed t = (a*b + F*p) / 2^256, where + * F is a 256-bit integer whose limbs are the "f" coefficients + * in the steps above. We have: + * a <= 2^256-1 + * b <= 2^256-1 + * F <= 2^256-1 + * Hence: + * a*b + F*p <= (2^256-1)*(2^256-1) + p*(2^256-1) + * a*b + F*p <= 2^256*(2^256 - 2 + p) + 1 - p + * Therefore: + * t < 2^256 + p - 2 + * Since p < 2^256, it follows that: + * t4 can be only 0 or 1 + * t - p < 2^256 + * We can therefore subtract p from t, conditionally on t4, to + * get a nonnegative result that fits on 256 bits. + */ + z = (unsigned __int128)t0 + t4; + t0 = (uint64_t)z; + z = (unsigned __int128)t1 - (t4 << 32) + (z >> 64); + t1 = (uint64_t)z; + z = (unsigned __int128)t2 - (z >> 127); + t2 = (uint64_t)z; + t3 = t3 - (uint64_t)(z >> 127) - t4 + (t4 << 32); + + d[0] = t0; + d[1] = t1; + d[2] = t2; + d[3] = t3; + +#elif BR_UMUL128 + + uint64_t x, f, t0, t1, t2, t3, t4; + uint64_t zl, zh, ffl, ffh; + unsigned char k, m; + int i; + + /* + * When computing d <- d + a[u]*b, we also add f*p such + * that d + a[u]*b + f*p is a multiple of 2^64. Since + * p = -1 mod 2^64, we can compute f = d[0] + a[u]*b[0] mod 2^64. + */ + + /* + * Step 1: t <- (a[0]*b + f*p) / 2^64 + * We have f = a[0]*b[0] mod 2^64. Since p = -1 mod 2^64, this + * ensures that (a[0]*b + f*p) is a multiple of 2^64. + * + * We also have: f*p = f*2^256 - f*2^224 + f*2^192 + f*2^96 - f. 
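+ * This follows the same schedule as the BR_INT128 path above,
+ * keeping each 128-bit product as a (zl, zh) pair from _umul128()
+ * and propagating carries with _addcarry_u64()/_subborrow_u64().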
+ */ + x = a[0]; + + zl = _umul128(b[0], x, &zh); + f = zl; + t0 = zh; + + zl = _umul128(b[1], x, &zh); + k = _addcarry_u64(0, zl, t0, &zl); + (void)_addcarry_u64(k, zh, 0, &zh); + k = _addcarry_u64(0, zl, f << 32, &zl); + (void)_addcarry_u64(k, zh, 0, &zh); + t0 = zl; + t1 = zh; + + zl = _umul128(b[2], x, &zh); + k = _addcarry_u64(0, zl, t1, &zl); + (void)_addcarry_u64(k, zh, 0, &zh); + k = _addcarry_u64(0, zl, f >> 32, &zl); + (void)_addcarry_u64(k, zh, 0, &zh); + t1 = zl; + t2 = zh; + + zl = _umul128(b[3], x, &zh); + k = _addcarry_u64(0, zl, t2, &zl); + (void)_addcarry_u64(k, zh, 0, &zh); + k = _addcarry_u64(0, zl, f, &zl); + (void)_addcarry_u64(k, zh, 0, &zh); + t2 = zl; + t3 = zh; + + t4 = _addcarry_u64(0, t3, f, &t3); + k = _subborrow_u64(0, t2, f << 32, &t2); + k = _subborrow_u64(k, t3, f >> 32, &t3); + (void)_subborrow_u64(k, t4, 0, &t4); + + /* + * Steps 2 to 4: t <- (t + a[i]*b + f*p) / 2^64 + */ + for (i = 1; i < 4; i ++) { + x = a[i]; + /* f = t0 + x * b[0]; -- computed below */ + + /* t <- (t + x*b - f) / 2^64 */ + zl = _umul128(b[0], x, &zh); + k = _addcarry_u64(0, zl, t0, &f); + (void)_addcarry_u64(k, zh, 0, &t0); + + zl = _umul128(b[1], x, &zh); + k = _addcarry_u64(0, zl, t0, &zl); + (void)_addcarry_u64(k, zh, 0, &zh); + k = _addcarry_u64(0, zl, t1, &t0); + (void)_addcarry_u64(k, zh, 0, &t1); + + zl = _umul128(b[2], x, &zh); + k = _addcarry_u64(0, zl, t1, &zl); + (void)_addcarry_u64(k, zh, 0, &zh); + k = _addcarry_u64(0, zl, t2, &t1); + (void)_addcarry_u64(k, zh, 0, &t2); + + zl = _umul128(b[3], x, &zh); + k = _addcarry_u64(0, zl, t2, &zl); + (void)_addcarry_u64(k, zh, 0, &zh); + k = _addcarry_u64(0, zl, t3, &t2); + (void)_addcarry_u64(k, zh, 0, &t3); + + t4 = _addcarry_u64(0, t3, t4, &t3); + + /* t <- t + f*2^32, carry in k */ + k = _addcarry_u64(0, t0, f << 32, &t0); + k = _addcarry_u64(k, t1, f >> 32, &t1); + + /* t <- t + f*2^192 - f*2^160 + f*2^128 */ + m = _subborrow_u64(0, f, f << 32, &ffl); + (void)_subborrow_u64(m, f, f >> 32, &ffh); + k = _addcarry_u64(k, t2, ffl, &t2); + k = _addcarry_u64(k, t3, ffh, &t3); + (void)_addcarry_u64(k, t4, 0, &t4); + } + + /* + * At that point, we have computed t = (a*b + F*p) / 2^256, where + * F is a 256-bit integer whose limbs are the "f" coefficients + * in the steps above. We have: + * a <= 2^256-1 + * b <= 2^256-1 + * F <= 2^256-1 + * Hence: + * a*b + F*p <= (2^256-1)*(2^256-1) + p*(2^256-1) + * a*b + F*p <= 2^256*(2^256 - 2 + p) + 1 - p + * Therefore: + * t < 2^256 + p - 2 + * Since p < 2^256, it follows that: + * t4 can be only 0 or 1 + * t - p < 2^256 + * We can therefore subtract p from t, conditionally on t4, to + * get a nonnegative result that fits on 256 bits. + */ + k = _addcarry_u64(0, t0, t4, &t0); + k = _addcarry_u64(k, t1, -(t4 << 32), &t1); + k = _addcarry_u64(k, t2, -t4, &t2); + (void)_addcarry_u64(k, t3, (t4 << 32) - (t4 << 1), &t3); + + d[0] = t0; + d[1] = t1; + d[2] = t2; + d[3] = t3; + +#endif +} + +/* + * Montgomery squaring in the field; currently a basic wrapper around + * multiplication (inline, should be optimized away). + * TODO: see if some extra speed can be gained here. + */ +static inline void +f256_montysquare(uint64_t *d, const uint64_t *a) +{ + f256_montymul(d, a, a); +} + +/* + * Convert to Montgomery representation. + */ +static void +f256_tomonty(uint64_t *d, const uint64_t *a) +{ + /* + * R2 = 2^512 mod p. + * If R = 2^256 mod p, then R2 = R^2 mod p; and the Montgomery + * multiplication of a by R2 is: a*R2/R = a*R mod p, i.e. the + * conversion to Montgomery representation. 
+ */ + static const uint64_t R2[] = { + 0x0000000000000003, + 0xFFFFFFFBFFFFFFFF, + 0xFFFFFFFFFFFFFFFE, + 0x00000004FFFFFFFD + }; + + f256_montymul(d, a, R2); +} + +/* + * Convert from Montgomery representation. + */ +static void +f256_frommonty(uint64_t *d, const uint64_t *a) +{ + /* + * Montgomery multiplication by 1 is division by 2^256 modulo p. + */ + static const uint64_t one[] = { 1, 0, 0, 0 }; + + f256_montymul(d, a, one); +} + +/* + * Inversion in the field. If the source value is 0 modulo p, then this + * returns 0 or p. This function uses Montgomery representation. + */ +static void +f256_invert(uint64_t *d, const uint64_t *a) +{ + /* + * We compute a^(p-2) mod p. The exponent pattern (from high to + * low) is: + * - 32 bits of value 1 + * - 31 bits of value 0 + * - 1 bit of value 1 + * - 96 bits of value 0 + * - 94 bits of value 1 + * - 1 bit of value 0 + * - 1 bit of value 1 + * To speed up the square-and-multiply algorithm, we precompute + * a^(2^31-1). + */ + + uint64_t r[4], t[4]; + int i; + + memcpy(t, a, sizeof t); + for (i = 0; i < 30; i ++) { + f256_montysquare(t, t); + f256_montymul(t, t, a); + } + + memcpy(r, t, sizeof t); + for (i = 224; i >= 0; i --) { + f256_montysquare(r, r); + switch (i) { + case 0: + case 2: + case 192: + case 224: + f256_montymul(r, r, a); + break; + case 3: + case 34: + case 65: + f256_montymul(r, r, t); + break; + } + } + memcpy(d, r, sizeof r); +} + +/* + * Finalize reduction. + * Input value fits on 256 bits. This function subtracts p if and only + * if the input is greater than or equal to p. + */ +static inline void +f256_final_reduce(uint64_t *a) +{ +#if BR_INT128 + + uint64_t t0, t1, t2, t3, cc; + unsigned __int128 z; + + /* + * We add 2^224 - 2^192 - 2^96 + 1 to a. If there is no carry, + * then a < p; otherwise, the addition result we computed is + * the value we must return. + */ + z = (unsigned __int128)a[0] + 1; + t0 = (uint64_t)z; + z = (unsigned __int128)a[1] + (z >> 64) - ((uint64_t)1 << 32); + t1 = (uint64_t)z; + z = (unsigned __int128)a[2] - (z >> 127); + t2 = (uint64_t)z; + z = (unsigned __int128)a[3] - (z >> 127) + 0xFFFFFFFF; + t3 = (uint64_t)z; + cc = -(uint64_t)(z >> 64); + + a[0] ^= cc & (a[0] ^ t0); + a[1] ^= cc & (a[1] ^ t1); + a[2] ^= cc & (a[2] ^ t2); + a[3] ^= cc & (a[3] ^ t3); + +#elif BR_UMUL128 + + uint64_t t0, t1, t2, t3, m; + unsigned char k; + + k = _addcarry_u64(0, a[0], (uint64_t)1, &t0); + k = _addcarry_u64(k, a[1], -((uint64_t)1 << 32), &t1); + k = _addcarry_u64(k, a[2], -(uint64_t)1, &t2); + k = _addcarry_u64(k, a[3], ((uint64_t)1 << 32) - 2, &t3); + m = -(uint64_t)k; + + a[0] ^= m & (a[0] ^ t0); + a[1] ^= m & (a[1] ^ t1); + a[2] ^= m & (a[2] ^ t2); + a[3] ^= m & (a[3] ^ t3); + +#endif +} + +/* + * Points in affine and Jacobian coordinates. + * + * - In affine coordinates, the point-at-infinity cannot be encoded. + * - Jacobian coordinates (X,Y,Z) correspond to affine (X/Z^2,Y/Z^3); + * if Z = 0 then this is the point-at-infinity. + */ +typedef struct { + uint64_t x[4]; + uint64_t y[4]; +} p256_affine; + +typedef struct { + uint64_t x[4]; + uint64_t y[4]; + uint64_t z[4]; +} p256_jacobian; + +/* + * Decode a point. The returned point is in Jacobian coordinates, but + * with z = 1. If the encoding is invalid, or encodes a point which is + * not on the curve, or encodes the point at infinity, then this function + * returns 0. Otherwise, 1 is returned. + * + * The buffer is assumed to have length exactly 65 bytes. 
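+ * The expected layout is the uncompressed point format: one byte
+ * of value 0x04, then the X and Y coordinates as 32-byte unsigned
+ * big-endian integers.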
+ */ +static uint32_t +point_decode(p256_jacobian *P, const unsigned char *buf) +{ + uint64_t x[4], y[4], t[4], x3[4], tt; + uint32_t r; + + /* + * Header byte shall be 0x04. + */ + r = EQ(buf[0], 0x04); + + /* + * Decode X and Y coordinates, and convert them into + * Montgomery representation. + */ + x[3] = br_dec64be(buf + 1); + x[2] = br_dec64be(buf + 9); + x[1] = br_dec64be(buf + 17); + x[0] = br_dec64be(buf + 25); + y[3] = br_dec64be(buf + 33); + y[2] = br_dec64be(buf + 41); + y[1] = br_dec64be(buf + 49); + y[0] = br_dec64be(buf + 57); + f256_tomonty(x, x); + f256_tomonty(y, y); + + /* + * Verify y^2 = x^3 + A*x + B. In curve P-256, A = -3. + * Note that the Montgomery representation of 0 is 0. We must + * take care to apply the final reduction to make sure we have + * 0 and not p. + */ + f256_montysquare(t, y); + f256_montysquare(x3, x); + f256_montymul(x3, x3, x); + f256_sub(t, t, x3); + f256_add(t, t, x); + f256_add(t, t, x); + f256_add(t, t, x); + f256_sub(t, t, P256_B_MONTY); + f256_final_reduce(t); + tt = t[0] | t[1] | t[2] | t[3]; + r &= EQ((uint32_t)(tt | (tt >> 32)), 0); + + /* + * Return the point in Jacobian coordinates (and Montgomery + * representation). + */ + memcpy(P->x, x, sizeof x); + memcpy(P->y, y, sizeof y); + memcpy(P->z, F256_R, sizeof F256_R); + return r; +} + +/* + * Final conversion for a point: + * - The point is converted back to affine coordinates. + * - Final reduction is performed. + * - The point is encoded into the provided buffer. + * + * If the point is the point-at-infinity, all operations are performed, + * but the buffer contents are indeterminate, and 0 is returned. Otherwise, + * the encoded point is written in the buffer, and 1 is returned. + */ +static uint32_t +point_encode(unsigned char *buf, const p256_jacobian *P) +{ + uint64_t t1[4], t2[4], z; + + /* Set t1 = 1/z^2 and t2 = 1/z^3. */ + f256_invert(t2, P->z); + f256_montysquare(t1, t2); + f256_montymul(t2, t2, t1); + + /* Compute affine coordinates x (in t1) and y (in t2). */ + f256_montymul(t1, P->x, t1); + f256_montymul(t2, P->y, t2); + + /* Convert back from Montgomery representation, and finalize + reductions. */ + f256_frommonty(t1, t1); + f256_frommonty(t2, t2); + f256_final_reduce(t1); + f256_final_reduce(t2); + + /* Encode. */ + buf[0] = 0x04; + br_enc64be(buf + 1, t1[3]); + br_enc64be(buf + 9, t1[2]); + br_enc64be(buf + 17, t1[1]); + br_enc64be(buf + 25, t1[0]); + br_enc64be(buf + 33, t2[3]); + br_enc64be(buf + 41, t2[2]); + br_enc64be(buf + 49, t2[1]); + br_enc64be(buf + 57, t2[0]); + + /* Return success if and only if P->z != 0. */ + z = P->z[0] | P->z[1] | P->z[2] | P->z[3]; + return NEQ((uint32_t)(z | z >> 32), 0); +} + +/* + * Point doubling in Jacobian coordinates: point P is doubled. + * Note: if the source point is the point-at-infinity, then the result is + * still the point-at-infinity, which is correct. Moreover, if the three + * coordinates were zero, then they still are zero in the returned value. + * + * (Note: this is true even without the final reduction: if the three + * coordinates are encoded as four words of value zero each, then the + * result will also have all-zero coordinate encodings, not the alternate + * encoding as the integer p.) + */ +static void +p256_double(p256_jacobian *P) +{ + /* + * Doubling formulas are: + * + * s = 4*x*y^2 + * m = 3*(x + z^2)*(x - z^2) + * x' = m^2 - 2*s + * y' = m*(s - x') - 8*y^4 + * z' = 2*y*z + * + * These formulas work for all points, including points of order 2 + * and points at infinity: + * - If y = 0 then z' = 0. 
But there is no such point in P-256 + * anyway. + * - If z = 0 then z' = 0. + */ + uint64_t t1[4], t2[4], t3[4], t4[4]; + + /* + * Compute z^2 in t1. + */ + f256_montysquare(t1, P->z); + + /* + * Compute x-z^2 in t2 and x+z^2 in t1. + */ + f256_add(t2, P->x, t1); + f256_sub(t1, P->x, t1); + + /* + * Compute 3*(x+z^2)*(x-z^2) in t1. + */ + f256_montymul(t3, t1, t2); + f256_add(t1, t3, t3); + f256_add(t1, t3, t1); + + /* + * Compute 4*x*y^2 (in t2) and 2*y^2 (in t3). + */ + f256_montysquare(t3, P->y); + f256_add(t3, t3, t3); + f256_montymul(t2, P->x, t3); + f256_add(t2, t2, t2); + + /* + * Compute x' = m^2 - 2*s. + */ + f256_montysquare(P->x, t1); + f256_sub(P->x, P->x, t2); + f256_sub(P->x, P->x, t2); + + /* + * Compute z' = 2*y*z. + */ + f256_montymul(t4, P->y, P->z); + f256_add(P->z, t4, t4); + + /* + * Compute y' = m*(s - x') - 8*y^4. Note that we already have + * 2*y^2 in t3. + */ + f256_sub(t2, t2, P->x); + f256_montymul(P->y, t1, t2); + f256_montysquare(t4, t3); + f256_add(t4, t4, t4); + f256_sub(P->y, P->y, t4); +} + +/* + * Point addition (Jacobian coordinates): P1 is replaced with P1+P2. + * This function computes the wrong result in the following cases: + * + * - If P1 == 0 but P2 != 0 + * - If P1 != 0 but P2 == 0 + * - If P1 == P2 + * + * In all three cases, P1 is set to the point at infinity. + * + * Returned value is 0 if one of the following occurs: + * + * - P1 and P2 have the same Y coordinate. + * - P1 == 0 and P2 == 0. + * - The Y coordinate of one of the points is 0 and the other point is + * the point at infinity. + * + * The third case cannot actually happen with valid points, since a point + * with Y == 0 is a point of order 2, and there is no point of order 2 on + * curve P-256. + * + * Therefore, assuming that P1 != 0 and P2 != 0 on input, then the caller + * can apply the following: + * + * - If the result is not the point at infinity, then it is correct. + * - Otherwise, if the returned value is 1, then this is a case of + * P1+P2 == 0, so the result is indeed the point at infinity. + * - Otherwise, P1 == P2, so a "double" operation should have been + * performed. + * + * Note that you can get a returned value of 0 with a correct result, + * e.g. if P1 and P2 have the same Y coordinate, but distinct X coordinates. + */ +static uint32_t +p256_add(p256_jacobian *P1, const p256_jacobian *P2) +{ + /* + * Addition formulas are: + * + * u1 = x1 * z2^2 + * u2 = x2 * z1^2 + * s1 = y1 * z2^3 + * s2 = y2 * z1^3 + * h = u2 - u1 + * r = s2 - s1 + * x3 = r^2 - h^3 - 2 * u1 * h^2 + * y3 = r * (u1 * h^2 - x3) - s1 * h^3 + * z3 = h * z1 * z2 + */ + uint64_t t1[4], t2[4], t3[4], t4[4], t5[4], t6[4], t7[4], tt; + uint32_t ret; + + /* + * Compute u1 = x1*z2^2 (in t1) and s1 = y1*z2^3 (in t3). + */ + f256_montysquare(t3, P2->z); + f256_montymul(t1, P1->x, t3); + f256_montymul(t4, P2->z, t3); + f256_montymul(t3, P1->y, t4); + + /* + * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4). + */ + f256_montysquare(t4, P1->z); + f256_montymul(t2, P2->x, t4); + f256_montymul(t5, P1->z, t4); + f256_montymul(t4, P2->y, t5); + + /* + * Compute h = u2 - u1 (in t2) and r = s2 - s1 (in t4). + * We need to test whether r is zero, so we will do an extra + * reduction.
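+ *
+ * (Editorial sketch, not part of the original patch: the lines right
+ * below turn "is r non-zero?" into a 0/1 flag with the classic
+ * branchless idiom; for any uint32_t x,
+ *
+ *     uint32_t nz = (x | -x) >> 31;
+ *
+ * sets nz to 1 when x != 0 and to 0 when x == 0, since in two's
+ * complement at least one of x and -x has its top bit set whenever
+ * x is non-zero.)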
+ */ + f256_sub(t2, t2, t1); + f256_sub(t4, t4, t3); + f256_final_reduce(t4); + tt = t4[0] | t4[1] | t4[2] | t4[3]; + ret = (uint32_t)(tt | (tt >> 32)); + ret = (ret | -ret) >> 31; + + /* + * Compute u1*h^2 (in t6) and h^3 (in t5); + */ + f256_montysquare(t7, t2); + f256_montymul(t6, t1, t7); + f256_montymul(t5, t7, t2); + + /* + * Compute x3 = r^2 - h^3 - 2*u1*h^2. + */ + f256_montysquare(P1->x, t4); + f256_sub(P1->x, P1->x, t5); + f256_sub(P1->x, P1->x, t6); + f256_sub(P1->x, P1->x, t6); + + /* + * Compute y3 = r*(u1*h^2 - x3) - s1*h^3. + */ + f256_sub(t6, t6, P1->x); + f256_montymul(P1->y, t4, t6); + f256_montymul(t1, t5, t3); + f256_sub(P1->y, P1->y, t1); + + /* + * Compute z3 = h*z1*z2. + */ + f256_montymul(t1, P1->z, P2->z); + f256_montymul(P1->z, t1, t2); + + return ret; +} + +/* + * Point addition (mixed coordinates): P1 is replaced with P1+P2. + * This is a specialised function for the case when P2 is a non-zero point + * in affine coordinates. + * + * This function computes the wrong result in the following cases: + * + * - If P1 == 0 + * - If P1 == P2 + * + * In both cases, P1 is set to the point at infinity. + * + * Returned value is 0 if one of the following occurs: + * + * - P1 and P2 have the same Y (affine) coordinate. + * - The Y coordinate of P2 is 0 and P1 is the point at infinity. + * + * The second case cannot actually happen with valid points, since a point + * with Y == 0 is a point of order 2, and there is no point of order 2 on + * curve P-256. + * + * Therefore, assuming that P1 != 0 on input, then the caller + * can apply the following: + * + * - If the result is not the point at infinity, then it is correct. + * - Otherwise, if the returned value is 1, then this is a case of + * P1+P2 == 0, so the result is indeed the point at infinity. + * - Otherwise, P1 == P2, so a "double" operation should have been + * performed. + * + * Again, a value of 0 may be returned in some cases where the addition + * result is correct. + */ +static uint32_t +p256_add_mixed(p256_jacobian *P1, const p256_affine *P2) +{ + /* + * Addtions formulas are: + * + * u1 = x1 + * u2 = x2 * z1^2 + * s1 = y1 + * s2 = y2 * z1^3 + * h = u2 - u1 + * r = s2 - s1 + * x3 = r^2 - h^3 - 2 * u1 * h^2 + * y3 = r * (u1 * h^2 - x3) - s1 * h^3 + * z3 = h * z1 + */ + uint64_t t1[4], t2[4], t3[4], t4[4], t5[4], t6[4], t7[4], tt; + uint32_t ret; + + /* + * Compute u1 = x1 (in t1) and s1 = y1 (in t3). + */ + memcpy(t1, P1->x, sizeof t1); + memcpy(t3, P1->y, sizeof t3); + + /* + * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4). + */ + f256_montysquare(t4, P1->z); + f256_montymul(t2, P2->x, t4); + f256_montymul(t5, P1->z, t4); + f256_montymul(t4, P2->y, t5); + + /* + * Compute h = h2 - u1 (in t2) and r = s2 - s1 (in t4). + * We need to test whether r is zero, so we will do some extra + * reduce. + */ + f256_sub(t2, t2, t1); + f256_sub(t4, t4, t3); + f256_final_reduce(t4); + tt = t4[0] | t4[1] | t4[2] | t4[3]; + ret = (uint32_t)(tt | (tt >> 32)); + ret = (ret | -ret) >> 31; + + /* + * Compute u1*h^2 (in t6) and h^3 (in t5); + */ + f256_montysquare(t7, t2); + f256_montymul(t6, t1, t7); + f256_montymul(t5, t7, t2); + + /* + * Compute x3 = r^2 - h^3 - 2*u1*h^2. + */ + f256_montysquare(P1->x, t4); + f256_sub(P1->x, P1->x, t5); + f256_sub(P1->x, P1->x, t6); + f256_sub(P1->x, P1->x, t6); + + /* + * Compute y3 = r*(u1*h^2 - x3) - s1*h^3. + */ + f256_sub(t6, t6, P1->x); + f256_montymul(P1->y, t4, t6); + f256_montymul(t1, t5, t3); + f256_sub(P1->y, P1->y, t1); + + /* + * Compute z3 = h*z1*z2. 
+ */ + f256_montymul(P1->z, P1->z, t2); + + return ret; +} + +#if 0 +/* unused */ +/* + * Point addition (mixed coordinates, complete): P1 is replaced with P1+P2. + * This is a specialised function for the case when P2 is a non-zero point + * in affine coordinates. + * + * This function returns the correct result in all cases. + */ +static uint32_t +p256_add_complete_mixed(p256_jacobian *P1, const p256_affine *P2) +{ + /* + * Addtions formulas, in the general case, are: + * + * u1 = x1 + * u2 = x2 * z1^2 + * s1 = y1 + * s2 = y2 * z1^3 + * h = u2 - u1 + * r = s2 - s1 + * x3 = r^2 - h^3 - 2 * u1 * h^2 + * y3 = r * (u1 * h^2 - x3) - s1 * h^3 + * z3 = h * z1 + * + * These formulas mishandle the two following cases: + * + * - If P1 is the point-at-infinity (z1 = 0), then z3 is + * incorrectly set to 0. + * + * - If P1 = P2, then u1 = u2 and s1 = s2, and x3, y3 and z3 + * are all set to 0. + * + * However, if P1 + P2 = 0, then u1 = u2 but s1 != s2, and then + * we correctly get z3 = 0 (the point-at-infinity). + * + * To fix the case P1 = 0, we perform at the end a copy of P2 + * over P1, conditional to z1 = 0. + * + * For P1 = P2: in that case, both h and r are set to 0, and + * we get x3, y3 and z3 equal to 0. We can test for that + * occurrence to make a mask which will be all-one if P1 = P2, + * or all-zero otherwise; then we can compute the double of P2 + * and add it, combined with the mask, to (x3,y3,z3). + * + * Using the doubling formulas in p256_double() on (x2,y2), + * simplifying since P2 is affine (i.e. z2 = 1, implicitly), + * we get: + * s = 4*x2*y2^2 + * m = 3*(x2 + 1)*(x2 - 1) + * x' = m^2 - 2*s + * y' = m*(s - x') - 8*y2^4 + * z' = 2*y2 + * which requires only 6 multiplications. Added to the 11 + * multiplications of the normal mixed addition in Jacobian + * coordinates, we get a cost of 17 multiplications in total. + */ + uint64_t t1[4], t2[4], t3[4], t4[4], t5[4], t6[4], t7[4], tt, zz; + int i; + + /* + * Set zz to -1 if P1 is the point at infinity, 0 otherwise. + */ + zz = P1->z[0] | P1->z[1] | P1->z[2] | P1->z[3]; + zz = ((zz | -zz) >> 63) - (uint64_t)1; + + /* + * Compute u1 = x1 (in t1) and s1 = y1 (in t3). + */ + memcpy(t1, P1->x, sizeof t1); + memcpy(t3, P1->y, sizeof t3); + + /* + * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4). + */ + f256_montysquare(t4, P1->z); + f256_montymul(t2, P2->x, t4); + f256_montymul(t5, P1->z, t4); + f256_montymul(t4, P2->y, t5); + + /* + * Compute h = h2 - u1 (in t2) and r = s2 - s1 (in t4). + * reduce. + */ + f256_sub(t2, t2, t1); + f256_sub(t4, t4, t3); + + /* + * If both h = 0 and r = 0, then P1 = P2, and we want to set + * the mask tt to -1; otherwise, the mask will be 0. + */ + f256_final_reduce(t2); + f256_final_reduce(t4); + tt = t2[0] | t2[1] | t2[2] | t2[3] | t4[0] | t4[1] | t4[2] | t4[3]; + tt = ((tt | -tt) >> 63) - (uint64_t)1; + + /* + * Compute u1*h^2 (in t6) and h^3 (in t5); + */ + f256_montysquare(t7, t2); + f256_montymul(t6, t1, t7); + f256_montymul(t5, t7, t2); + + /* + * Compute x3 = r^2 - h^3 - 2*u1*h^2. + */ + f256_montysquare(P1->x, t4); + f256_sub(P1->x, P1->x, t5); + f256_sub(P1->x, P1->x, t6); + f256_sub(P1->x, P1->x, t6); + + /* + * Compute y3 = r*(u1*h^2 - x3) - s1*h^3. + */ + f256_sub(t6, t6, P1->x); + f256_montymul(P1->y, t4, t6); + f256_montymul(t1, t5, t3); + f256_sub(P1->y, P1->y, t1); + + /* + * Compute z3 = h*z1. + */ + f256_montymul(P1->z, P1->z, t2); + + /* + * The "double" result, in case P1 = P2. + */ + + /* + * Compute z' = 2*y2 (in t1). 
+ */ + f256_add(t1, P2->y, P2->y); + + /* + * Compute 2*(y2^2) (in t2) and s = 4*x2*(y2^2) (in t3). + */ + f256_montysquare(t2, P2->y); + f256_add(t2, t2, t2); + f256_add(t3, t2, t2); + f256_montymul(t3, P2->x, t3); + + /* + * Compute m = 3*(x2^2 - 1) (in t4). + */ + f256_montysquare(t4, P2->x); + f256_sub(t4, t4, F256_R); + f256_add(t5, t4, t4); + f256_add(t4, t4, t5); + + /* + * Compute x' = m^2 - 2*s (in t5). + */ + f256_montysquare(t5, t4); + f256_sub(t5, t3); + f256_sub(t5, t3); + + /* + * Compute y' = m*(s - x') - 8*y2^4 (in t6). + */ + f256_sub(t6, t3, t5); + f256_montymul(t6, t6, t4); + f256_montysquare(t7, t2); + f256_sub(t6, t6, t7); + f256_sub(t6, t6, t7); + + /* + * We now have the alternate (doubling) coordinates in (t5,t6,t1). + * We combine them with (x3,y3,z3). + */ + for (i = 0; i < 4; i ++) { + P1->x[i] |= tt & t5[i]; + P1->y[i] |= tt & t6[i]; + P1->z[i] |= tt & t1[i]; + } + + /* + * If P1 = 0, then we get z3 = 0 (which is invalid); if z1 is 0, + * then we want to replace the result with a copy of P2. The + * test on z1 was done at the start, in the zz mask. + */ + for (i = 0; i < 4; i ++) { + P1->x[i] ^= zz & (P1->x[i] ^ P2->x[i]); + P1->y[i] ^= zz & (P1->y[i] ^ P2->y[i]); + P1->z[i] ^= zz & (P1->z[i] ^ F256_R[i]); + } +} +#endif + +/* + * Inner function for computing a point multiplication. A window is + * provided, with points 1*P to 15*P in affine coordinates. + * + * Assumptions: + * - All provided points are valid points on the curve. + * - Multiplier is non-zero, and smaller than the curve order. + * - Everything is in Montgomery representation. + */ +static void +point_mul_inner(p256_jacobian *R, const p256_affine *W, + const unsigned char *k, size_t klen) +{ + p256_jacobian Q; + uint32_t qz; + + memset(&Q, 0, sizeof Q); + qz = 1; + while (klen -- > 0) { + int i; + unsigned bk; + + bk = *k ++; + for (i = 0; i < 2; i ++) { + uint32_t bits; + uint32_t bnz; + p256_affine T; + p256_jacobian U; + uint32_t n; + int j; + uint64_t m; + + p256_double(&Q); + p256_double(&Q); + p256_double(&Q); + p256_double(&Q); + bits = (bk >> 4) & 0x0F; + bnz = NEQ(bits, 0); + + /* + * Lookup point in window. If the bits are 0, + * we get something invalid, which is not a + * problem because we will use it only if the + * bits are non-zero. + */ + memset(&T, 0, sizeof T); + for (n = 0; n < 15; n ++) { + m = -(uint64_t)EQ(bits, n + 1); + T.x[0] |= m & W[n].x[0]; + T.x[1] |= m & W[n].x[1]; + T.x[2] |= m & W[n].x[2]; + T.x[3] |= m & W[n].x[3]; + T.y[0] |= m & W[n].y[0]; + T.y[1] |= m & W[n].y[1]; + T.y[2] |= m & W[n].y[2]; + T.y[3] |= m & W[n].y[3]; + } + + U = Q; + p256_add_mixed(&U, &T); + + /* + * If qz is still 1, then Q was all-zeros, and this + * is conserved through p256_double(). + */ + m = -(uint64_t)(bnz & qz); + for (j = 0; j < 4; j ++) { + Q.x[j] |= m & T.x[j]; + Q.y[j] |= m & T.y[j]; + Q.z[j] |= m & F256_R[j]; + } + CCOPY(bnz & ~qz, &Q, &U, sizeof Q); + qz &= ~bnz; + bk <<= 4; + } + } + *R = Q; +} + +/* + * Convert a window from Jacobian to affine coordinates. A single + * field inversion is used. This function works for windows up to + * 32 elements. + * + * The destination array (aff[]) and the source array (jac[]) may + * overlap, provided that the start of aff[] is not after the start of + * jac[]. Even if the arrays do _not_ overlap, the source array is + * modified. + */ +static void +window_to_affine(p256_affine *aff, p256_jacobian *jac, int num) +{ + /* + * Convert the window points to affine coordinates. 
We use the + * following trick to mutualize the inversion computation: if + * we have z1, z2, z3, and z4, and want to invert all of them, + * we compute u = 1/(z1*z2*z3*z4), and then we have: + * 1/z1 = u*z2*z3*z4 + * 1/z2 = u*z1*z3*z4 + * 1/z3 = u*z1*z2*z4 + * 1/z4 = u*z1*z2*z3 + * + * The partial products are computed recursively: + * + * - on input (z_1,z_2), return (z_2,z_1) and z_1*z_2 + * - on input (z_1,z_2,... z_n): + * recurse on (z_1,z_2,... z_(n/2)) -> r1 and m1 + * recurse on (z_(n/2+1),z_(n/2+2)... z_n) -> r2 and m2 + * multiply elements of r1 by m2 -> s1 + * multiply elements of r2 by m1 -> s2 + * return r1||r2 and m1*m2 + * + * In the example below, we suppose that we have 14 elements. + * Let z1, z2,... zE be the 14 values to invert (index noted in + * hexadecimal, starting at 1). + * + * - Depth 1: + * swap(z1, z2); z12 = z1*z2 + * swap(z3, z4); z34 = z3*z4 + * swap(z5, z6); z56 = z5*z6 + * swap(z7, z8); z78 = z7*z8 + * swap(z9, zA); z9A = z9*zA + * swap(zB, zC); zBC = zB*zC + * swap(zD, zE); zDE = zD*zE + * + * - Depth 2: + * z1 <- z1*z34, z2 <- z2*z34, z3 <- z3*z12, z4 <- z4*z12 + * z1234 = z12*z34 + * z5 <- z5*z78, z6 <- z6*z78, z7 <- z7*z56, z8 <- z8*z56 + * z5678 = z56*z78 + * z9 <- z9*zBC, zA <- zA*zBC, zB <- zB*z9A, zC <- zC*z9A + * z9ABC = z9A*zBC + * + * - Depth 3: + * z1 <- z1*z5678, z2 <- z2*z5678, z3 <- z3*z5678, z4 <- z4*z5678 + * z5 <- z5*z1234, z6 <- z6*z1234, z7 <- z7*z1234, z8 <- z8*z1234 + * z12345678 = z1234*z5678 + * z9 <- z9*zDE, zA <- zA*zDE, zB <- zB*zDE, zC <- zC*zDE + * zD <- zD*z9ABC, zE <- zE*z9ABC + * z9ABCDE = z9ABC*zDE + * + * - Depth 4: + * multiply z1..z8 by z9ABCDE + * multiply z9..zE by z12345678 + * final z = z12345678*z9ABCDE + */ + + uint64_t z[16][4]; + int i, k, s; +#define zt (z[15]) +#define zu (z[14]) +#define zv (z[13]) + + /* + * First recursion step (pairwise swapping and multiplication). + * If there is an odd number of elements, then we "invent" an + * extra one with coordinate Z = 1 (in Montgomery representation). + */ + for (i = 0; (i + 1) < num; i += 2) { + memcpy(zt, jac[i].z, sizeof zt); + memcpy(jac[i].z, jac[i + 1].z, sizeof zt); + memcpy(jac[i + 1].z, zt, sizeof zt); + f256_montymul(z[i >> 1], jac[i].z, jac[i + 1].z); + } + if ((num & 1) != 0) { + memcpy(z[num >> 1], jac[num - 1].z, sizeof zt); + memcpy(jac[num - 1].z, F256_R, sizeof F256_R); + } + + /* + * Perform further recursion steps. At the entry of each step, + * the process has been done for groups of 's' points. The + * integer k is the log2 of s. + */ + for (k = 1, s = 2; s < num; k ++, s <<= 1) { + int n; + + for (i = 0; i < num; i ++) { + f256_montymul(jac[i].z, jac[i].z, z[(i >> k) ^ 1]); + } + n = (num + s - 1) >> k; + for (i = 0; i < (n >> 1); i ++) { + f256_montymul(z[i], z[i << 1], z[(i << 1) + 1]); + } + if ((n & 1) != 0) { + memmove(z[n >> 1], z[n], sizeof zt); + } + } + + /* + * Invert the final result, and convert all points. + */ + f256_invert(zt, z[0]); + for (i = 0; i < num; i ++) { + f256_montymul(zv, jac[i].z, zt); + f256_montysquare(zu, zv); + f256_montymul(zv, zv, zu); + f256_montymul(aff[i].x, jac[i].x, zu); + f256_montymul(aff[i].y, jac[i].y, zv); + } +} + +/* + * Multiply the provided point by an integer. + * Assumptions: + * - Source point is a valid curve point. + * - Source point is not the point-at-infinity. + * - Integer is not 0, and is lower than the curve order. + * If these conditions are not met, then the result is indeterminate + * (but the process is still constant-time).
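+ *
+ * (Editorial sketch, not part of the original patch: with the 4-bit
+ * window built below, a full 32-byte scalar costs 256 doublings and
+ * at most 64 window additions, e.g.:
+ *
+ *     p256_jacobian P;
+ *     unsigned char k[32];   (big-endian scalar, 0 < k < n)
+ *     ... fill P and k ...
+ *     p256_mul(&P, k, sizeof k);
+ *
+ * point_mul_inner() consumes k nibble by nibble: four doublings and
+ * one constant-time window lookup per nibble.)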
+ */ +static void +p256_mul(p256_jacobian *P, const unsigned char *k, size_t klen) +{ + union { + p256_affine aff[15]; + p256_jacobian jac[15]; + } window; + int i; + + /* + * Compute window, in Jacobian coordinates. + */ + window.jac[0] = *P; + for (i = 2; i < 16; i ++) { + window.jac[i - 1] = window.jac[(i >> 1) - 1]; + if ((i & 1) == 0) { + p256_double(&window.jac[i - 1]); + } else { + p256_add(&window.jac[i - 1], &window.jac[i >> 1]); + } + } + + /* + * Convert the window points to affine coordinates. Point + * window[0] is the source point, already in affine coordinates. + */ + window_to_affine(window.aff, window.jac, 15); + + /* + * Perform point multiplication. + */ + point_mul_inner(P, window.aff, k, klen); +} + +/* + * Precomputed window for the conventional generator: P256_Gwin[n] + * contains (n+1)*G (affine coordinates, in Montgomery representation). + */ +static const p256_affine P256_Gwin[] = { + { + { 0x79E730D418A9143C, 0x75BA95FC5FEDB601, + 0x79FB732B77622510, 0x18905F76A53755C6 }, + { 0xDDF25357CE95560A, 0x8B4AB8E4BA19E45C, + 0xD2E88688DD21F325, 0x8571FF1825885D85 } + }, + { + { 0x850046D410DDD64D, 0xAA6AE3C1A433827D, + 0x732205038D1490D9, 0xF6BB32E43DCF3A3B }, + { 0x2F3648D361BEE1A5, 0x152CD7CBEB236FF8, + 0x19A8FB0E92042DBE, 0x78C577510A5B8A3B } + }, + { + { 0xFFAC3F904EEBC127, 0xB027F84A087D81FB, + 0x66AD77DD87CBBC98, 0x26936A3FB6FF747E }, + { 0xB04C5C1FC983A7EB, 0x583E47AD0861FE1A, + 0x788208311A2EE98E, 0xD5F06A29E587CC07 } + }, + { + { 0x74B0B50D46918DCC, 0x4650A6EDC623C173, + 0x0CDAACACE8100AF2, 0x577362F541B0176B }, + { 0x2D96F24CE4CBABA6, 0x17628471FAD6F447, + 0x6B6C36DEE5DDD22E, 0x84B14C394C5AB863 } + }, + { + { 0xBE1B8AAEC45C61F5, 0x90EC649A94B9537D, + 0x941CB5AAD076C20C, 0xC9079605890523C8 }, + { 0xEB309B4AE7BA4F10, 0x73C568EFE5EB882B, + 0x3540A9877E7A1F68, 0x73A076BB2DD1E916 } + }, + { + { 0x403947373E77664A, 0x55AE744F346CEE3E, + 0xD50A961A5B17A3AD, 0x13074B5954213673 }, + { 0x93D36220D377E44B, 0x299C2B53ADFF14B5, + 0xF424D44CEF639F11, 0xA4C9916D4A07F75F } + }, + { + { 0x0746354EA0173B4F, 0x2BD20213D23C00F7, + 0xF43EAAB50C23BB08, 0x13BA5119C3123E03 }, + { 0x2847D0303F5B9D4D, 0x6742F2F25DA67BDD, + 0xEF933BDC77C94195, 0xEAEDD9156E240867 } + }, + { + { 0x27F14CD19499A78F, 0x462AB5C56F9B3455, + 0x8F90F02AF02CFC6B, 0xB763891EB265230D }, + { 0xF59DA3A9532D4977, 0x21E3327DCF9EBA15, + 0x123C7B84BE60BBF0, 0x56EC12F27706DF76 } + }, + { + { 0x75C96E8F264E20E8, 0xABE6BFED59A7A841, + 0x2CC09C0444C8EB00, 0xE05B3080F0C4E16B }, + { 0x1EB7777AA45F3314, 0x56AF7BEDCE5D45E3, + 0x2B6E019A88B12F1A, 0x086659CDFD835F9B } + }, + { + { 0x2C18DBD19DC21EC8, 0x98F9868A0FCF8139, + 0x737D2CD648250B49, 0xCC61C94724B3428F }, + { 0x0C2B407880DD9E76, 0xC43A8991383FBE08, + 0x5F7D2D65779BE5D2, 0x78719A54EB3B4AB5 } + }, + { + { 0xEA7D260A6245E404, 0x9DE407956E7FDFE0, + 0x1FF3A4158DAC1AB5, 0x3E7090F1649C9073 }, + { 0x1A7685612B944E88, 0x250F939EE57F61C8, + 0x0C0DAA891EAD643D, 0x68930023E125B88E } + }, + { + { 0x04B71AA7D2697768, 0xABDEDEF5CA345A33, + 0x2409D29DEE37385E, 0x4EE1DF77CB83E156 }, + { 0x0CAC12D91CBB5B43, 0x170ED2F6CA895637, + 0x28228CFA8ADE6D66, 0x7FF57C9553238ACA } + }, + { + { 0xCCC425634B2ED709, 0x0E356769856FD30D, + 0xBCBCD43F559E9811, 0x738477AC5395B759 }, + { 0x35752B90C00EE17F, 0x68748390742ED2E3, + 0x7CD06422BD1F5BC1, 0xFBC08769C9E7B797 } + }, + { + { 0xA242A35BB0CF664A, 0x126E48F77F9707E3, + 0x1717BF54C6832660, 0xFAAE7332FD12C72E }, + { 0x27B52DB7995D586B, 0xBE29569E832237C2, + 0xE8E4193E2A65E7DB, 0x152706DC2EAA1BBB } + }, + { + { 0x72BCD8B7BC60055B, 0x03CC23EE56E27E4B, + 
0xEE337424E4819370, 0xE2AA0E430AD3DA09 }, + { 0x40B8524F6383C45D, 0xD766355442A41B25, + 0x64EFA6DE778A4797, 0x2042170A7079ADF4 } + } +}; + +/* + * Multiply the conventional generator of the curve by the provided + * integer. The result is written in *P. + * + * Assumptions: + * - Integer is not 0, and is lower than the curve order. + * If this condition is not met, then the result is indeterminate + * (but the process is still constant-time). + */ +static void +p256_mulgen(p256_jacobian *P, const unsigned char *k, size_t klen) +{ + point_mul_inner(P, P256_Gwin, k, klen); +} + +/* + * Return 1 if all of the following hold: + * - klen <= 32 + * - k != 0 + * - k is lower than the curve order + * Otherwise, return 0. + * + * Constant-time behaviour: only klen may be observable. + */ +static uint32_t +check_scalar(const unsigned char *k, size_t klen) +{ + uint32_t z; + int32_t c; + size_t u; + + if (klen > 32) { + return 0; + } + z = 0; + for (u = 0; u < klen; u ++) { + z |= k[u]; + } + if (klen == 32) { + c = 0; + for (u = 0; u < klen; u ++) { + c |= -(int32_t)EQ0(c) & CMP(k[u], P256_N[u]); + } + } else { + c = -1; + } + return NEQ(z, 0) & LT0(c); +} + +static uint32_t +api_mul(unsigned char *G, size_t Glen, + const unsigned char *k, size_t klen, int curve) +{ + uint32_t r; + p256_jacobian P; + + (void)curve; + if (Glen != 65) { + return 0; + } + r = check_scalar(k, klen); + r &= point_decode(&P, G); + p256_mul(&P, k, klen); + r &= point_encode(G, &P); + return r; +} + +static size_t +api_mulgen(unsigned char *R, + const unsigned char *k, size_t klen, int curve) +{ + p256_jacobian P; + + (void)curve; + p256_mulgen(&P, k, klen); + point_encode(R, &P); + return 65; +} + +static uint32_t +api_muladd(unsigned char *A, const unsigned char *B, size_t len, + const unsigned char *x, size_t xlen, + const unsigned char *y, size_t ylen, int curve) +{ + /* + * We might want to use Shamir's trick here: make a composite + * window of u*P+v*Q points, to merge the two doubling-ladders + * into one. This, however, has some complications: + * + * - During the computation, we may hit the point-at-infinity. + * Thus, we would need p256_add_complete_mixed() (complete + * formulas for point addition), with a higher cost (17 muls + * instead of 11). + * + * - A 4-bit window would be too large, since it would involve + * 16*16-1 = 255 points. For the same window size as in the + * p256_mul() case, we would need to reduce the window size + * to 2 bits, and thus perform twice as many non-doubling + * point additions. + * + * - The window may itself contain the point-at-infinity, and + * thus cannot, in all generality, be made of affine points. + * Instead, we would need to make it a window of points in + * Jacobian coordinates. Even p256_add_complete_mixed() would + * be inappropriate. + * + * For these reasons, the code below performs two separate + * point multiplications, then computes the final point addition + * (which is both a "normal" addition, and a doubling, to handle + * all cases). + */ + + p256_jacobian P, Q; + uint32_t r, t, s; + uint64_t z; + + (void)curve; + if (len != 65) { + return 0; + } + r = point_decode(&P, A); + p256_mul(&P, x, xlen); + if (B == NULL) { + p256_mulgen(&Q, y, ylen); + } else { + r &= point_decode(&Q, B); + p256_mul(&Q, y, ylen); + } + + /* + * The final addition may fail in case both points are equal.
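+ *
+ * (Editorial sketch, not part of the original patch: a typical
+ * caller is ECDSA verification, which needs u1*G + u2*Q. With this
+ * API, A holds the encoded point Q and B == NULL selects the
+ * conventional generator:
+ *
+ *     r = api_muladd(A, NULL, 65, u2, u2_len, u1, u1_len, curve);
+ *
+ * i.e. x/xlen multiplies the point in A, and y/ylen multiplies B,
+ * or G when B is NULL.)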
+ */ + t = p256_add(&P, &Q); + f256_final_reduce(P.z); + z = P.z[0] | P.z[1] | P.z[2] | P.z[3]; + s = EQ((uint32_t)(z | (z >> 32)), 0); + p256_double(&Q); + + /* + * If s is 1 then either P+Q = 0 (t = 1) or P = Q (t = 0). So we + * have the following: + * + * s = 0, t = 0 return P (normal addition) + * s = 0, t = 1 return P (normal addition) + * s = 1, t = 0 return Q (a 'double' case) + * s = 1, t = 1 report an error (P+Q = 0) + */ + CCOPY(s & ~t, &P, &Q, sizeof Q); + point_encode(A, &P); + r &= ~(s & t); + return r; +} + +/* see bearssl_ec.h */ +const br_ec_impl br_ec_p256_m64 = { + (uint32_t)0x00800000, + &api_generator, + &api_order, + &api_xoff, + &api_mul, + &api_mulgen, + &api_muladd +}; + +/* see bearssl_ec.h */ +const br_ec_impl * +br_ec_p256_m64_get(void) +{ + return &br_ec_p256_m64; +} + +#else + +/* see bearssl_ec.h */ +const br_ec_impl * +br_ec_p256_m64_get(void) +{ + return 0; +} + +#endif diff --git a/src/bearssl/src/int/i31_montmul.c b/src/bearssl/src/int/i31_montmul.c index 8066808..758f8f4 100644 --- a/src/bearssl/src/int/i31_montmul.c +++ b/src/bearssl/src/int/i31_montmul.c @@ -29,16 +29,45 @@ void br_i31_montymul(uint32_t *d, const uint32_t *x, const uint32_t *y, const uint32_t *m, uint32_t m0i) { + /* + * Each outer loop iteration computes: + * d <- (d + xu*y + f*m) / 2^31 + * We have xu <= 2^31-1 and f <= 2^31-1. + * Thus, if d <= 2*m-1 on input, then: + * 2*m-1 + 2*(2^31-1)*m <= (2^32)*m-1 + * and the new d value is less than 2*m. + * + * We represent d over 31-bit words, with an extra word 'dh' + * which can thus be only 0 or 1. + */ size_t len, len4, u, v; - uint64_t dh; + uint32_t dh; len = (m[0] + 31) >> 5; len4 = len & ~(size_t)3; br_i31_zero(d, m[0]); dh = 0; for (u = 0; u < len; u ++) { + /* + * The carry for each operation fits on 32 bits: + * d[v+1] <= 2^31-1 + * xu*y[v+1] <= (2^31-1)*(2^31-1) + * f*m[v+1] <= (2^31-1)*(2^31-1) + * r <= 2^32-1 + * (2^31-1) + 2*(2^31-1)*(2^31-1) + (2^32-1) = 2^63 - 2^31 + * After division by 2^31, the new r is then at most 2^32-1 + * + * Using a 32-bit carry has performance benefits on 32-bit + * systems; however, on 64-bit architectures, we prefer to + * keep the carry (r) in a 64-bit register, thus avoiding some + * "clear high bits" operations. + */ uint32_t f, xu; - uint64_t r, zh; +#if BR_64 + uint64_t r; +#else + uint32_t r; +#endif xu = x[u + 1]; f = MUL31_lo((d[1] + MUL31_lo(x[u + 1], y[1])), m0i); @@ -73,9 +102,14 @@ br_i31_montymul(uint32_t *d, const uint32_t *x, const uint32_t *y, d[v] = (uint32_t)z & 0x7FFFFFFF; } - zh = dh + r; - d[len] = (uint32_t)zh & 0x7FFFFFFF; - dh = zh >> 31; + /* + * Since the new dh can only be 0 or 1, the addition of + * the old dh with the carry MUST fit on 32 bits, and + * thus can be done into dh itself. + */ + dh += r; + d[len] = dh & 0x7FFFFFFF; + dh >>= 31; } /* diff --git a/src/bearssl/src/int/i31_mulacc.c b/src/bearssl/src/int/i31_mulacc.c index 024d095..7410e54 100644 --- a/src/bearssl/src/int/i31_mulacc.c +++ b/src/bearssl/src/int/i31_mulacc.c @@ -45,7 +45,20 @@ br_i31_mulacc(uint32_t *d, const uint32_t *a, const uint32_t *b) for (u = 0; u < blen; u ++) { uint32_t f; size_t v; + + /* + * Carry always fits on 31 bits; we want to keep it in a + * 32-bit register on 32-bit architectures (on a 64-bit + * architecture, cast down from 64 to 32 bits means + * clearing the high bits, which is not free; on a 32-bit + * architecture, the same operation really means ignoring + * the top register, which has negative or zero cost). 
+ */ +#if BR_64 uint64_t cc; +#else + uint32_t cc; +#endif f = b[1 + u]; cc = 0; diff --git a/src/bearssl/src/int/i32_mulacc.c b/src/bearssl/src/int/i32_mulacc.c index f62c782..55da385 100644 --- a/src/bearssl/src/int/i32_mulacc.c +++ b/src/bearssl/src/int/i32_mulacc.c @@ -36,7 +36,11 @@ br_i32_mulacc(uint32_t *d, const uint32_t *a, const uint32_t *b) for (u = 0; u < blen; u ++) { uint32_t f; size_t v; +#if BR_64 uint64_t cc; +#else + uint32_t cc; +#endif f = b[1 + u]; cc = 0; diff --git a/src/bearssl/src/kdf/shake.c b/src/bearssl/src/kdf/shake.c new file mode 100644 index 0000000..80d7176 --- /dev/null +++ b/src/bearssl/src/kdf/shake.c @@ -0,0 +1,590 @@ +/* + * Copyright (c) 2018 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "inner.h" + +/* + * Round constants. + */ +static const uint64_t RC[] = { + 0x0000000000000001, 0x0000000000008082, + 0x800000000000808A, 0x8000000080008000, + 0x000000000000808B, 0x0000000080000001, + 0x8000000080008081, 0x8000000000008009, + 0x000000000000008A, 0x0000000000000088, + 0x0000000080008009, 0x000000008000000A, + 0x000000008000808B, 0x800000000000008B, + 0x8000000000008089, 0x8000000000008003, + 0x8000000000008002, 0x8000000000000080, + 0x000000000000800A, 0x800000008000000A, + 0x8000000080008081, 0x8000000000008080, + 0x0000000080000001, 0x8000000080008008 +}; + +/* + * XOR a block of data into the provided state. This supports only + * blocks whose length is a multiple of 64 bits. + */ +static void +xor_block(uint64_t *A, const void *data, size_t rate) +{ + size_t u; + + for (u = 0; u < rate; u += 8) { + A[u >> 3] ^= br_dec64le((const unsigned char *)data + u); + } +} + +/* + * Process a block with the provided data. The data length must be a + * multiple of 8 (in bytes); normally, this is the "rate". + */ +static void +process_block(uint64_t *A) +{ + uint64_t t0, t1, t2, t3, t4; + uint64_t tt0, tt1, tt2, tt3; + uint64_t t, kt; + uint64_t c0, c1, c2, c3, c4, bnn; + int j; + + /* + * Compute the 24 rounds. This loop is partially unrolled (each + * iteration computes two rounds). 
+ */ + for (j = 0; j < 24; j += 2) { + + tt0 = A[ 1] ^ A[ 6]; + tt1 = A[11] ^ A[16]; + tt0 ^= A[21] ^ tt1; + tt0 = (tt0 << 1) | (tt0 >> 63); + tt2 = A[ 4] ^ A[ 9]; + tt3 = A[14] ^ A[19]; + tt0 ^= A[24]; + tt2 ^= tt3; + t0 = tt0 ^ tt2; + + tt0 = A[ 2] ^ A[ 7]; + tt1 = A[12] ^ A[17]; + tt0 ^= A[22] ^ tt1; + tt0 = (tt0 << 1) | (tt0 >> 63); + tt2 = A[ 0] ^ A[ 5]; + tt3 = A[10] ^ A[15]; + tt0 ^= A[20]; + tt2 ^= tt3; + t1 = tt0 ^ tt2; + + tt0 = A[ 3] ^ A[ 8]; + tt1 = A[13] ^ A[18]; + tt0 ^= A[23] ^ tt1; + tt0 = (tt0 << 1) | (tt0 >> 63); + tt2 = A[ 1] ^ A[ 6]; + tt3 = A[11] ^ A[16]; + tt0 ^= A[21]; + tt2 ^= tt3; + t2 = tt0 ^ tt2; + + tt0 = A[ 4] ^ A[ 9]; + tt1 = A[14] ^ A[19]; + tt0 ^= A[24] ^ tt1; + tt0 = (tt0 << 1) | (tt0 >> 63); + tt2 = A[ 2] ^ A[ 7]; + tt3 = A[12] ^ A[17]; + tt0 ^= A[22]; + tt2 ^= tt3; + t3 = tt0 ^ tt2; + + tt0 = A[ 0] ^ A[ 5]; + tt1 = A[10] ^ A[15]; + tt0 ^= A[20] ^ tt1; + tt0 = (tt0 << 1) | (tt0 >> 63); + tt2 = A[ 3] ^ A[ 8]; + tt3 = A[13] ^ A[18]; + tt0 ^= A[23]; + tt2 ^= tt3; + t4 = tt0 ^ tt2; + + A[ 0] = A[ 0] ^ t0; + A[ 5] = A[ 5] ^ t0; + A[10] = A[10] ^ t0; + A[15] = A[15] ^ t0; + A[20] = A[20] ^ t0; + A[ 1] = A[ 1] ^ t1; + A[ 6] = A[ 6] ^ t1; + A[11] = A[11] ^ t1; + A[16] = A[16] ^ t1; + A[21] = A[21] ^ t1; + A[ 2] = A[ 2] ^ t2; + A[ 7] = A[ 7] ^ t2; + A[12] = A[12] ^ t2; + A[17] = A[17] ^ t2; + A[22] = A[22] ^ t2; + A[ 3] = A[ 3] ^ t3; + A[ 8] = A[ 8] ^ t3; + A[13] = A[13] ^ t3; + A[18] = A[18] ^ t3; + A[23] = A[23] ^ t3; + A[ 4] = A[ 4] ^ t4; + A[ 9] = A[ 9] ^ t4; + A[14] = A[14] ^ t4; + A[19] = A[19] ^ t4; + A[24] = A[24] ^ t4; + A[ 5] = (A[ 5] << 36) | (A[ 5] >> (64 - 36)); + A[10] = (A[10] << 3) | (A[10] >> (64 - 3)); + A[15] = (A[15] << 41) | (A[15] >> (64 - 41)); + A[20] = (A[20] << 18) | (A[20] >> (64 - 18)); + A[ 1] = (A[ 1] << 1) | (A[ 1] >> (64 - 1)); + A[ 6] = (A[ 6] << 44) | (A[ 6] >> (64 - 44)); + A[11] = (A[11] << 10) | (A[11] >> (64 - 10)); + A[16] = (A[16] << 45) | (A[16] >> (64 - 45)); + A[21] = (A[21] << 2) | (A[21] >> (64 - 2)); + A[ 2] = (A[ 2] << 62) | (A[ 2] >> (64 - 62)); + A[ 7] = (A[ 7] << 6) | (A[ 7] >> (64 - 6)); + A[12] = (A[12] << 43) | (A[12] >> (64 - 43)); + A[17] = (A[17] << 15) | (A[17] >> (64 - 15)); + A[22] = (A[22] << 61) | (A[22] >> (64 - 61)); + A[ 3] = (A[ 3] << 28) | (A[ 3] >> (64 - 28)); + A[ 8] = (A[ 8] << 55) | (A[ 8] >> (64 - 55)); + A[13] = (A[13] << 25) | (A[13] >> (64 - 25)); + A[18] = (A[18] << 21) | (A[18] >> (64 - 21)); + A[23] = (A[23] << 56) | (A[23] >> (64 - 56)); + A[ 4] = (A[ 4] << 27) | (A[ 4] >> (64 - 27)); + A[ 9] = (A[ 9] << 20) | (A[ 9] >> (64 - 20)); + A[14] = (A[14] << 39) | (A[14] >> (64 - 39)); + A[19] = (A[19] << 8) | (A[19] >> (64 - 8)); + A[24] = (A[24] << 14) | (A[24] >> (64 - 14)); + bnn = ~A[12]; + kt = A[ 6] | A[12]; + c0 = A[ 0] ^ kt; + kt = bnn | A[18]; + c1 = A[ 6] ^ kt; + kt = A[18] & A[24]; + c2 = A[12] ^ kt; + kt = A[24] | A[ 0]; + c3 = A[18] ^ kt; + kt = A[ 0] & A[ 6]; + c4 = A[24] ^ kt; + A[ 0] = c0; + A[ 6] = c1; + A[12] = c2; + A[18] = c3; + A[24] = c4; + bnn = ~A[22]; + kt = A[ 9] | A[10]; + c0 = A[ 3] ^ kt; + kt = A[10] & A[16]; + c1 = A[ 9] ^ kt; + kt = A[16] | bnn; + c2 = A[10] ^ kt; + kt = A[22] | A[ 3]; + c3 = A[16] ^ kt; + kt = A[ 3] & A[ 9]; + c4 = A[22] ^ kt; + A[ 3] = c0; + A[ 9] = c1; + A[10] = c2; + A[16] = c3; + A[22] = c4; + bnn = ~A[19]; + kt = A[ 7] | A[13]; + c0 = A[ 1] ^ kt; + kt = A[13] & A[19]; + c1 = A[ 7] ^ kt; + kt = bnn & A[20]; + c2 = A[13] ^ kt; + kt = A[20] | A[ 1]; + c3 = bnn ^ kt; + kt = A[ 1] & A[ 7]; + c4 = A[20] ^ kt; + A[ 1] = c0; + A[ 7] = c1; + A[13] = 
c2; + A[19] = c3; + A[20] = c4; + bnn = ~A[17]; + kt = A[ 5] & A[11]; + c0 = A[ 4] ^ kt; + kt = A[11] | A[17]; + c1 = A[ 5] ^ kt; + kt = bnn | A[23]; + c2 = A[11] ^ kt; + kt = A[23] & A[ 4]; + c3 = bnn ^ kt; + kt = A[ 4] | A[ 5]; + c4 = A[23] ^ kt; + A[ 4] = c0; + A[ 5] = c1; + A[11] = c2; + A[17] = c3; + A[23] = c4; + bnn = ~A[ 8]; + kt = bnn & A[14]; + c0 = A[ 2] ^ kt; + kt = A[14] | A[15]; + c1 = bnn ^ kt; + kt = A[15] & A[21]; + c2 = A[14] ^ kt; + kt = A[21] | A[ 2]; + c3 = A[15] ^ kt; + kt = A[ 2] & A[ 8]; + c4 = A[21] ^ kt; + A[ 2] = c0; + A[ 8] = c1; + A[14] = c2; + A[15] = c3; + A[21] = c4; + A[ 0] = A[ 0] ^ RC[j + 0]; + + tt0 = A[ 6] ^ A[ 9]; + tt1 = A[ 7] ^ A[ 5]; + tt0 ^= A[ 8] ^ tt1; + tt0 = (tt0 << 1) | (tt0 >> 63); + tt2 = A[24] ^ A[22]; + tt3 = A[20] ^ A[23]; + tt0 ^= A[21]; + tt2 ^= tt3; + t0 = tt0 ^ tt2; + + tt0 = A[12] ^ A[10]; + tt1 = A[13] ^ A[11]; + tt0 ^= A[14] ^ tt1; + tt0 = (tt0 << 1) | (tt0 >> 63); + tt2 = A[ 0] ^ A[ 3]; + tt3 = A[ 1] ^ A[ 4]; + tt0 ^= A[ 2]; + tt2 ^= tt3; + t1 = tt0 ^ tt2; + + tt0 = A[18] ^ A[16]; + tt1 = A[19] ^ A[17]; + tt0 ^= A[15] ^ tt1; + tt0 = (tt0 << 1) | (tt0 >> 63); + tt2 = A[ 6] ^ A[ 9]; + tt3 = A[ 7] ^ A[ 5]; + tt0 ^= A[ 8]; + tt2 ^= tt3; + t2 = tt0 ^ tt2; + + tt0 = A[24] ^ A[22]; + tt1 = A[20] ^ A[23]; + tt0 ^= A[21] ^ tt1; + tt0 = (tt0 << 1) | (tt0 >> 63); + tt2 = A[12] ^ A[10]; + tt3 = A[13] ^ A[11]; + tt0 ^= A[14]; + tt2 ^= tt3; + t3 = tt0 ^ tt2; + + tt0 = A[ 0] ^ A[ 3]; + tt1 = A[ 1] ^ A[ 4]; + tt0 ^= A[ 2] ^ tt1; + tt0 = (tt0 << 1) | (tt0 >> 63); + tt2 = A[18] ^ A[16]; + tt3 = A[19] ^ A[17]; + tt0 ^= A[15]; + tt2 ^= tt3; + t4 = tt0 ^ tt2; + + A[ 0] = A[ 0] ^ t0; + A[ 3] = A[ 3] ^ t0; + A[ 1] = A[ 1] ^ t0; + A[ 4] = A[ 4] ^ t0; + A[ 2] = A[ 2] ^ t0; + A[ 6] = A[ 6] ^ t1; + A[ 9] = A[ 9] ^ t1; + A[ 7] = A[ 7] ^ t1; + A[ 5] = A[ 5] ^ t1; + A[ 8] = A[ 8] ^ t1; + A[12] = A[12] ^ t2; + A[10] = A[10] ^ t2; + A[13] = A[13] ^ t2; + A[11] = A[11] ^ t2; + A[14] = A[14] ^ t2; + A[18] = A[18] ^ t3; + A[16] = A[16] ^ t3; + A[19] = A[19] ^ t3; + A[17] = A[17] ^ t3; + A[15] = A[15] ^ t3; + A[24] = A[24] ^ t4; + A[22] = A[22] ^ t4; + A[20] = A[20] ^ t4; + A[23] = A[23] ^ t4; + A[21] = A[21] ^ t4; + A[ 3] = (A[ 3] << 36) | (A[ 3] >> (64 - 36)); + A[ 1] = (A[ 1] << 3) | (A[ 1] >> (64 - 3)); + A[ 4] = (A[ 4] << 41) | (A[ 4] >> (64 - 41)); + A[ 2] = (A[ 2] << 18) | (A[ 2] >> (64 - 18)); + A[ 6] = (A[ 6] << 1) | (A[ 6] >> (64 - 1)); + A[ 9] = (A[ 9] << 44) | (A[ 9] >> (64 - 44)); + A[ 7] = (A[ 7] << 10) | (A[ 7] >> (64 - 10)); + A[ 5] = (A[ 5] << 45) | (A[ 5] >> (64 - 45)); + A[ 8] = (A[ 8] << 2) | (A[ 8] >> (64 - 2)); + A[12] = (A[12] << 62) | (A[12] >> (64 - 62)); + A[10] = (A[10] << 6) | (A[10] >> (64 - 6)); + A[13] = (A[13] << 43) | (A[13] >> (64 - 43)); + A[11] = (A[11] << 15) | (A[11] >> (64 - 15)); + A[14] = (A[14] << 61) | (A[14] >> (64 - 61)); + A[18] = (A[18] << 28) | (A[18] >> (64 - 28)); + A[16] = (A[16] << 55) | (A[16] >> (64 - 55)); + A[19] = (A[19] << 25) | (A[19] >> (64 - 25)); + A[17] = (A[17] << 21) | (A[17] >> (64 - 21)); + A[15] = (A[15] << 56) | (A[15] >> (64 - 56)); + A[24] = (A[24] << 27) | (A[24] >> (64 - 27)); + A[22] = (A[22] << 20) | (A[22] >> (64 - 20)); + A[20] = (A[20] << 39) | (A[20] >> (64 - 39)); + A[23] = (A[23] << 8) | (A[23] >> (64 - 8)); + A[21] = (A[21] << 14) | (A[21] >> (64 - 14)); + bnn = ~A[13]; + kt = A[ 9] | A[13]; + c0 = A[ 0] ^ kt; + kt = bnn | A[17]; + c1 = A[ 9] ^ kt; + kt = A[17] & A[21]; + c2 = A[13] ^ kt; + kt = A[21] | A[ 0]; + c3 = A[17] ^ kt; + kt = A[ 0] & A[ 9]; + c4 = A[21] ^ kt; + A[ 0] = c0; + 
A[ 9] = c1; + A[13] = c2; + A[17] = c3; + A[21] = c4; + bnn = ~A[14]; + kt = A[22] | A[ 1]; + c0 = A[18] ^ kt; + kt = A[ 1] & A[ 5]; + c1 = A[22] ^ kt; + kt = A[ 5] | bnn; + c2 = A[ 1] ^ kt; + kt = A[14] | A[18]; + c3 = A[ 5] ^ kt; + kt = A[18] & A[22]; + c4 = A[14] ^ kt; + A[18] = c0; + A[22] = c1; + A[ 1] = c2; + A[ 5] = c3; + A[14] = c4; + bnn = ~A[23]; + kt = A[10] | A[19]; + c0 = A[ 6] ^ kt; + kt = A[19] & A[23]; + c1 = A[10] ^ kt; + kt = bnn & A[ 2]; + c2 = A[19] ^ kt; + kt = A[ 2] | A[ 6]; + c3 = bnn ^ kt; + kt = A[ 6] & A[10]; + c4 = A[ 2] ^ kt; + A[ 6] = c0; + A[10] = c1; + A[19] = c2; + A[23] = c3; + A[ 2] = c4; + bnn = ~A[11]; + kt = A[ 3] & A[ 7]; + c0 = A[24] ^ kt; + kt = A[ 7] | A[11]; + c1 = A[ 3] ^ kt; + kt = bnn | A[15]; + c2 = A[ 7] ^ kt; + kt = A[15] & A[24]; + c3 = bnn ^ kt; + kt = A[24] | A[ 3]; + c4 = A[15] ^ kt; + A[24] = c0; + A[ 3] = c1; + A[ 7] = c2; + A[11] = c3; + A[15] = c4; + bnn = ~A[16]; + kt = bnn & A[20]; + c0 = A[12] ^ kt; + kt = A[20] | A[ 4]; + c1 = bnn ^ kt; + kt = A[ 4] & A[ 8]; + c2 = A[20] ^ kt; + kt = A[ 8] | A[12]; + c3 = A[ 4] ^ kt; + kt = A[12] & A[16]; + c4 = A[ 8] ^ kt; + A[12] = c0; + A[16] = c1; + A[20] = c2; + A[ 4] = c3; + A[ 8] = c4; + A[ 0] = A[ 0] ^ RC[j + 1]; + t = A[ 5]; + A[ 5] = A[18]; + A[18] = A[11]; + A[11] = A[10]; + A[10] = A[ 6]; + A[ 6] = A[22]; + A[22] = A[20]; + A[20] = A[12]; + A[12] = A[19]; + A[19] = A[15]; + A[15] = A[24]; + A[24] = A[ 8]; + A[ 8] = t; + t = A[ 1]; + A[ 1] = A[ 9]; + A[ 9] = A[14]; + A[14] = A[ 2]; + A[ 2] = A[13]; + A[13] = A[23]; + A[23] = A[ 4]; + A[ 4] = A[21]; + A[21] = A[16]; + A[16] = A[ 3]; + A[ 3] = A[17]; + A[17] = A[ 7]; + A[ 7] = t; + } +} + +/* see bearssl_kdf.h */ +void +br_shake_init(br_shake_context *sc, int security_level) +{ + sc->rate = 200 - (size_t)(security_level >> 2); + sc->dptr = 0; + memset(sc->A, 0, sizeof sc->A); + sc->A[ 1] = ~(uint64_t)0; + sc->A[ 2] = ~(uint64_t)0; + sc->A[ 8] = ~(uint64_t)0; + sc->A[12] = ~(uint64_t)0; + sc->A[17] = ~(uint64_t)0; + sc->A[20] = ~(uint64_t)0; +} + +/* see bearssl_kdf.h */ +void +br_shake_inject(br_shake_context *sc, const void *data, size_t len) +{ + const unsigned char *buf; + size_t rate, dptr; + + buf = data; + rate = sc->rate; + dptr = sc->dptr; + while (len > 0) { + size_t clen; + + clen = rate - dptr; + if (clen > len) { + clen = len; + } + memcpy(sc->dbuf + dptr, buf, clen); + dptr += clen; + buf += clen; + len -= clen; + if (dptr == rate) { + xor_block(sc->A, sc->dbuf, rate); + process_block(sc->A); + dptr = 0; + } + } + sc->dptr = dptr; +} + +/* see bearssl_kdf.h */ +void +br_shake_flip(br_shake_context *sc) +{ + /* + * We apply padding and pre-XOR the value into the state. We + * set dptr to the end of the buffer, so that the first call to + * br_shake_produce() will process the block.
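+ *
+ * (Editorial note, not part of the original patch: the constants
+ * below implement the SHAKE padding. 0x1F carries the four domain
+ * separation bits 1111 plus the first bit of the pad10*1 rule, and
+ * 0x80 is the final pad bit in the last byte of the block; when
+ * only one byte remains, both collapse into 0x9F = 0x1F | 0x80.)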
+ */ + if ((sc->dptr + 1) == sc->rate) { + sc->dbuf[sc->dptr ++] = 0x9F; + } else { + sc->dbuf[sc->dptr ++] = 0x1F; + memset(sc->dbuf + sc->dptr, 0x00, sc->rate - sc->dptr - 1); + sc->dbuf[sc->rate - 1] = 0x80; + sc->dptr = sc->rate; + } + xor_block(sc->A, sc->dbuf, sc->rate); +} + +/* see bearssl_kdf.h */ +void +br_shake_produce(br_shake_context *sc, void *out, size_t len) +{ + unsigned char *buf; + size_t dptr, rate; + + buf = out; + dptr = sc->dptr; + rate = sc->rate; + while (len > 0) { + size_t clen; + + if (dptr == rate) { + unsigned char *dbuf; + uint64_t *A; + + A = sc->A; + dbuf = sc->dbuf; + process_block(A); + br_enc64le(dbuf + 0, A[ 0]); + br_enc64le(dbuf + 8, ~A[ 1]); + br_enc64le(dbuf + 16, ~A[ 2]); + br_enc64le(dbuf + 24, A[ 3]); + br_enc64le(dbuf + 32, A[ 4]); + br_enc64le(dbuf + 40, A[ 5]); + br_enc64le(dbuf + 48, A[ 6]); + br_enc64le(dbuf + 56, A[ 7]); + br_enc64le(dbuf + 64, ~A[ 8]); + br_enc64le(dbuf + 72, A[ 9]); + br_enc64le(dbuf + 80, A[10]); + br_enc64le(dbuf + 88, A[11]); + br_enc64le(dbuf + 96, ~A[12]); + br_enc64le(dbuf + 104, A[13]); + br_enc64le(dbuf + 112, A[14]); + br_enc64le(dbuf + 120, A[15]); + br_enc64le(dbuf + 128, A[16]); + br_enc64le(dbuf + 136, ~A[17]); + br_enc64le(dbuf + 144, A[18]); + br_enc64le(dbuf + 152, A[19]); + br_enc64le(dbuf + 160, ~A[20]); + br_enc64le(dbuf + 168, A[21]); + br_enc64le(dbuf + 176, A[22]); + br_enc64le(dbuf + 184, A[23]); + br_enc64le(dbuf + 192, A[24]); + dptr = 0; + } + clen = rate - dptr; + if (clen > len) { + clen = len; + } + memcpy(buf, sc->dbuf + dptr, clen); + dptr += clen; + buf += clen; + len -= clen; + } + sc->dptr = dptr; +} diff --git a/src/bearssl/src/rand/sysrng.c b/src/bearssl/src/rand/sysrng.c index bec06be..5a92114 100644 --- a/src/bearssl/src/rand/sysrng.c +++ b/src/bearssl/src/rand/sysrng.c @@ -25,6 +25,10 @@ #define BR_ENABLE_INTRINSICS 1 #include "inner.h" +#if BR_USE_GETENTROPY +#include +#endif + #if BR_USE_URANDOM #include #include @@ -38,6 +42,9 @@ #pragma comment(lib, "advapi32") #endif +/* + * Seeder that uses the RDRAND opcodes (on x86 CPU). + */ #if BR_RDRAND BR_TARGETS_X86_UP BR_TARGET("rdrnd") @@ -57,9 +64,24 @@ seeder_rdrand(const br_prng_class **ctx) * * Intel recommends trying at least 10 times in case of * failure. + * + * AMD bug: there are reports that some AMD processors + * have a bug that makes them fail silently after a + * suspend/resume cycle, in which case RDRAND will report + * a success but always return 0xFFFFFFFF. + * see: https://bugzilla.kernel.org/show_bug.cgi?id=85911 + * + * As a mitigation, if the 32-bit value is 0 or -1, then + * it is considered a failure and tried again. This should + * reliably detect the buggy case, at least. This also + * implies that the selected seed values can never be + * 0x00000000 or 0xFFFFFFFF, which is not a problem since + * we are generating a seed for a PRNG, and we overdo it + * a bit (we generate 32 bytes of randomness, and 256 bits + * of entropy are really overkill). */ for (j = 0; j < 10; j ++) { - if (_rdrand32_step(&x)) { + if (_rdrand32_step(&x) && x != 0 && x != (uint32_t)-1) { goto next_word; } } @@ -80,9 +102,11 @@ rdrand_supported(void) */ return br_cpuid(0, 0, 0x40000000, 0); } - #endif +/* + * Seeder that uses /dev/urandom (on Unix-like systems). + */ #if BR_USE_URANDOM static int seeder_urandom(const br_prng_class **ctx) @@ -116,6 +140,32 @@ seeder_urandom(const br_prng_class **ctx) } #endif +/* + * Seeder that uses getentropy() (backed by getrandom() on some systems, + * e.g. Linux). 
On failure, it will use the /dev/urandom seeder (if + * enabled). + */ +#if BR_USE_GETENTROPY +static int +seeder_getentropy(const br_prng_class **ctx) +{ + unsigned char tmp[32]; + + if (getentropy(tmp, sizeof tmp) == 0) { + (*ctx)->update(ctx, tmp, sizeof tmp); + return 1; + } +#if BR_USE_URANDOM + return seeder_urandom(ctx); +#else + return 0; +#endif +} +#endif + +/* + * Seeder that uses CryptGenRandom() (on Windows). + */ #if BR_USE_WIN32_RAND static int seeder_win32(const br_prng_class **ctx) @@ -139,6 +189,29 @@ seeder_win32(const br_prng_class **ctx) } #endif +/* + * An aggregate seeder that uses RDRAND, and falls back to an OS-provided + * source if RDRAND fails. + */ +#if BR_RDRAND && (BR_USE_GETENTROPY || BR_USE_URANDOM || BR_USE_WIN32_RAND) +static int +seeder_rdrand_with_fallback(const br_prng_class **ctx) +{ + if (!seeder_rdrand(ctx)) { +#if BR_USE_GETENTROPY + return seeder_getentropy(ctx); +#elif BR_USE_URANDOM + return seeder_urandom(ctx); +#elif BR_USE_WIN32_RAND + return seeder_win32(ctx); +#else +#error "macro selection has gone wrong" +#endif + } + return 1; +} +#endif + /* see bearssl_rand.h */ br_prng_seeder br_prng_seeder_system(const char **name) @@ -148,10 +221,19 @@ br_prng_seeder_system(const char **name) if (name != NULL) { *name = "rdrand"; } +#if BR_USE_GETENTROPY || BR_USE_URANDOM || BR_USE_WIN32_RAND + return &seeder_rdrand_with_fallback; +#else return &seeder_rdrand; +#endif } #endif -#if BR_USE_URANDOM +#if BR_USE_GETENTROPY + if (name != NULL) { + *name = "getentropy"; + } + return &seeder_getentropy; +#elif BR_USE_URANDOM if (name != NULL) { *name = "urandom"; } @@ -161,9 +243,10 @@ br_prng_seeder_system(const char **name) *name = "win32"; } return &seeder_win32; -#endif +#else if (name != NULL) { *name = "none"; } return 0; +#endif } diff --git a/src/bearssl/src/rsa/rsa_default_pss_sign.c b/src/bearssl/src/rsa/rsa_default_pss_sign.c new file mode 100644 index 0000000..ce4f3e0 --- /dev/null +++ b/src/bearssl/src/rsa/rsa_default_pss_sign.c @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2018 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "inner.h" + +/* see bearssl_rsa.h */ +br_rsa_pss_sign +br_rsa_pss_sign_get_default(void) +{ +#if BR_INT128 || BR_UMUL128 + return &br_rsa_i62_pss_sign; +#elif BR_LOMUL + return &br_rsa_i15_pss_sign; +#else + return &br_rsa_i31_pss_sign; +#endif +} diff --git a/src/bearssl/src/rsa/rsa_default_pss_vrfy.c b/src/bearssl/src/rsa/rsa_default_pss_vrfy.c new file mode 100644 index 0000000..e3a9ad9 --- /dev/null +++ b/src/bearssl/src/rsa/rsa_default_pss_vrfy.c @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2018 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "inner.h" + +/* see bearssl_rsa.h */ +br_rsa_pss_vrfy +br_rsa_pss_vrfy_get_default(void) +{ +#if BR_INT128 || BR_UMUL128 + return &br_rsa_i62_pss_vrfy; +#elif BR_LOMUL + return &br_rsa_i15_pss_vrfy; +#else + return &br_rsa_i31_pss_vrfy; +#endif +} diff --git a/src/bearssl/src/rsa/rsa_i15_keygen.c b/src/bearssl/src/rsa/rsa_i15_keygen.c index 1c011fe..e8da419 100644 --- a/src/bearssl/src/rsa/rsa_i15_keygen.c +++ b/src/bearssl/src/rsa/rsa_i15_keygen.c @@ -318,9 +318,9 @@ mkprime(const br_prng_class **rng, uint16_t *x, uint32_t esize, continue; } if ((pubexp == 3 && m3 == 1) - || (pubexp == 5 && m5 == 5) - || (pubexp == 7 && m5 == 7) - || (pubexp == 11 && m5 == 11)) + || (pubexp == 5 && m5 == 1) + || (pubexp == 7 && m7 == 1) + || (pubexp == 11 && m11 == 1)) { continue; } diff --git a/src/bearssl/src/rsa/rsa_i15_modulus.c b/src/bearssl/src/rsa/rsa_i15_modulus.c index d61c794..16458c3 100644 --- a/src/bearssl/src/rsa/rsa_i15_modulus.c +++ b/src/bearssl/src/rsa/rsa_i15_modulus.c @@ -28,7 +28,7 @@ size_t br_rsa_i15_compute_modulus(void *n, const br_rsa_private_key *sk) { - uint16_t tmp[2 * ((BR_MAX_RSA_SIZE + 14) / 15) + 5]; + uint16_t tmp[4 * (((BR_MAX_RSA_SIZE / 2) + 14) / 15) + 5]; uint16_t *t, *p, *q; const unsigned char *pbuf, *qbuf; size_t nlen, plen, qlen, tlen; diff --git a/src/bearssl/src/rsa/rsa_i15_pss_sign.c b/src/bearssl/src/rsa/rsa_i15_pss_sign.c new file mode 100644 index 0000000..dd9385b --- /dev/null +++ b/src/bearssl/src/rsa/rsa_i15_pss_sign.c @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2018 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * 
permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "inner.h" + +/* see bearssl_rsa.h */ +uint32_t +br_rsa_i15_pss_sign(const br_prng_class **rng, + const br_hash_class *hf_data, const br_hash_class *hf_mgf1, + const unsigned char *hash, size_t salt_len, + const br_rsa_private_key *sk, unsigned char *x) +{ + if (!br_rsa_pss_sig_pad(rng, hf_data, hf_mgf1, hash, + salt_len, sk->n_bitlen, x)) + { + return 0; + } + return br_rsa_i15_private(x, sk); +} diff --git a/src/bearssl/src/rsa/rsa_i15_pss_vrfy.c b/src/bearssl/src/rsa/rsa_i15_pss_vrfy.c new file mode 100644 index 0000000..7d9f2cb --- /dev/null +++ b/src/bearssl/src/rsa/rsa_i15_pss_vrfy.c @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2018 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "inner.h" + +/* see bearssl_rsa.h */ +uint32_t +br_rsa_i15_pss_vrfy(const unsigned char *x, size_t xlen, + const br_hash_class *hf_data, const br_hash_class *hf_mgf1, + const void *hash, size_t salt_len, const br_rsa_public_key *pk) +{ + unsigned char sig[BR_MAX_RSA_SIZE >> 3]; + + if (xlen > (sizeof sig)) { + return 0; + } + memcpy(sig, x, xlen); + if (!br_rsa_i15_public(sig, xlen, pk)) { + return 0; + } + return br_rsa_pss_sig_unpad(hf_data, hf_mgf1, + hash, salt_len, pk, sig); +} diff --git a/src/bearssl/src/rsa/rsa_i31_keygen_inner.c b/src/bearssl/src/rsa/rsa_i31_keygen_inner.c index 9ec881b..98df445 100644 --- a/src/bearssl/src/rsa/rsa_i31_keygen_inner.c +++ b/src/bearssl/src/rsa/rsa_i31_keygen_inner.c @@ -340,9 +340,9 @@ mkprime(const br_prng_class **rng, uint32_t *x, uint32_t esize, continue; } if ((pubexp == 3 && m3 == 1) - || (pubexp == 5 && m5 == 5) - || (pubexp == 7 && m5 == 7) - || (pubexp == 11 && m5 == 11)) + || (pubexp == 5 && m5 == 1) + || (pubexp == 7 && m7 == 1) + || (pubexp == 11 && m11 == 1)) { continue; } diff --git a/src/bearssl/src/rsa/rsa_i31_modulus.c b/src/bearssl/src/rsa/rsa_i31_modulus.c index c469cf3..f5f997f 100644 --- a/src/bearssl/src/rsa/rsa_i31_modulus.c +++ b/src/bearssl/src/rsa/rsa_i31_modulus.c @@ -28,7 +28,7 @@ size_t br_rsa_i31_compute_modulus(void *n, const br_rsa_private_key *sk) { - uint32_t tmp[2 * ((BR_MAX_RSA_SIZE + 30) / 31) + 5]; + uint32_t tmp[4 * (((BR_MAX_RSA_SIZE / 2) + 30) / 31) + 5]; uint32_t *t, *p, *q; const unsigned char *pbuf, *qbuf; size_t nlen, plen, qlen, tlen; diff --git a/src/bearssl/src/rsa/rsa_i31_pss_sign.c b/src/bearssl/src/rsa/rsa_i31_pss_sign.c new file mode 100644 index 0000000..b06f3e2 --- /dev/null +++ b/src/bearssl/src/rsa/rsa_i31_pss_sign.c @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2018 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "inner.h" + +/* see bearssl_rsa.h */ +uint32_t +br_rsa_i31_pss_sign(const br_prng_class **rng, + const br_hash_class *hf_data, const br_hash_class *hf_mgf1, + const unsigned char *hash, size_t salt_len, + const br_rsa_private_key *sk, unsigned char *x) +{ + if (!br_rsa_pss_sig_pad(rng, hf_data, hf_mgf1, hash, + salt_len, sk->n_bitlen, x)) + { + return 0; + } + return br_rsa_i31_private(x, sk); +} diff --git a/src/bearssl/src/rsa/rsa_i31_pss_vrfy.c b/src/bearssl/src/rsa/rsa_i31_pss_vrfy.c new file mode 100644 index 0000000..77a9b28 --- /dev/null +++ b/src/bearssl/src/rsa/rsa_i31_pss_vrfy.c @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2018 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "inner.h" + +/* see bearssl_rsa.h */ +uint32_t +br_rsa_i31_pss_vrfy(const unsigned char *x, size_t xlen, + const br_hash_class *hf_data, const br_hash_class *hf_mgf1, + const void *hash, size_t salt_len, const br_rsa_public_key *pk) +{ + unsigned char sig[BR_MAX_RSA_SIZE >> 3]; + + if (xlen > (sizeof sig)) { + return 0; + } + memcpy(sig, x, xlen); + if (!br_rsa_i31_public(sig, xlen, pk)) { + return 0; + } + return br_rsa_pss_sig_unpad(hf_data, hf_mgf1, + hash, salt_len, pk, sig); +} diff --git a/src/bearssl/src/rsa/rsa_i32_pss_sign.c b/src/bearssl/src/rsa/rsa_i32_pss_sign.c new file mode 100644 index 0000000..0f72f92 --- /dev/null +++ b/src/bearssl/src/rsa/rsa_i32_pss_sign.c @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2018 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "inner.h" + +/* see bearssl_rsa.h */ +uint32_t +br_rsa_i32_pss_sign(const br_prng_class **rng, + const br_hash_class *hf_data, const br_hash_class *hf_mgf1, + const unsigned char *hash, size_t salt_len, + const br_rsa_private_key *sk, unsigned char *x) +{ + if (!br_rsa_pss_sig_pad(rng, hf_data, hf_mgf1, hash, + salt_len, sk->n_bitlen, x)) + { + return 0; + } + return br_rsa_i32_private(x, sk); +} diff --git a/src/bearssl/src/rsa/rsa_i32_pss_vrfy.c b/src/bearssl/src/rsa/rsa_i32_pss_vrfy.c new file mode 100644 index 0000000..2e70d23 --- /dev/null +++ b/src/bearssl/src/rsa/rsa_i32_pss_vrfy.c @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2018 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "inner.h" + +/* see bearssl_rsa.h */ +uint32_t +br_rsa_i32_pss_vrfy(const unsigned char *x, size_t xlen, + const br_hash_class *hf_data, const br_hash_class *hf_mgf1, + const void *hash, size_t salt_len, const br_rsa_public_key *pk) +{ + unsigned char sig[BR_MAX_RSA_SIZE >> 3]; + + if (xlen > (sizeof sig)) { + return 0; + } + memcpy(sig, x, xlen); + if (!br_rsa_i32_public(sig, xlen, pk)) { + return 0; + } + return br_rsa_pss_sig_unpad(hf_data, hf_mgf1, + hash, salt_len, pk, sig); +} diff --git a/src/bearssl/src/rsa/rsa_i62_pss_sign.c b/src/bearssl/src/rsa/rsa_i62_pss_sign.c new file mode 100644 index 0000000..7232f6d --- /dev/null +++ b/src/bearssl/src/rsa/rsa_i62_pss_sign.c @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2018 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "inner.h" + +#if BR_INT128 || BR_UMUL128 + +/* see bearssl_rsa.h */ +uint32_t +br_rsa_i62_pss_sign(const br_prng_class **rng, + const br_hash_class *hf_data, const br_hash_class *hf_mgf1, + const unsigned char *hash, size_t salt_len, + const br_rsa_private_key *sk, unsigned char *x) +{ + if (!br_rsa_pss_sig_pad(rng, hf_data, hf_mgf1, hash, + salt_len, sk->n_bitlen, x)) + { + return 0; + } + return br_rsa_i62_private(x, sk); +} + +/* see bearssl_rsa.h */ +br_rsa_pss_sign +br_rsa_i62_pss_sign_get(void) +{ + return &br_rsa_i62_pss_sign; +} + +#else + +/* see bearssl_rsa.h */ +br_rsa_pss_sign +br_rsa_i62_pss_sign_get(void) +{ + return 0; +} + +#endif diff --git a/src/bearssl/src/rsa/rsa_i62_pss_vrfy.c b/src/bearssl/src/rsa/rsa_i62_pss_vrfy.c new file mode 100644 index 0000000..e726e82 --- /dev/null +++ b/src/bearssl/src/rsa/rsa_i62_pss_vrfy.c @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2018 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "inner.h" + +#if BR_INT128 || BR_UMUL128 + +/* see bearssl_rsa.h */ +uint32_t +br_rsa_i62_pss_vrfy(const unsigned char *x, size_t xlen, + const br_hash_class *hf_data, const br_hash_class *hf_mgf1, + const void *hash, size_t salt_len, const br_rsa_public_key *pk) +{ + unsigned char sig[BR_MAX_RSA_SIZE >> 3]; + + if (xlen > (sizeof sig)) { + return 0; + } + memcpy(sig, x, xlen); + if (!br_rsa_i62_public(sig, xlen, pk)) { + return 0; + } + return br_rsa_pss_sig_unpad(hf_data, hf_mgf1, + hash, salt_len, pk, sig); +} + +/* see bearssl_rsa.h */ +br_rsa_pss_vrfy +br_rsa_i62_pss_vrfy_get(void) +{ + return &br_rsa_i62_pss_vrfy; +} + +#else + +/* see bearssl_rsa.h */ +br_rsa_pss_vrfy +br_rsa_i62_pss_vrfy_get(void) +{ + return 0; +} + +#endif diff --git a/src/bearssl/src/rsa/rsa_pss_sig_pad.c b/src/bearssl/src/rsa/rsa_pss_sig_pad.c new file mode 100644 index 0000000..13e9027 --- /dev/null +++ b/src/bearssl/src/rsa/rsa_pss_sig_pad.c @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2018 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "inner.h" + +/* see inner.h */ +uint32_t +br_rsa_pss_sig_pad(const br_prng_class **rng, + const br_hash_class *hf_data, const br_hash_class *hf_mgf1, + const unsigned char *hash, size_t salt_len, + uint32_t n_bitlen, unsigned char *x) +{ + size_t xlen, hash_len; + br_hash_compat_context hc; + unsigned char *salt, *seed; + + hash_len = br_digest_size(hf_data); + + /* + * The padded string is one bit smaller than the modulus; + * notably, if the modulus length is equal to 1 modulo 8, then + * the padded string will be one _byte_ smaller, and the first + * byte will be set to 0. We apply these transformations here. + */ + n_bitlen --; + if ((n_bitlen & 7) == 0) { + *x ++ = 0; + } + xlen = (n_bitlen + 7) >> 3; + + /* + * Check that the modulus is large enough for the hash value + * length combined with the intended salt length. + */ + if (hash_len > xlen || salt_len > xlen + || (hash_len + salt_len + 2) > xlen) + { + return 0; + } + + /* + * Produce a random salt. + */ + salt = x + xlen - hash_len - salt_len - 1; + if (salt_len != 0) { + (*rng)->generate(rng, salt, salt_len); + } + + /* + * Compute the seed for MGF1. 
+ */ + seed = x + xlen - hash_len - 1; + hf_data->init(&hc.vtable); + memset(seed, 0, 8); + hf_data->update(&hc.vtable, seed, 8); + hf_data->update(&hc.vtable, hash, hash_len); + hf_data->update(&hc.vtable, salt, salt_len); + hf_data->out(&hc.vtable, seed); + + /* + * Prepare string PS (padded salt). The salt is already at the + * right place. + */ + memset(x, 0, xlen - salt_len - hash_len - 2); + x[xlen - salt_len - hash_len - 2] = 0x01; + + /* + * Generate the mask and XOR it into PS. + */ + br_mgf1_xor(x, xlen - hash_len - 1, hf_mgf1, seed, hash_len); + + /* + * Clear the top bits to ensure the value is lower than the + * modulus. + */ + x[0] &= 0xFF >> (((uint32_t)xlen << 3) - n_bitlen); + + /* + * The seed (H) is already in the right place. We just set the + * last byte. + */ + x[xlen - 1] = 0xBC; + + return 1; +} diff --git a/src/bearssl/src/rsa/rsa_pss_sig_unpad.c b/src/bearssl/src/rsa/rsa_pss_sig_unpad.c new file mode 100644 index 0000000..a9f8ca3 --- /dev/null +++ b/src/bearssl/src/rsa/rsa_pss_sig_unpad.c @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2018 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "inner.h" + +/* see inner.h */ +uint32_t +br_rsa_pss_sig_unpad(const br_hash_class *hf_data, + const br_hash_class *hf_mgf1, + const unsigned char *hash, size_t salt_len, + const br_rsa_public_key *pk, unsigned char *x) +{ + size_t u, xlen, hash_len; + br_hash_compat_context hc; + unsigned char *seed, *salt; + unsigned char tmp[64]; + uint32_t r, n_bitlen; + + hash_len = br_digest_size(hf_data); + + /* + * Value r will be set to a non-zero value if any test fails. + */ + r = 0; + + /* + * The value bit length (as an integer) must be strictly less than + * that of the modulus. + */ + for (u = 0; u < pk->nlen; u ++) { + if (pk->n[u] != 0) { + break; + } + } + if (u == pk->nlen) { + return 0; + } + n_bitlen = BIT_LENGTH(pk->n[u]) + ((uint32_t)(pk->nlen - u - 1) << 3); + n_bitlen --; + if ((n_bitlen & 7) == 0) { + r |= *x ++; + } else { + r |= x[0] & (0xFF << (n_bitlen & 7)); + } + xlen = (n_bitlen + 7) >> 3; + + /* + * Check that the modulus is large enough for the hash value + * length combined with the intended salt length. + */ + if (hash_len > xlen || salt_len > xlen + || (hash_len + salt_len + 2) > xlen) + { + return 0; + } + + /* + * Check value of rightmost byte. + */ + r |= x[xlen - 1] ^ 0xBC; + + /* + * Generate the mask and XOR it into the first bytes to reveal PS; + * we must also mask out the leading bits.
+ */ + seed = x + xlen - hash_len - 1; + br_mgf1_xor(x, xlen - hash_len - 1, hf_mgf1, seed, hash_len); + if ((n_bitlen & 7) != 0) { + x[0] &= 0xFF >> (8 - (n_bitlen & 7)); + } + + /* + * Check that all padding bytes have the expected value. + */ + for (u = 0; u < (xlen - hash_len - salt_len - 2); u ++) { + r |= x[u]; + } + r |= x[xlen - hash_len - salt_len - 2] ^ 0x01; + + /* + * Recompute H. + */ + salt = x + xlen - hash_len - salt_len - 1; + hf_data->init(&hc.vtable); + memset(tmp, 0, 8); + hf_data->update(&hc.vtable, tmp, 8); + hf_data->update(&hc.vtable, hash, hash_len); + hf_data->update(&hc.vtable, salt, salt_len); + hf_data->out(&hc.vtable, tmp); + + /* + * Check that the recomputed H value matches the one appearing + * in the string. + */ + for (u = 0; u < hash_len; u ++) { + r |= tmp[u] ^ x[(xlen - salt_len - 1) + u]; + } + + return EQ0(r); +} diff --git a/src/bearssl/src/ssl/ssl_client_full.c b/src/bearssl/src/ssl/ssl_client_full.c index bc34e92..fd35b3c 100644 --- a/src/bearssl/src/ssl/ssl_client_full.c +++ b/src/bearssl/src/ssl/ssl_client_full.c @@ -119,7 +119,6 @@ br_ssl_client_init_full(br_ssl_client_context *cc, * to TLS-1.2 (inclusive). */ br_ssl_client_zero(cc); - memset(xc, 0, sizeof *xc); br_ssl_engine_set_versions(&cc->eng, BR_TLS10, BR_TLS12); /* diff --git a/src/bearssl/src/ssl/ssl_engine.c b/src/bearssl/src/ssl/ssl_engine.c index f4ffe18..f59fe1a 100644 --- a/src/bearssl/src/ssl/ssl_engine.c +++ b/src/bearssl/src/ssl/ssl_engine.c @@ -1232,6 +1232,21 @@ void br_ssl_engine_close(br_ssl_engine_context *cc) { if (!br_ssl_engine_closed(cc)) { + /* + * If we are not already closed, then we need to + * initiate the closure. Once closing, any incoming + * application data is discarded; we should also discard + * application data which is already there but has not + * been acknowledged by the application yet (this mimics + * usual semantics on BSD sockets: you cannot read() + * once you called close(), even if there was some + * unread data already buffered). + */ + size_t len; + + if (br_ssl_engine_recvapp_buf(cc, &len) != NULL && len != 0) { + br_ssl_engine_recvapp_ack(cc, len); + } jump_handshake(cc, 1); } } diff --git a/src/bearssl/src/ssl/ssl_io.c b/src/bearssl/src/ssl/ssl_io.c index e929228..1952615 100644 --- a/src/bearssl/src/ssl/ssl_io.c +++ b/src/bearssl/src/ssl/ssl_io.c @@ -48,8 +48,8 @@ br_sslio_init(br_sslio_context *ctx, * combination of both (the combination matches either). When a match is * achieved, this function returns 0. On error, it returns -1. */ -int -br_run_until(br_sslio_context *ctx, unsigned target) +static int +run_until(br_sslio_context *ctx, unsigned target) { for (;;) { unsigned state; @@ -152,7 +152,7 @@ br_sslio_read(br_sslio_context *ctx, void *dst, size_t len) if (len == 0) { return 0; } - if (br_run_until(ctx, BR_SSL_RECVAPP) < 0) { + if (run_until(ctx, BR_SSL_RECVAPP) < 0) { return -1; } buf = br_ssl_engine_recvapp_buf(ctx->engine, &alen); @@ -194,7 +194,7 @@ br_sslio_write(br_sslio_context *ctx, const void *src, size_t len) if (len == 0) { return 0; } - if (br_run_until(ctx, BR_SSL_SENDAPP) < 0) { + if (run_until(ctx, BR_SSL_SENDAPP) < 0) { return -1; } buf = br_ssl_engine_sendapp_buf(ctx->engine, &alen); @@ -238,7 +238,7 @@ br_sslio_flush(br_sslio_context *ctx) * first sent down the wire before considering anything else. 
*/ br_ssl_engine_flush(ctx->engine, 0); - return br_run_until(ctx, BR_SSL_SENDAPP | BR_SSL_RECVAPP); + return run_until(ctx, BR_SSL_SENDAPP | BR_SSL_RECVAPP); } /* see bearssl_ssl.h */ @@ -252,7 +252,7 @@ br_sslio_close(br_sslio_context *ctx) */ size_t len; - br_run_until(ctx, BR_SSL_RECVAPP); + run_until(ctx, BR_SSL_RECVAPP); if (br_ssl_engine_recvapp_buf(ctx->engine, &len) != NULL) { br_ssl_engine_recvapp_ack(ctx->engine, len); } diff --git a/src/bearssl/src/x509/asn1.t0 b/src/bearssl/src/x509/asn1.t0 index ba59252..c329514 100644 --- a/src/bearssl/src/x509/asn1.t0 +++ b/src/bearssl/src/x509/asn1.t0 @@ -480,7 +480,7 @@ OID: id-at-commonName 2.5.4.3 \ 66 noncharacters, and also the surrogate range; this function does NOT \ check that the value is in the 0..10FFFF range. : valid-unicode? ( val -- bool ) - dup 0xFDD0 0xFEDF between? if drop 0 ret then + dup 0xFDD0 0xFDEF between? if drop 0 ret then dup 0xD800 0xDFFF between? if drop 0 ret then 0xFFFF and 0xFFFE < ; diff --git a/src/bearssl/src/x509/skey_decoder.c b/src/bearssl/src/x509/skey_decoder.c index f4e43e7..9e285d7 100644 --- a/src/bearssl/src/x509/skey_decoder.c +++ b/src/bearssl/src/x509/skey_decoder.c @@ -155,7 +155,7 @@ static const unsigned char t0_codeblock[] = { 0x02, 0x06, 0x1E, 0x00, 0x00, 0x19, 0x19, 0x00, 0x00, 0x01, 0x0B, 0x00, 0x00, 0x01, 0x00, 0x20, 0x14, 0x06, 0x08, 0x01, 0x01, 0x21, 0x20, 0x22, 0x20, 0x04, 0x75, 0x13, 0x00, 0x00, 0x01, - T0_INT2(3 * BR_X509_BUFSIZE_KEY), 0x00, 0x01, 0x01, 0x87, 0xFF, 0xFF, + T0_INT2(3 * BR_X509_BUFSIZE_SIG), 0x00, 0x01, 0x01, 0x87, 0xFF, 0xFF, 0x7F, 0x54, 0x57, 0x01, 0x02, 0x3E, 0x55, 0x01, 0x01, 0x0E, 0x06, 0x02, 0x30, 0x16, 0x57, 0x01, 0x02, 0x19, 0x0D, 0x06, 0x06, 0x13, 0x3B, 0x44, 0x32, 0x04, 0x1C, 0x01, 0x04, 0x19, 0x0D, 0x06, 0x08, 0x13, 0x3B, 0x01, diff --git a/src/bearssl/src/x509/skey_decoder.t0 b/src/bearssl/src/x509/skey_decoder.t0 index 5b59421..f00e614 100644 --- a/src/bearssl/src/x509/skey_decoder.t0 +++ b/src/bearssl/src/x509/skey_decoder.t0 @@ -80,7 +80,7 @@ cc: read-blob-inner ( addr len -- addr len ) { \ Get the length of the key_data buffer. : len-key_data - CX 0 8191 { 3 * BR_X509_BUFSIZE_KEY } ; + CX 0 8191 { 3 * BR_X509_BUFSIZE_SIG } ; \ Get the address and length for the key_data buffer. : addr-len-key_data ( -- addr len ) diff --git a/src/bearssl/src/x509/x509_minimal.c b/src/bearssl/src/x509/x509_minimal.c index 3b876ef..6103c08 100644 --- a/src/bearssl/src/x509/x509_minimal.c +++ b/src/bearssl/src/x509/x509_minimal.c @@ -703,7 +703,7 @@ static const unsigned char t0_codeblock[] = { 0x76, 0x00, 0x00, 0x01, 0x00, 0x30, 0x31, 0x0B, 0x42, 0x00, 0x00, 0x01, 0x81, 0x70, 0x00, 0x00, 0x01, 0x82, 0x0D, 0x00, 0x00, 0x01, 0x82, 0x22, 0x00, 0x00, 0x01, 0x82, 0x05, 0x00, 0x00, 0x01, 0x03, 0x33, 0x01, 0x03, - 0x33, 0x00, 0x00, 0x25, 0x01, 0x83, 0xFB, 0x50, 0x01, 0x83, 0xFD, 0x5F, + 0x33, 0x00, 0x00, 0x25, 0x01, 0x83, 0xFB, 0x50, 0x01, 0x83, 0xFB, 0x6F, 0x72, 0x06, 0x04, 0x24, 0x01, 0x00, 0x00, 0x25, 0x01, 0x83, 0xB0, 0x00, 0x01, 0x83, 0xBF, 0x7F, 0x72, 0x06, 0x04, 0x24, 0x01, 0x00, 0x00, 0x01, 0x83, 0xFF, 0x7F, 0x15, 0x01, 0x83, 0xFF, 0x7E, 0x0D, 0x00 diff --git a/src/bearssl/src/x509/x509_minimal.t0 b/src/bearssl/src/x509/x509_minimal.t0 index 1e60016..50995dc 100644 --- a/src/bearssl/src/x509/x509_minimal.t0 +++ b/src/bearssl/src/x509/x509_minimal.t0 @@ -106,7 +106,7 @@ preamble { * -- Extensions: extension values are processed in due order. 
* * -- Basic Constraints: for all certificates except EE, must be - * present, indicate a CA, and have a path legnth compatible with + * present, indicate a CA, and have a path length compatible with * the chain length so far. * * -- Key Usage: for the EE, if present, must allow signatures diff --git a/src/bearssl_ec.h b/src/bearssl_ec.h index b03984a..9c76c95 100644 --- a/src/bearssl_ec.h +++ b/src/bearssl_ec.h @@ -108,7 +108,7 @@ extern "C" { * * - The multipliers (integers) MUST be lower than the subgroup order. * If this property is not met, then the result is indeterminate, - * but an error value is not ncessearily returned. + * but an error value is not necessarily returned. * * * ## ECDSA @@ -451,6 +451,42 @@ extern const br_ec_impl br_ec_p256_m15; */ extern const br_ec_impl br_ec_p256_m31; +/** + * \brief EC implementation "m62" (specialised code) for P-256. + * + * This implementation uses custom code relying on multiplication of + * integers up to 64 bits, with a 128-bit result. This implementation is + * defined only on platforms that offer the 64x64->128 multiplication + * support; use `br_ec_p256_m62_get()` to dynamically obtain a pointer + * to that implementation. + */ +extern const br_ec_impl br_ec_p256_m62; + +/** + * \brief Get the "m62" implementation of P-256, if available. + * + * \return the implementation, or 0. + */ +const br_ec_impl *br_ec_p256_m62_get(void); + +/** + * \brief EC implementation "m64" (specialised code) for P-256. + * + * This implementation uses custom code relying on multiplication of + * integers up to 64 bits, with a 128-bit result. This implementation is + * defined only on platforms that offer the 64x64->128 multiplication + * support; use `br_ec_p256_m64_get()` to dynamically obtain a pointer + * to that implementation. + */ +extern const br_ec_impl br_ec_p256_m64; + +/** + * \brief Get the "m64" implementation of P-256, if available. + * + * \return the implementation, or 0. + */ +const br_ec_impl *br_ec_p256_m64_get(void); + /** * \brief EC implementation "i15" (generic code) for Curve25519. * @@ -507,6 +543,54 @@ extern const br_ec_impl br_ec_c25519_m15; */ extern const br_ec_impl br_ec_c25519_m31; +/** + * \brief EC implementation "m62" (specialised code) for Curve25519. + * + * This implementation uses custom code relying on multiplication of + * integers up to 62 bits, with a 124-bit result. This implementation is + * defined only on platforms that offer the 64x64->128 multiplication + * support; use `br_ec_c25519_m62_get()` to dynamically obtain a pointer + * to that implementation. Due to the specificities of the curve + * definition, the following applies: + * + * - `muladd()` is not implemented (the function returns 0 systematically). + * - `order()` returns 2^255-1, since the point multiplication algorithm + * accepts any 32-byte integer as input (it clears the top bit and low + * three bits systematically). + */ +extern const br_ec_impl br_ec_c25519_m62; + +/** + * \brief Get the "m62" implementation of Curve25519, if available. + * + * \return the implementation, or 0. + */ +const br_ec_impl *br_ec_c25519_m62_get(void); + +/** + * \brief EC implementation "m64" (specialised code) for Curve25519. + * + * This implementation uses custom code relying on multiplication of + * integers up to 64 bits, with a 128-bit result. This implementation is + * defined only on platforms that offer the 64x64->128 multiplication + * support; use `br_ec_c25519_m64_get()` to dynamically obtain a pointer + * to that implementation.
Due to the specificities of the curve + * definition, the following applies: + * + * - `muladd()` is not implemented (the function returns 0 systematically). + * - `order()` returns 2^255-1, since the point multiplication algorithm + * accepts any 32-byte integer as input (it clears the top bit and low + * three bits systematically). + */ +extern const br_ec_impl br_ec_c25519_m64; + +/** + * \brief Get the "m64" implementation of Curve25519, if available. + * + * \return the implementation, or 0. + */ +const br_ec_impl *br_ec_c25519_m64_get(void); + /** * \brief Aggregate EC implementation "m15". * diff --git a/src/bearssl_hash.h b/src/bearssl_hash.h index 3b15ba7..ca4fa26 100644 --- a/src/bearssl_hash.h +++ b/src/bearssl_hash.h @@ -724,7 +724,7 @@ void br_sha256_update(br_sha256_context *ctx, const void *data, size_t len); */ void br_sha256_out(const br_sha256_context *ctx, void *out); -#if BR_DOXYGEN_IGNORE +#ifdef BR_DOXYGEN_IGNORE /** * \brief Save SHA-256 running state. * @@ -742,7 +742,7 @@ uint64_t br_sha256_state(const br_sha256_context *ctx, void *out); #define br_sha256_state br_sha224_state #endif -#if BR_DOXYGEN_IGNORE +#ifdef BR_DOXYGEN_IGNORE /** * \brief Restore SHA-256 running state. * diff --git a/src/bearssl_kdf.h b/src/bearssl_kdf.h index f018d7e..955b843 100644 --- a/src/bearssl_kdf.h +++ b/src/bearssl_kdf.h @@ -81,6 +81,30 @@ extern "C" { * Note that the HKDF total output size (the number of bytes that * HKDF-Expand is willing to produce) is limited: if the hash output size * is _n_ bytes, then the maximum output size is _255*n_. + * + * ## SHAKE + * + * SHAKE is defined in + * [FIPS 202](https://csrc.nist.gov/publications/detail/fips/202/final) + * under two versions: SHAKE128 and SHAKE256, offering an alleged + * "security level" of 128 and 256 bits, respectively (SHAKE128 is + * about 20 to 25% faster than SHAKE256). SHAKE internally relies on + * the Keccak family of sponge functions, not on any externally provided + * hash function. Contrary to HKDF, SHAKE does not have a concept of + * either a "salt" or an "info" string. The API consists of four + * functions: + * + * - `br_shake_init()`: initialize a SHAKE context for a given + * security level. + * + * - `br_shake_inject()`: inject more input bytes. This function may be + * called repeatedly if the input data is provided in chunks. + * + * - `br_shake_flip()`: end the data injection process, and start the + * data production process. + * + * - `br_shake_produce()`: get the next bytes of output. This function + * may be called several times to obtain the full output in chunks. */ /** @@ -178,6 +202,81 @@ void br_hkdf_flip(br_hkdf_context *hc); size_t br_hkdf_produce(br_hkdf_context *hc, const void *info, size_t info_len, void *out, size_t out_len); +/** + * \brief SHAKE context. + * + * The SHAKE context is initialized with a "security level". The internal + * notion is called "capacity"; the capacity is twice the security level + * (for instance, SHAKE128 has capacity 256). + * + * The caller is responsible for allocating the context where + * appropriate. Context initialisation and usage incur no dynamic + * allocation, so there is no release function. + */ +typedef struct { +#ifndef BR_DOXYGEN_IGNORE + unsigned char dbuf[200]; + size_t dptr; + size_t rate; + uint64_t A[25]; +#endif +} br_shake_context; + +/** + * \brief SHAKE context initialization. + * + * The context is initialized for the provided "security level".
+ * Internally, this sets the "capacity" to twice the security level; + * thus, for SHAKE128, the `security_level` parameter should be 128, + * which corresponds to a 256-bit capacity. + * + * Allowed security levels are all multiples of 32, from 32 to 768, + * inclusive. Larger security levels imply lower performance; levels + * beyond 256 bits don't make much sense. Standard levels are 128 + * and 256 bits (for SHAKE128 and SHAKE256, respectively). + * + * \param sc SHAKE context to initialise. + * \param security_level security level (in bits). + */ +void br_shake_init(br_shake_context *sc, int security_level); + +/** + * \brief SHAKE input injection. + * + * This function injects some more input bytes ("key material") into + * SHAKE. This function may be called several times, after `br_shake_init()` + * but before `br_shake_flip()`. + * + * \param sc SHAKE context. + * \param data extra input bytes. + * \param len number of extra input bytes. + */ +void br_shake_inject(br_shake_context *sc, const void *data, size_t len); + +/** + * \brief SHAKE switch to production phase. + * + * This call terminates the input injection process, and starts the + * output production process. + * + * \param sc SHAKE context. + */ +void br_shake_flip(br_shake_context *sc); + +/** + * \brief SHAKE output production. + * + * Produce more output bytes from the current state. This function may be + * called several times, but only after `br_shake_flip()`. + * + * There is no practical limit to the number of bytes that may be produced. + * + * \param sc SHAKE context. + * \param out destination buffer for the SHAKE output. + * \param len the length of the requested output (in bytes). + */ +void br_shake_produce(br_shake_context *sc, void *out, size_t len); + #ifdef __cplusplus } #endif diff --git a/src/bearssl_rsa.h b/src/bearssl_rsa.h index 0eaf2a2..0a069fd 100644 --- a/src/bearssl_rsa.h +++ b/src/bearssl_rsa.h @@ -28,6 +28,7 @@ #include <stddef.h> #include <stdint.h> +#include "bearssl_hash.h" #include "bearssl_rand.h" #ifdef __cplusplus @@ -279,6 +280,55 @@ typedef uint32_t (*br_rsa_pkcs1_vrfy)(const unsigned char *x, size_t xlen, const unsigned char *hash_oid, size_t hash_len, const br_rsa_public_key *pk, unsigned char *hash_out); +/** + * \brief Type for a RSA signature verification engine (PSS). + * + * Parameters are: + * + * - The signature itself. The provided array is NOT modified. + * + * - The hash function which was used to hash the message. + * + * - The hash function to use with MGF1 within the PSS padding. This + * is not necessarily the same hash function as the one which was + * used to hash the signed message. + * + * - The hashed message (as an array of bytes). + * + * - The PSS salt length (in bytes). + * + * - The public key. + * + * **Constraints:** + * + * - The hash value length MUST be no more than 64 bytes. + * + * Note that, contrary to PKCS#1 v1.5 signature, the hash value of the + * signed data cannot be extracted from the signature; it must be + * provided to the verification function. + * + * This function verifies that the signature length (`xlen`) matches the + * modulus length (this function returns 0 on mismatch). If the modulus + * size exceeds the maximum supported RSA size, then the function also + * returns 0. + * + * Returned value is 1 on success, 0 on error. + * + * Implementations of this type need not be constant-time. + * + * \param x signature buffer. + * \param xlen signature length (in bytes). + * \param hf_data hash function applied on the message.
+ * \param hf_mgf1 hash function to use with MGF1. + * \param hash hash value of the signed message. + * \param salt_len PSS salt length (in bytes). + * \param pk RSA public key. + * \return 1 on success, 0 on error. + */ +typedef uint32_t (*br_rsa_pss_vrfy)(const unsigned char *x, size_t xlen, + const br_hash_class *hf_data, const br_hash_class *hf_mgf1, + const void *hash, size_t salt_len, const br_rsa_public_key *pk); + /** * \brief Type for a RSA encryption engine (OAEP). * @@ -385,6 +435,53 @@ typedef uint32_t (*br_rsa_pkcs1_sign)(const unsigned char *hash_oid, const unsigned char *hash, size_t hash_len, const br_rsa_private_key *sk, unsigned char *x); +/** + * \brief Type for a RSA signature generation engine (PSS). + * + * Parameters are: + * + * - An initialized PRNG for salt generation. If the salt length is + * zero (`salt_len` parameter), then the PRNG is optional (this is + * not the typical case, as the security proof of RSA/PSS is + * tighter when a non-empty salt is used). + * + * - The hash function which was used to hash the message. + * + * - The hash function to use with MGF1 within the PSS padding. This + * is not necessarily the same function as the one used to hash the + * message. + * + * - The hashed message. + * + * - The salt length, in bytes. + * + * - The RSA private key. + * + * - The output buffer, that receives the signature. + * + * Returned value is 1 on success, 0 on error. Error conditions include + * a too small modulus for the provided hash and salt lengths, or some + * invalid key parameters. The signature length is exactly + * `(sk->n_bitlen+7)/8` bytes. + * + * This function is expected to be constant-time with regards to the + * private key bytes (lengths of the modulus and the individual factors + * may leak, though) and to the hashed data. + * + * \param rng PRNG for salt generation (`NULL` if `salt_len` is zero). + * \param hf_data hash function used to hash the signed data. + * \param hf_mgf1 hash function to use with MGF1. + * \param hash hashed message. + * \param salt_len salt length (in bytes). + * \param sk RSA private key. + * \param x output buffer for the signature value. + * \return 1 on success, 0 on error. + */ +typedef uint32_t (*br_rsa_pss_sign)(const br_prng_class **rng, + const br_hash_class *hf_data, const br_hash_class *hf_mgf1, + const unsigned char *hash_value, size_t salt_len, + const br_rsa_private_key *sk, unsigned char *x); + /** * \brief Encoded OID for SHA-1 (in RSA PKCS#1 signatures). */ @@ -476,7 +573,7 @@ uint32_t br_rsa_i32_public(unsigned char *x, size_t xlen, const br_rsa_public_key *pk); /** - * \brief RSA signature verification engine "i32". + * \brief RSA signature verification engine "i32" (PKCS#1 v1.5 signatures). * * \see br_rsa_pkcs1_vrfy * @@ -492,6 +589,24 @@ uint32_t br_rsa_i32_pkcs1_vrfy(const unsigned char *x, size_t xlen, const unsigned char *hash_oid, size_t hash_len, const br_rsa_public_key *pk, unsigned char *hash_out); +/** + * \brief RSA signature verification engine "i32" (PSS signatures). + * + * \see br_rsa_pss_vrfy + * + * \param x signature buffer. + * \param xlen signature length (in bytes). + * \param hf_data hash function applied on the message. + * \param hf_mgf1 hash function to use with MGF1. + * \param hash hash value of the signed message. + * \param salt_len PSS salt length (in bytes). + * \param pk RSA public key. + * \return 1 on success, 0 on error. 
+ */ +uint32_t br_rsa_i32_pss_vrfy(const unsigned char *x, size_t xlen, + const br_hash_class *hf_data, const br_hash_class *hf_mgf1, + const void *hash, size_t salt_len, const br_rsa_public_key *pk); + /** * \brief RSA private key engine "i32". * @@ -505,7 +620,7 @@ uint32_t br_rsa_i32_private(unsigned char *x, const br_rsa_private_key *sk); /** - * \brief RSA signature generation engine "i32". + * \brief RSA signature generation engine "i32" (PKCS#1 v1.5 signatures). * * \see br_rsa_pkcs1_sign * @@ -520,6 +635,25 @@ uint32_t br_rsa_i32_pkcs1_sign(const unsigned char *hash_oid, const unsigned char *hash, size_t hash_len, const br_rsa_private_key *sk, unsigned char *x); +/** + * \brief RSA signature generation engine "i32" (PSS signatures). + * + * \see br_rsa_pss_sign + * + * \param rng PRNG for salt generation (`NULL` if `salt_len` is zero). + * \param hf_data hash function used to hash the signed data. + * \param hf_mgf1 hash function to use with MGF1. + * \param hash hashed message. + * \param salt_len salt length (in bytes). + * \param sk RSA private key. + * \param x output buffer for the signature value. + * \return 1 on success, 0 on error. + */ +uint32_t br_rsa_i32_pss_sign(const br_prng_class **rng, + const br_hash_class *hf_data, const br_hash_class *hf_mgf1, + const unsigned char *hash_value, size_t salt_len, + const br_rsa_private_key *sk, unsigned char *x); + /* * RSA "i31" engine. Similar to i32, but only 31 bits are used per 32-bit * word. This uses slightly more stack space (about 4% more) and code @@ -540,7 +674,7 @@ uint32_t br_rsa_i31_public(unsigned char *x, size_t xlen, const br_rsa_public_key *pk); /** - * \brief RSA signature verification engine "i31". + * \brief RSA signature verification engine "i31" (PKCS#1 v1.5 signatures). * * \see br_rsa_pkcs1_vrfy * @@ -556,6 +690,24 @@ uint32_t br_rsa_i31_pkcs1_vrfy(const unsigned char *x, size_t xlen, const unsigned char *hash_oid, size_t hash_len, const br_rsa_public_key *pk, unsigned char *hash_out); +/** + * \brief RSA signature verification engine "i31" (PSS signatures). + * + * \see br_rsa_pss_vrfy + * + * \param x signature buffer. + * \param xlen signature length (in bytes). + * \param hf_data hash function applied on the message. + * \param hf_mgf1 hash function to use with MGF1. + * \param hash hash value of the signed message. + * \param salt_len PSS salt length (in bytes). + * \param pk RSA public key. + * \return 1 on success, 0 on error. + */ +uint32_t br_rsa_i31_pss_vrfy(const unsigned char *x, size_t xlen, + const br_hash_class *hf_data, const br_hash_class *hf_mgf1, + const void *hash, size_t salt_len, const br_rsa_public_key *pk); + /** * \brief RSA private key engine "i31". * @@ -569,7 +721,7 @@ uint32_t br_rsa_i31_private(unsigned char *x, const br_rsa_private_key *sk); /** - * \brief RSA signature generation engine "i31". + * \brief RSA signature generation engine "i31" (PKCS#1 v1.5 signatures). * * \see br_rsa_pkcs1_sign * @@ -584,6 +736,25 @@ uint32_t br_rsa_i31_pkcs1_sign(const unsigned char *hash_oid, const unsigned char *hash, size_t hash_len, const br_rsa_private_key *sk, unsigned char *x); +/** + * \brief RSA signature generation engine "i31" (PSS signatures). + * + * \see br_rsa_pss_sign + * + * \param rng PRNG for salt generation (`NULL` if `salt_len` is zero). + * \param hf_data hash function used to hash the signed data. + * \param hf_mgf1 hash function to use with MGF1. + * \param hash hashed message. + * \param salt_len salt length (in bytes). + * \param sk RSA private key. 
+ * \param x output buffer for the signature value. + * \return 1 on success, 0 on error. + */ +uint32_t br_rsa_i31_pss_sign(const br_prng_class **rng, + const br_hash_class *hf_data, const br_hash_class *hf_mgf1, + const unsigned char *hash_value, size_t salt_len, + const br_rsa_private_key *sk, unsigned char *x); + /* * RSA "i62" engine. Similar to i31, but internal multiplication use * 64x64->128 multiplications. This is available only on architecture @@ -608,7 +779,7 @@ uint32_t br_rsa_i62_public(unsigned char *x, size_t xlen, const br_rsa_public_key *pk); /** - * \brief RSA signature verification engine "i62". + * \brief RSA signature verification engine "i62" (PKCS#1 v1.5 signatures). * * This function is defined only on architecture that offer a 64x64->128 * opcode. Use `br_rsa_i62_pkcs1_vrfy_get()` to dynamically obtain a pointer @@ -628,6 +799,28 @@ uint32_t br_rsa_i62_pkcs1_vrfy(const unsigned char *x, size_t xlen, const unsigned char *hash_oid, size_t hash_len, const br_rsa_public_key *pk, unsigned char *hash_out); +/** + * \brief RSA signature verification engine "i62" (PSS signatures). + * + * This function is defined only on architecture that offer a 64x64->128 + * opcode. Use `br_rsa_i62_pss_vrfy_get()` to dynamically obtain a pointer + * to that function. + * + * \see br_rsa_pss_vrfy + * + * \param x signature buffer. + * \param xlen signature length (in bytes). + * \param hf_data hash function applied on the message. + * \param hf_mgf1 hash function to use with MGF1. + * \param hash hash value of the signed message. + * \param salt_len PSS salt length (in bytes). + * \param pk RSA public key. + * \return 1 on success, 0 on error. + */ +uint32_t br_rsa_i62_pss_vrfy(const unsigned char *x, size_t xlen, + const br_hash_class *hf_data, const br_hash_class *hf_mgf1, + const void *hash, size_t salt_len, const br_rsa_public_key *pk); + /** * \brief RSA private key engine "i62". * @@ -645,7 +838,7 @@ uint32_t br_rsa_i62_private(unsigned char *x, const br_rsa_private_key *sk); /** - * \brief RSA signature generation engine "i62". + * \brief RSA signature generation engine "i62" (PKCS#1 v1.5 signatures). * * This function is defined only on architecture that offer a 64x64->128 * opcode. Use `br_rsa_i62_pkcs1_sign_get()` to dynamically obtain a pointer @@ -664,6 +857,29 @@ uint32_t br_rsa_i62_pkcs1_sign(const unsigned char *hash_oid, const unsigned char *hash, size_t hash_len, const br_rsa_private_key *sk, unsigned char *x); +/** + * \brief RSA signature generation engine "i62" (PSS signatures). + * + * This function is defined only on architecture that offer a 64x64->128 + * opcode. Use `br_rsa_i62_pss_sign_get()` to dynamically obtain a pointer + * to that function. + * + * \see br_rsa_pss_sign + * + * \param rng PRNG for salt generation (`NULL` if `salt_len` is zero). + * \param hf_data hash function used to hash the signed data. + * \param hf_mgf1 hash function to use with MGF1. + * \param hash hashed message. + * \param salt_len salt length (in bytes). + * \param sk RSA private key. + * \param x output buffer for the signature value. + * \return 1 on success, 0 on error. + */ +uint32_t br_rsa_i62_pss_sign(const br_prng_class **rng, + const br_hash_class *hf_data, const br_hash_class *hf_mgf1, + const unsigned char *hash_value, size_t salt_len, + const br_rsa_private_key *sk, unsigned char *x); + /** * \brief Get the RSA "i62" implementation (public key operations), * if available. 
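The PSS entry points added above share a single calling convention across the i15/i31/i32/i62 engines, and the `_get()` functions return 0 when the 64x64->128 code is not compiled in, so a caller can pick the fastest available engine at run time. A minimal usage sketch, assuming a seeded PRNG `rng`, an RSA key pair `sk`/`pk` and a caller-supplied SHA-256 digest `msg_hash` (the helper name `pss_roundtrip_sketch` is illustrative, not part of the patch):

#include "bearssl_hash.h"
#include "bearssl_rand.h"
#include "bearssl_rsa.h"

static int
pss_roundtrip_sketch(const br_rsa_private_key *sk,
	const br_rsa_public_key *pk, const br_prng_class **rng,
	const unsigned char msg_hash[32])
{
	unsigned char sig[BR_MAX_RSA_SIZE >> 3];
	size_t sig_len = ((size_t)sk->n_bitlen + 7) >> 3;
	br_rsa_pss_sign sign;
	br_rsa_pss_vrfy vrfy;

	/* Prefer the 64-bit "i62" code; fall back to "i31". */
	sign = br_rsa_i62_pss_sign_get();
	if (sign == 0) {
		sign = &br_rsa_i31_pss_sign;
	}
	vrfy = br_rsa_i62_pss_vrfy_get();
	if (vrfy == 0) {
		vrfy = &br_rsa_i31_pss_vrfy;
	}

	/* Salt length 32 matches the SHA-256 output size; SHA-256 is
	   used both to hash the data and within MGF1. */
	if (!sign(rng, &br_sha256_vtable, &br_sha256_vtable,
		msg_hash, 32, sk, sig))
	{
		return 0;
	}
	return vrfy(sig, sig_len, &br_sha256_vtable, &br_sha256_vtable,
		msg_hash, 32, pk) != 0;
}

Using the digest size as the salt length is the common choice; as the br_rsa_pss_sign documentation notes, the security proof is tighter with a non-empty salt.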
@@ -673,13 +889,21 @@ uint32_t br_rsa_i62_pkcs1_sign(const unsigned char *hash_oid, br_rsa_public br_rsa_i62_public_get(void); /** - * \brief Get the RSA "i62" implementation (PKCS#1 signature verification), + * \brief Get the RSA "i62" implementation (PKCS#1 v1.5 signature verification), * if available. * * \return the implementation, or 0. */ br_rsa_pkcs1_vrfy br_rsa_i62_pkcs1_vrfy_get(void); +/** + * \brief Get the RSA "i62" implementation (PSS signature verification), + * if available. + * + * \return the implementation, or 0. + */ +br_rsa_pss_vrfy br_rsa_i62_pss_vrfy_get(void); + /** * \brief Get the RSA "i62" implementation (private key operations), * if available. @@ -689,13 +913,21 @@ br_rsa_pkcs1_vrfy br_rsa_i62_pkcs1_vrfy_get(void); br_rsa_private br_rsa_i62_private_get(void); /** - * \brief Get the RSA "i62" implementation (PKCS#1 signature generation), + * \brief Get the RSA "i62" implementation (PKCS#1 v1.5 signature generation), * if available. * * \return the implementation, or 0. */ br_rsa_pkcs1_sign br_rsa_i62_pkcs1_sign_get(void); +/** + * \brief Get the RSA "i62" implementation (PSS signature generation), + * if available. + * + * \return the implementation, or 0. + */ +br_rsa_pss_sign br_rsa_i62_pss_sign_get(void); + /** * \brief Get the RSA "i62" implementation (OAEP encryption), * if available. @@ -732,7 +964,7 @@ uint32_t br_rsa_i15_public(unsigned char *x, size_t xlen, const br_rsa_public_key *pk); /** - * \brief RSA signature verification engine "i15". + * \brief RSA signature verification engine "i15" (PKCS#1 v1.5 signatures). * * \see br_rsa_pkcs1_vrfy * @@ -748,6 +980,24 @@ uint32_t br_rsa_i15_pkcs1_vrfy(const unsigned char *x, size_t xlen, const unsigned char *hash_oid, size_t hash_len, const br_rsa_public_key *pk, unsigned char *hash_out); +/** + * \brief RSA signature verification engine "i15" (PSS signatures). + * + * \see br_rsa_pss_vrfy + * + * \param x signature buffer. + * \param xlen signature length (in bytes). + * \param hf_data hash function applied on the message. + * \param hf_mgf1 hash function to use with MGF1. + * \param hash hash value of the signed message. + * \param salt_len PSS salt length (in bytes). + * \param pk RSA public key. + * \return 1 on success, 0 on error. + */ +uint32_t br_rsa_i15_pss_vrfy(const unsigned char *x, size_t xlen, + const br_hash_class *hf_data, const br_hash_class *hf_mgf1, + const void *hash, size_t salt_len, const br_rsa_public_key *pk); + /** * \brief RSA private key engine "i15". * @@ -761,7 +1011,7 @@ uint32_t br_rsa_i15_private(unsigned char *x, const br_rsa_private_key *sk); /** - * \brief RSA signature generation engine "i15". + * \brief RSA signature generation engine "i15" (PKCS#1 v1.5 signatures). * * \see br_rsa_pkcs1_sign * @@ -776,6 +1026,25 @@ uint32_t br_rsa_i15_pkcs1_sign(const unsigned char *hash_oid, const unsigned char *hash, size_t hash_len, const br_rsa_private_key *sk, unsigned char *x); +/** + * \brief RSA signature generation engine "i15" (PSS signatures). + * + * \see br_rsa_pss_sign + * + * \param rng PRNG for salt generation (`NULL` if `salt_len` is zero). + * \param hf_data hash function used to hash the signed data. + * \param hf_mgf1 hash function to use with MGF1. + * \param hash hashed message. + * \param salt_len salt length (in bytes). + * \param sk RSA private key. + * \param x output buffer for the signature value. + * \return 1 on success, 0 on error. 
+ */ +uint32_t br_rsa_i15_pss_sign(const br_prng_class **rng, + const br_hash_class *hf_data, const br_hash_class *hf_mgf1, + const unsigned char *hash_value, size_t salt_len, + const br_rsa_private_key *sk, unsigned char *x); + /** * \brief Get "default" RSA implementation (public-key operations). * @@ -797,7 +1066,7 @@ br_rsa_public br_rsa_public_get_default(void); br_rsa_private br_rsa_private_get_default(void); /** - * \brief Get "default" RSA implementation (PKCS#1 signature verification). + * \brief Get "default" RSA implementation (PKCS#1 v1.5 signature verification). * * This returns the preferred implementation of RSA (signature verification) * on the current system. @@ -807,7 +1076,17 @@ br_rsa_private br_rsa_private_get_default(void); br_rsa_pkcs1_vrfy br_rsa_pkcs1_vrfy_get_default(void); /** - * \brief Get "default" RSA implementation (PKCS#1 signature generation). + * \brief Get "default" RSA implementation (PSS signature verification). + * + * This returns the preferred implementation of RSA (signature verification) + * on the current system. + * + * \return the default implementation. + */ +br_rsa_pss_vrfy br_rsa_pss_vrfy_get_default(void); + +/** + * \brief Get "default" RSA implementation (PKCS#1 v1.5 signature generation). * * This returns the preferred implementation of RSA (signature generation) * on the current system. @@ -816,6 +1095,16 @@ br_rsa_pkcs1_vrfy br_rsa_pkcs1_vrfy_get_default(void); */ br_rsa_pkcs1_sign br_rsa_pkcs1_sign_get_default(void); +/** + * \brief Get "default" RSA implementation (PSS signature generation). + * + * This returns the preferred implementation of RSA (signature generation) + * on the current system. + * + * \return the default implementation. + */ +br_rsa_pss_sign br_rsa_pss_sign_get_default(void); + /** * \brief Get "default" RSA implementation (OAEP encryption). * diff --git a/src/bearssl_ssl.h b/src/bearssl_ssl.h index d28405a..5dc2228 100644 --- a/src/bearssl_ssl.h +++ b/src/bearssl_ssl.h @@ -1250,8 +1250,8 @@ static inline void br_ssl_engine_set_versions(br_ssl_engine_context *cc, unsigned version_min, unsigned version_max) { - cc->version_min = version_min; - cc->version_max = version_max; + cc->version_min = (uint16_t)version_min; + cc->version_max = (uint16_t)version_max; } /** @@ -1324,7 +1324,7 @@ br_ssl_engine_set_protocol_names(br_ssl_engine_context *ctx, const char **names, size_t num) { ctx->protocol_names = names; - ctx->protocol_names_num = num; + ctx->protocol_names_num = (uint16_t)num; } /** @@ -2102,7 +2102,7 @@ void br_ssl_engine_sendapp_ack(br_ssl_engine_context *cc, size_t len); /** * \brief Get buffer for received application data. * - * If the engine has received application data from the peer, hen this + * If the engine has received application data from the peer, then this * call returns a pointer to the buffer from where such data shall be * read, and its length is written in `*len`. Otherwise, `*len` is set * to 0 and `NULL` is returned. @@ -4154,20 +4154,6 @@ int br_sslio_flush(br_sslio_context *cc); */ int br_sslio_close(br_sslio_context *cc); -/* - * Run the engine, until the specified target state is achieved, or - * an error occurs. The target state is SENDAPP, RECVAPP, or the - * combination of both (the combination matches either). When a match is - * achieved, this function returns 0. On error, it returns -1. - * - * Static function made public since we would like to be able to - * initialize the ssl socket in a single function - * - * \return 0 on success, or -1 on error. 
- */ -int -br_run_until(br_sslio_context *ctx, unsigned target); - /* ===================================================================== */ /* diff --git a/src/config.h b/src/config.h index d7c2e19..d07408a 100644 --- a/src/config.h +++ b/src/config.h @@ -108,9 +108,27 @@ #define BR_RDRAND 1 */ +/* + * When BR_USE_GETENTROPY is enabled, the SSL engine will use the + * getentropy() function to obtain quality randomness for seeding its + * internal PRNG. On Linux and FreeBSD, getentropy() is implemented by + * the standard library with the system call getrandom(); on OpenBSD, + * getentropy() is the system call, and there is no getrandom() wrapper, + * hence the use of the getentropy() function for maximum portability. + * + * If the getentropy() call fails, and BR_USE_URANDOM is not explicitly + * disabled, then /dev/urandom will be used as a fallback mechanism. On + * FreeBSD and OpenBSD, this does not change much, since /dev/urandom + * will block if not enough entropy has been obtained since last boot. + * On Linux, /dev/urandom might not block, which can be troublesome in + * early boot stages, which is why getentropy() is preferred. + * +#define BR_USE_GETENTROPY 1 + */ + /* * When BR_USE_URANDOM is enabled, the SSL engine will use /dev/urandom - * to automatically obtain quality randomness for seedings its internal + * to automatically obtain quality randomness for seeding its internal * PRNG. * #define BR_USE_URANDOM 1 @@ -119,7 +137,7 @@ /* * When BR_USE_WIN32_RAND is enabled, the SSL engine will use the Win32 * (CryptoAPI) functions (CryptAcquireContext(), CryptGenRandom()...) to - * automatically obtain quality randomness for seedings its internal PRNG. + * automatically obtain quality randomness for seeding its internal PRNG. * * Note: if both BR_USE_URANDOM and BR_USE_WIN32_RAND are defined, the * former takes precedence. @@ -132,10 +150,10 @@ * the current time from the OS by calling time(), and assuming that the * returned value (a 'time_t') is an integer that counts time in seconds * since the Unix Epoch (Jan 1st, 1970, 00:00 UTC). - * */ #define BR_USE_UNIX_TIME 0 + /* * When BR_USE_WIN32_TIME is enabled, the X.509 validation engine obtains * the current time from the OS by calling the Win32 function @@ -143,8 +161,9 @@ * * Note: if both BR_USE_UNIX_TIME and BR_USE_WIN32_TIME are defined, the * former takes precedence. + * +#define BR_USE_WIN32_TIME 1 */ -#define BR_USE_WIN32_TIME 0 /* * When BR_ARMEL_CORTEXM_GCC is enabled, some operations are replaced with @@ -158,9 +177,7 @@ * Note: if BR_LOMUL is not explicitly enabled or disabled, then * enabling BR_ARMEL_CORTEXM_GCC also enables BR_LOMUL. */ -#ifdef ARDUINO_ARCH_SAMD #define BR_ARMEL_CORTEXM_GCC 1 -#endif /* * When BR_AES_X86NI is enabled, the AES implementation using the x86 "NI" diff --git a/src/inner.h b/src/inner.h index 8c7f04e..07e1d0a 100644 --- a/src/inner.h +++ b/src/inner.h @@ -114,6 +114,10 @@ #define BR_64 1 #elif defined(__x86_64__) || defined(_M_X64) #define BR_64 1 +#elif defined(__aarch64__) || defined(_M_ARM64) +#define BR_64 1 +#elif defined(__mips64) +#define BR_64 1 #endif #endif @@ -305,9 +309,20 @@ * values are documented on: * https://sourceforge.net/p/predef/wiki/OperatingSystems/ * - * TODO: enrich the list of detected system. Also add detection for - * alternate system calls like getentropy(), which are usually - * preferable when available. + * Win32's CryptGenRandom() should be available on Windows systems. + * + * /dev/urandom should work on all Unix-like systems (including macOS X). 
+ * + * getentropy() is present on Linux (Glibc 2.25+), FreeBSD (12.0+) and + * OpenBSD (5.6+). For OpenBSD, there do not seem to be easy-to-use + * macros to test the minimum version, so we just assume that it is + * recent enough (the last version without getentropy() went out of + * support in May 2015). + * + * Ideally we should use getentropy() on macOS (10.12+) too, but I don't + * know how to test the exact OS version with preprocessor macros. + * + * TODO: enrich the list of detected systems. */ #ifndef BR_USE_URANDOM @@ -324,6 +339,15 @@ #endif #endif +#ifndef BR_USE_GETENTROPY +#if (defined __linux__ \ + && (__GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 25))) \ + || (defined __FreeBSD__ && __FreeBSD__ >= 12) \ + || defined __OpenBSD__ +#define BR_USE_GETENTROPY 1 +#endif +#endif + #ifndef BR_USE_WIN32_RAND #if defined _WIN32 || defined _WIN64 #define BR_USE_WIN32_RAND 1 @@ -1943,6 +1967,27 @@ uint32_t br_rsa_pkcs1_sig_unpad(const unsigned char *sig, size_t sig_len, const unsigned char *hash_oid, size_t hash_len, unsigned char *hash_out); +/* + * Apply proper PSS padding. The 'x' buffer is output only: it + * receives the value that is to be exponentiated. + */ +uint32_t br_rsa_pss_sig_pad(const br_prng_class **rng, + const br_hash_class *hf_data, const br_hash_class *hf_mgf1, + const unsigned char *hash, size_t salt_len, + uint32_t n_bitlen, unsigned char *x); + +/* + * Check PSS padding. The provided value is the one _after_ + * the modular exponentiation; it is modified by this function. + * This function infers the signature length from the public key + * size, i.e. it assumes that this has already been verified (as + * part of the exponentiation). + */ +uint32_t br_rsa_pss_sig_unpad( + const br_hash_class *hf_data, const br_hash_class *hf_mgf1, + const unsigned char *hash, size_t salt_len, + const br_rsa_public_key *pk, unsigned char *x); + /* + * Apply OAEP padding. Returned value is the actual padded string length, * or zero on error. @@ -2448,8 +2493,8 @@ int br_ssl_choose_hash(unsigned bf); #else #define BR_TARGETS_X86_UP \ _Pragma("GCC target(\"sse2,ssse3,sse4.1,aes,pclmul\")") -#endif #define BR_TARGETS_X86_DOWN +#endif #pragma GCC diagnostic ignored "-Wpsabi" #endif
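The seeding order described in the config.h and inner.h comments above (getentropy() first, then /dev/urandom as a fallback) can be sketched as follows; `seed_from_os` is an illustrative helper, not a BearSSL API, and assumes the BR_USE_* macros are visible (e.g. through inner.h):

#include <stddef.h>

#if BR_USE_GETENTROPY
#include <unistd.h>
#endif
#if BR_USE_URANDOM
#include <fcntl.h>
#include <unistd.h>
#endif

/*
 * Illustrative helper (not part of BearSSL): fill buf[] with len
 * random bytes, trying getentropy() first and /dev/urandom second.
 * Returns 1 on success, 0 on failure.
 */
static int
seed_from_os(unsigned char *buf, size_t len)
{
#if BR_USE_GETENTROPY
	/* getentropy() accepts at most 256 bytes per call. */
	if (len <= 256 && getentropy(buf, len) == 0) {
		return 1;
	}
#endif
#if BR_USE_URANDOM
	{
		int fd;

		fd = open("/dev/urandom", O_RDONLY);
		if (fd >= 0) {
			size_t off;

			off = 0;
			while (off < len) {
				ssize_t rlen;

				rlen = read(fd, buf + off, len - off);
				if (rlen <= 0) {
					break;
				}
				off += (size_t)rlen;
			}
			close(fd);
			return off == len;
		}
	}
#endif
	(void)buf;
	(void)len;
	return 0;
}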