Skip to content

Commit 9962d95

Browse files
FiloSottile authored and gopherbot committed
crypto/internal/fips140/mldsa: unroll NTT and inverseNTT
fips140: off
goos: darwin
goarch: arm64
pkg: crypto/internal/fips140test
cpu: Apple M2
                      │ bade4ade59  │          bade4ade59-dirty          │
                      │   sec/op    │   sec/op     vs base               │
MLDSASign/ML-DSA-44-8   264.8µ ± 0%   244.5µ ± 0%  -7.68% (p=0.000 n=20)

fips140: off
goos: linux
goarch: amd64
pkg: crypto/internal/fips140test
cpu: AMD EPYC 7443P 24-Core Processor
                       │ bade4ade59  │          bade4ade59-dirty          │
                       │   sec/op    │   sec/op     vs base               │
MLDSASign/ML-DSA-44-48   408.7µ ± 3%   386.5µ ± 1%  -5.41% (p=0.000 n=20)

Change-Id: I04d38a48d5105cbcd625cba9398711b26a6a6964
Reviewed-on: https://go-review.googlesource.com/c/go/+/723020
Reviewed-by: Junyang Shao <shaojunyang@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Daniel McCarney <daniel@binaryparadox.net>
Auto-Submit: Filippo Valsorda <filippo@golang.org>
Reviewed-by: Mark Freeman <markfreeman@google.com>
1 parent f821fc4 commit 9962d95

File tree

1 file changed

+104
-4
lines changed
  • src/crypto/internal/fips140/mldsa

1 file changed

+104
-4
lines changed

src/crypto/internal/fips140/mldsa/field.go

Lines changed: 104 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -146,19 +146,69 @@ var zetas = [256]fieldElement{4193792, 25847, 5771523, 7861508, 237124, 7602457,
146146
// It implements NTT, according to FIPS 204, Algorithm 41.
147147
func ntt(f ringElement) nttElement {
148148
var m uint8
149-
for len := 128; len >= 1; len /= 2 {
149+
150+
for len := 128; len >= 8; len /= 2 {
150151
for start := 0; start < 256; start += 2 * len {
151152
m++
152153
zeta := zetas[m]
154+
153155
// Bounds check elimination hint.
154156
f, flen := f[start:start+len], f[start+len:start+len+len]
155-
for j := 0; j < len; j++ {
157+
for j := 0; j < len; j += 2 {
156158
t := fieldMontgomeryMul(zeta, flen[j])
157159
flen[j] = fieldSub(f[j], t)
158160
f[j] = fieldAdd(f[j], t)
161+
162+
// Unroll by 2 for performance.
163+
t = fieldMontgomeryMul(zeta, flen[j+1])
164+
flen[j+1] = fieldSub(f[j+1], t)
165+
f[j+1] = fieldAdd(f[j+1], t)
159166
}
160167
}
161168
}
169+
170+
// Unroll len = 4, 2, and 1.
171+
for start := 0; start < 256; start += 8 {
172+
m++
173+
zeta := zetas[m]
174+
175+
t := fieldMontgomeryMul(zeta, f[start+4])
176+
f[start+4] = fieldSub(f[start], t)
177+
f[start] = fieldAdd(f[start], t)
178+
179+
t = fieldMontgomeryMul(zeta, f[start+5])
180+
f[start+5] = fieldSub(f[start+1], t)
181+
f[start+1] = fieldAdd(f[start+1], t)
182+
183+
t = fieldMontgomeryMul(zeta, f[start+6])
184+
f[start+6] = fieldSub(f[start+2], t)
185+
f[start+2] = fieldAdd(f[start+2], t)
186+
187+
t = fieldMontgomeryMul(zeta, f[start+7])
188+
f[start+7] = fieldSub(f[start+3], t)
189+
f[start+3] = fieldAdd(f[start+3], t)
190+
}
191+
for start := 0; start < 256; start += 4 {
192+
m++
193+
zeta := zetas[m]
194+
195+
t := fieldMontgomeryMul(zeta, f[start+2])
196+
f[start+2] = fieldSub(f[start], t)
197+
f[start] = fieldAdd(f[start], t)
198+
199+
t = fieldMontgomeryMul(zeta, f[start+3])
200+
f[start+3] = fieldSub(f[start+1], t)
201+
f[start+1] = fieldAdd(f[start+1], t)
202+
}
203+
for start := 0; start < 256; start += 2 {
204+
m++
205+
zeta := zetas[m]
206+
207+
t := fieldMontgomeryMul(zeta, f[start+1])
208+
f[start+1] = fieldSub(f[start], t)
209+
f[start] = fieldAdd(f[start], t)
210+
}
211+
162212
return nttElement(f)
163213
}
164214

@@ -167,20 +217,70 @@ func ntt(f ringElement) nttElement {
167217
// It implements NTT⁻¹, according to FIPS 204, Algorithm 42.
168218
func inverseNTT(f nttElement) ringElement {
169219
var m uint8 = 255
170-
for len := 1; len < 256; len *= 2 {
220+
221+
// Unroll len = 1, 2, and 4.
222+
for start := 0; start < 256; start += 2 {
223+
zeta := zetas[m]
224+
m--
225+
226+
t := f[start]
227+
f[start] = fieldAdd(t, f[start+1])
228+
f[start+1] = fieldMontgomeryMulSub(zeta, f[start+1], t)
229+
}
230+
for start := 0; start < 256; start += 4 {
231+
zeta := zetas[m]
232+
m--
233+
234+
t := f[start]
235+
f[start] = fieldAdd(t, f[start+2])
236+
f[start+2] = fieldMontgomeryMulSub(zeta, f[start+2], t)
237+
238+
t = f[start+1]
239+
f[start+1] = fieldAdd(t, f[start+3])
240+
f[start+3] = fieldMontgomeryMulSub(zeta, f[start+3], t)
241+
}
242+
for start := 0; start < 256; start += 8 {
243+
zeta := zetas[m]
244+
m--
245+
246+
t := f[start]
247+
f[start] = fieldAdd(t, f[start+4])
248+
f[start+4] = fieldMontgomeryMulSub(zeta, f[start+4], t)
249+
250+
t = f[start+1]
251+
f[start+1] = fieldAdd(t, f[start+5])
252+
f[start+5] = fieldMontgomeryMulSub(zeta, f[start+5], t)
253+
254+
t = f[start+2]
255+
f[start+2] = fieldAdd(t, f[start+6])
256+
f[start+6] = fieldMontgomeryMulSub(zeta, f[start+6], t)
257+
258+
t = f[start+3]
259+
f[start+3] = fieldAdd(t, f[start+7])
260+
f[start+7] = fieldMontgomeryMulSub(zeta, f[start+7], t)
261+
}
262+
263+
for len := 8; len < 256; len *= 2 {
171264
for start := 0; start < 256; start += 2 * len {
172265
zeta := zetas[m]
173266
m--
267+
174268
// Bounds check elimination hint.
175269
f, flen := f[start:start+len], f[start+len:start+len+len]
176-
for j := 0; j < len; j++ {
270+
for j := 0; j < len; j += 2 {
177271
t := f[j]
178272
f[j] = fieldAdd(t, flen[j])
179273
// -z * (t - flen[j]) = z * (flen[j] - t)
180274
flen[j] = fieldMontgomeryMulSub(zeta, flen[j], t)
275+
276+
// Unroll by 2 for performance.
277+
t = f[j+1]
278+
f[j+1] = fieldAdd(t, flen[j+1])
279+
flen[j+1] = fieldMontgomeryMulSub(zeta, flen[j+1], t)
181280
}
182281
}
183282
}
283+
184284
for i := range f {
185285
f[i] = fieldMontgomeryMul(f[i], 16382) // 16382 = 256⁻¹ * R mod q
186286
}

0 commit comments

Comments
 (0)