Skip to content

Commit 9116c4c

Browse files
committed
[PEM] Significantly improve performance by leveraging indexOf and operations on ByteArrays instead of eagerly splitting into lines
- JVM: 1.5-4 times faster
- JS: 1.2-11 times faster
- Native: 1.2-13 times faster
- WasmJS: 1.1-6 times faster

For non-JVM targets, the smallest improvement is seen when working with strings.
1 parent d806d3c commit 9116c4c

File tree

1 file changed

+243
-68
lines changed

1 file changed

+243
-68
lines changed

cryptography-serialization/pem/src/commonMain/kotlin/PemDocument.kt

Lines changed: 243 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ package dev.whyoleg.cryptography.serialization.pem
66

77
import kotlinx.io.*
88
import kotlinx.io.bytestring.*
9+
import kotlinx.io.bytestring.unsafe.*
910
import kotlin.io.encoding.*
1011

1112
public class PemDocument(
@@ -17,27 +18,14 @@ public class PemDocument(
1718
content: ByteArray,
1819
) : this(label, ByteString(content))
1920

20-
public fun encodeToString(): String = buildString {
21-
encodedLines().forEach(::appendLine)
22-
}
21+
public fun encodeToString(): String = encodeToByteArrayImpl().decodeToString()
2322

24-
public fun encodeToByteArray(): ByteArray = encodeToString().encodeToByteArray()
25-
public fun encodeToByteString(): ByteString = encodeToString().encodeToByteString()
23+
public fun encodeToByteArray(): ByteArray = encodeToByteArrayImpl()
2624

27-
public fun encodeToSink(sink: Sink) {
28-
encodedLines().forEach { line ->
29-
sink.writeString(line)
30-
sink.writeCodePointValue('\n'.code)
31-
}
32-
}
25+
@OptIn(UnsafeByteStringApi::class)
26+
public fun encodeToByteString(): ByteString = UnsafeByteStringOperations.wrapUnsafe(encodeToByteArrayImpl())
3327

34-
// TODO: let's change implementation to use Base64.encodeToByteArray for Sink - there is no need to go through String
35-
// same for encodeToByteString/encodeToByteArray
36-
private fun encodedLines(): Sequence<String> = sequence {
37-
yield(BEGIN_PREFIX + label.value + SUFFIX)
38-
yieldAll(Base64.encode(content).chunkedSequence(64))
39-
yield(END_PREFIX + label.value + SUFFIX)
40-
}
28+
public fun encodeToSink(sink: Sink): Unit = sink.write(encodeToByteArrayImpl())
4129

4230
override fun equals(other: Any?): Boolean {
4331
if (this === other) return true
@@ -60,71 +48,258 @@ public class PemDocument(
6048
}
6149

6250
public companion object {
63-
private const val BEGIN_PREFIX = "-----BEGIN "
64-
private const val END_PREFIX = "-----END "
65-
private const val SUFFIX = "-----"
66-
6751
// decode will skip comments and everything else which is not label or content
6852

6953
// will decode only the first one, even if there is something else after it
70-
public fun decode(text: String): PemDocument = decode(text.lineSequence())
71-
public fun decode(bytes: ByteArray): PemDocument = decode(bytes.decodeToString().lineSequence())
72-
public fun decode(bytes: ByteString): PemDocument = decode(bytes.decodeToString().lineSequence())
73-
public fun decode(source: Source): PemDocument = decode(generateSequence(source::readLine))
54+
public fun decode(text: String): PemDocument {
55+
return tryDecodeFromString(text, startIndex = 0, saveEndIndex = {}) ?: error("Invalid PEM format: missing BEGIN label")
56+
}
57+
58+
public fun decodeToSequence(text: String): Sequence<PemDocument> = sequence {
59+
var startIndex = 0
60+
while (startIndex < text.length) {
61+
yield(tryDecodeFromString(text, startIndex) { startIndex = it } ?: break)
62+
}
63+
if (startIndex == 0) error("Invalid PEM format: missing BEGIN label")
64+
}
65+
66+
@OptIn(UnsafeByteStringApi::class)
67+
public fun decode(bytes: ByteArray): PemDocument {
68+
return decode(UnsafeByteStringOperations.wrapUnsafe(bytes))
69+
}
7470

75-
public fun decodeToSequence(text: String): Sequence<PemDocument> = decodeToSequence(text.lineSequence())
76-
public fun decodeToSequence(bytes: ByteArray): Sequence<PemDocument> = decodeToSequence(bytes.decodeToString().lineSequence())
77-
public fun decodeToSequence(bytes: ByteString): Sequence<PemDocument> = decodeToSequence(bytes.decodeToString().lineSequence())
78-
public fun decodeToSequence(source: Source): Sequence<PemDocument> = decodeToSequence(generateSequence(source::readLine))
71+
@OptIn(UnsafeByteStringApi::class)
72+
public fun decodeToSequence(bytes: ByteArray): Sequence<PemDocument> {
73+
return decodeToSequence(UnsafeByteStringOperations.wrapUnsafe(bytes))
74+
}
75+
76+
public fun decode(bytes: ByteString): PemDocument {
77+
return tryDecodeFromByteString(bytes, startIndex = 0, saveEndIndex = {}) ?: error("Invalid PEM format: missing BEGIN label")
78+
}
7979

80-
// implementation
80+
public fun decodeToSequence(bytes: ByteString): Sequence<PemDocument> = sequence {
81+
var startIndex = 0
82+
while (startIndex < bytes.size) {
83+
yield(tryDecodeFromByteString(bytes, startIndex) { startIndex = it } ?: break)
84+
}
85+
if (startIndex == 0) error("Invalid PEM format: missing BEGIN label")
86+
}
8187

82-
// it will never be empty
83-
private fun decode(lines: Sequence<String>): PemDocument = decodeToSequence(lines).first()
88+
public fun decode(source: Source): PemDocument {
89+
return tryDecodeFromSource(source) ?: error("Invalid PEM format: missing BEGIN label")
90+
}
8491

85-
// it will never be empty, or will throw an error - TBD
86-
private fun decodeToSequence(lines: Sequence<String>): Sequence<PemDocument> = sequence {
92+
public fun decodeToSequence(source: Source): Sequence<PemDocument> = sequence {
8793
var hasAtLeastOneBeginLabel = false
88-
var beginLabel: String? = null
89-
val content = StringBuilder()
90-
91-
for (line in lines) {
92-
if (beginLabel == null) {
93-
beginLabel = line.findLabel(BEGIN_PREFIX, "BEGIN") ?: continue
94-
hasAtLeastOneBeginLabel = true
95-
} else {
96-
val endLabel = line.findLabel(END_PREFIX, "END") ?: run {
97-
content.append(line)
98-
continue
99-
}
100-
check(beginLabel == endLabel) { "Invalid PEM format: BEGIN=`$beginLabel`, END=`$endLabel`" }
101-
102-
val document = PemDocument(
103-
label = PemLabel(beginLabel),
104-
content = Base64.decodeToByteString(content.toString())
105-
)
106-
content.clear()
107-
beginLabel = null
108-
109-
yield(document)
110-
}
94+
while (!source.exhausted()) {
95+
yield(tryDecodeFromSource(source) ?: break)
96+
hasAtLeastOneBeginLabel = true
11197
}
98+
if (!hasAtLeastOneBeginLabel) error("Invalid PEM format: missing BEGIN label")
99+
}
100+
}
101+
}
102+
103+
private const val NEW_LINE = '\n'
104+
private const val BEGIN_PREFIX = "-----BEGIN "
105+
private const val END_PREFIX = "-----END "
106+
private const val SUFFIX = "-----"
107+
108+
private const val NEW_LINE_BYTE = NEW_LINE.code.toByte()
109+
private val BEGIN_BYTES = BEGIN_PREFIX.encodeToByteArray()
110+
private val END_BYTES = END_PREFIX.encodeToByteArray()
111+
private val SUFFIX_BYTES = SUFFIX.encodeToByteArray()
112+
113+
// Overall, the performance significantly depends on the target,
114+
// some targets (wasmJs) may work with byte arrays faster than with strings
115+
// e.g. tryDecodeFromByteString(text.encodeToByteArray()) is 50% faster than tryDecodeFromString(text)
116+
// but hopefully it will be improved in the future
117+
// on JVM, operations on byte arrays are always faster :)
118+
119+
// 1.5 times faster than naive encodeToString()
120+
// 2 times faster than naive encodeToString().encodeToByteArray()
121+
// naive encodeToString impl:
122+
// return buildString {
123+
// append(BEGIN_PREFIX).append(label.value).appendLine(SUFFIX)
124+
// Base64.Pem.encodeToAppendable(content, this).appendLine()
125+
// append(END_PREFIX).append(label.value).appendLine(SUFFIX)
126+
// }
127+
private fun PemDocument.encodeToByteArrayImpl(): ByteArray {
128+
// based on kotlin.Base64 implementation
129+
fun base64EncodedSize(sourceSize: Int): Int {
130+
val groups = sourceSize / 3 //bytesPerGroup
131+
val trailingBytes = sourceSize % 3 // bytesPerGroup
132+
var size = groups * 4 // symbolsPerGroup
133+
if (trailingBytes != 0) { // trailing symbols
134+
size += 4
135+
}
136+
if (size < 0) { // Int overflow
137+
throw IllegalArgumentException("Input is too big")
138+
}
139+
size += ((size - 1) / 64) * 2
140+
if (size < 0) { // Int overflow
141+
throw IllegalArgumentException("Input is too big")
142+
}
143+
return size
144+
}
112145

113-
check(hasAtLeastOneBeginLabel) { "Invalid PEM format: missing BEGIN label" }
114-
check(beginLabel == null) { "Invalid PEM format: missing END label" }
146+
val label = label.value.encodeToByteArray()
147+
val encodedSize = base64EncodedSize(content.size)
148+
149+
val array = ByteArray(
150+
BEGIN_BYTES.size + label.size + SUFFIX_BYTES.size + 1 +
151+
encodedSize + 1 +
152+
END_BYTES.size + label.size + SUFFIX_BYTES.size + 1
153+
)
154+
155+
// encode `-----BEGIN LABEL-----\n`
156+
BEGIN_BYTES.copyInto(array)
157+
label.copyInto(array, BEGIN_BYTES.size)
158+
SUFFIX_BYTES.copyInto(array, BEGIN_BYTES.size + label.size)
159+
array[BEGIN_BYTES.size + label.size + SUFFIX_BYTES.size] = NEW_LINE_BYTE
160+
161+
// encode `base64\n`
162+
Base64.Pem.encodeIntoByteArray(content, array, BEGIN_BYTES.size + label.size + SUFFIX_BYTES.size + 1)
163+
array[BEGIN_BYTES.size + label.size + SUFFIX_BYTES.size + 1 + encodedSize] = NEW_LINE_BYTE
164+
165+
// encode `-----END LABEL-----\n`
166+
END_BYTES.copyInto(array, BEGIN_BYTES.size + label.size + SUFFIX_BYTES.size + 1 + encodedSize + 1)
167+
label.copyInto(array, BEGIN_BYTES.size + label.size + SUFFIX_BYTES.size + 1 + encodedSize + 1 + END_BYTES.size)
168+
SUFFIX_BYTES.copyInto(array, BEGIN_BYTES.size + label.size + SUFFIX_BYTES.size + 1 + encodedSize + 1 + END_BYTES.size + label.size)
169+
array[array.lastIndex] = NEW_LINE_BYTE
170+
171+
return array
172+
}
173+
174+
// 1.5 times faster than using lineSequence()
175+
private inline fun tryDecodeFromString(
176+
text: String,
177+
startIndex: Int,
178+
saveEndIndex: (endIndex: Int) -> Unit,
179+
): PemDocument? {
180+
val beginIndex = text.indexOf(BEGIN_PREFIX, startIndex)
181+
if (beginIndex == -1) return null
182+
val beginLineEndIndex = text.indexOf(NEW_LINE, beginIndex + BEGIN_PREFIX.length)
183+
if (beginLineEndIndex == -1) error("Invalid PEM format: missing new line after BEGIN label")
184+
val beginSuffixIndex = text.indexOf(SUFFIX, beginIndex + BEGIN_PREFIX.length)
185+
if (beginSuffixIndex == -1 || beginSuffixIndex > beginLineEndIndex) error("Invalid PEM format: missing BEGIN label suffix")
186+
187+
val beginLabel = text.substring(beginIndex + BEGIN_PREFIX.length, beginSuffixIndex)
188+
189+
val endIndex = text.indexOf(END_PREFIX, beginLineEndIndex)
190+
if (endIndex == -1) error("Invalid PEM format: missing END label")
191+
val endLineEndIndex = text.indexOf(NEW_LINE, endIndex + END_PREFIX.length)
192+
val endSuffixIndex = text.indexOf(SUFFIX, endIndex + END_PREFIX.length)
193+
if (endSuffixIndex == -1 || (endLineEndIndex != -1 && endSuffixIndex > endLineEndIndex)) error("Invalid PEM format: missing END label suffix")
194+
195+
val endLabel = text.substring(endIndex + END_PREFIX.length, endSuffixIndex)
196+
if (endLabel != beginLabel) error("Invalid PEM format: BEGIN=`$beginLabel`, END=`$endLabel`")
197+
198+
saveEndIndex(
199+
if (endLineEndIndex == -1) {
200+
endSuffixIndex + SUFFIX.length
201+
} else {
202+
endLineEndIndex + 1
115203
}
204+
)
205+
return PemDocument(
206+
label = PemLabel(beginLabel),
207+
content = Base64.Pem.decodeToByteString(
208+
source = text,
209+
startIndex = beginLineEndIndex + 1, // 1 because of new line
210+
endIndex = endIndex
211+
)
212+
)
213+
}
214+
215+
// 1.5 times faster than decode(bytes.decodeToString())
216+
// 2 times faster than using lineSequence()
217+
private inline fun tryDecodeFromByteString(
218+
bytes: ByteString,
219+
startIndex: Int,
220+
saveEndIndex: (endIndex: Int) -> Unit,
221+
): PemDocument? {
222+
val beginIndex = bytes.indexOf(BEGIN_BYTES, startIndex)
223+
if (beginIndex == -1) return null
224+
val beginLineEndIndex = bytes.indexOf(NEW_LINE_BYTE, beginIndex + BEGIN_BYTES.size)
225+
if (beginLineEndIndex == -1) error("Invalid PEM format: missing new line after BEGIN label")
226+
val beginSuffixIndex = bytes.indexOf(SUFFIX_BYTES, beginIndex + BEGIN_BYTES.size)
227+
if (beginSuffixIndex == -1 || beginSuffixIndex > beginLineEndIndex) error("Invalid PEM format: missing BEGIN label suffix")
228+
229+
val beginLabel = bytes.substring(beginIndex + BEGIN_BYTES.size, beginSuffixIndex)
116230

117-
private fun String.findLabel(prefix: String, type: String): String? {
118-
val startIndex = indexOf(prefix)
119-
if (startIndex == -1) return null
231+
val endIndex = bytes.indexOf(END_BYTES, beginLineEndIndex)
232+
if (endIndex == -1) error("Invalid PEM format: missing END label")
233+
val endLineEndIndex = bytes.indexOf(NEW_LINE_BYTE, endIndex + END_BYTES.size)
234+
val endSuffixIndex = bytes.indexOf(SUFFIX_BYTES, endIndex + END_BYTES.size)
235+
if (endSuffixIndex == -1 || (endLineEndIndex != -1 && endSuffixIndex > endLineEndIndex)) error("Invalid PEM format: missing END label suffix")
120236

121-
val endIndex = lastIndexOf(SUFFIX)
122-
if (endIndex == -1) error("Invalid PEM format: missing suffix")
237+
val endLabel = bytes.substring(endIndex + END_BYTES.size, endSuffixIndex)
238+
if (endLabel != beginLabel) error("Invalid PEM format: BEGIN=`${beginLabel.decodeToString()}`, END=`${endLabel.decodeToString()}`")
123239

124-
val label = substring(startIndex + prefix.length, endIndex)
125-
if (label.isBlank()) error("Invalid PEM format: $type label is empty")
240+
saveEndIndex(
241+
if (endLineEndIndex == -1) {
242+
endSuffixIndex + SUFFIX_BYTES.size
126243

127-
return label
244+
} else {
245+
endLineEndIndex + 1
128246
}
247+
)
248+
249+
return PemDocument(
250+
label = PemLabel(beginLabel.decodeToString()),
251+
content = Base64.Pem.decodeToByteString(
252+
source = bytes,
253+
startIndex = beginLineEndIndex + 1, // 1 because of new line
254+
endIndex = endIndex
255+
)
256+
)
257+
}
258+
259+
// 2 times faster than using lineSequence()
260+
@OptIn(UnsafeByteStringApi::class)
261+
private fun tryDecodeFromSource(source: Source): PemDocument? {
262+
fun Source.indexOf(bytes: ByteArray, startIndex: Long = 0): Long {
263+
return indexOf(UnsafeByteStringOperations.wrapUnsafe(bytes), startIndex)
264+
}
265+
266+
val beginIndex = source.indexOf(BEGIN_BYTES)
267+
if (beginIndex == -1L) {
268+
// we haven't found BEGIN label, but we already read everything - discard it
269+
source.transferTo(discardingSink())
270+
return null
271+
}
272+
source.skip(beginIndex + BEGIN_BYTES.size)
273+
274+
val beginLineEndIndex = source.indexOf(NEW_LINE_BYTE)
275+
if (beginLineEndIndex == -1L) error("Invalid PEM format: missing new line after BEGIN label")
276+
val beginSuffixIndex = source.indexOf(SUFFIX_BYTES)
277+
if (beginSuffixIndex == -1L || beginSuffixIndex > beginLineEndIndex) error("Invalid PEM format: missing BEGIN label suffix")
278+
279+
val beginLabel = source.readByteString(beginSuffixIndex.toInt())
280+
source.skip(beginLineEndIndex + 1 - beginSuffixIndex) // skip suffix & new line
281+
282+
val endIndex = source.indexOf(END_BYTES)
283+
if (endIndex == -1L) error("Invalid PEM format: missing END label")
284+
285+
val base64Content = source.readByteString(endIndex.toInt())
286+
source.skip(END_BYTES.size.toLong())
287+
288+
val endLineEndIndex = source.indexOf(NEW_LINE_BYTE)
289+
val endSuffixIndex = source.indexOf(SUFFIX_BYTES)
290+
if (endSuffixIndex == -1L || (endLineEndIndex != -1L && endSuffixIndex > endLineEndIndex)) error("Invalid PEM format: missing END label suffix")
291+
292+
val endLabel = source.readByteString(endSuffixIndex.toInt())
293+
if (endLineEndIndex == -1L) {
294+
source.skip(SUFFIX_BYTES.size.toLong())
295+
} else {
296+
source.skip(endLineEndIndex + 1 - endSuffixIndex)
129297
}
298+
299+
if (endLabel != beginLabel) error("Invalid PEM format: BEGIN=`${beginLabel.decodeToString()}`, END=`${endLabel.decodeToString()}`")
300+
301+
return PemDocument(
302+
label = PemLabel(beginLabel.decodeToString()),
303+
content = Base64.Pem.decodeToByteString(base64Content)
304+
)
130305
}

0 commit comments

Comments
 (0)