Skip to content

Commit 362cba1

Browse files
committed
tested ASCII generator + untested utf8 generator
1 parent 6f0735a commit 362cba1

File tree

2 files changed

+143
-0
lines changed

2 files changed

+143
-0
lines changed

test/AsciiTest.cs

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
namespace tests;
22
using System.Text;
33

4+
//TODO (Nick Nuon): Test UTF8 Generator works correctly
5+
46
public class AsciiTest
57
{
68
[Fact]
@@ -77,6 +79,7 @@ public void HardCodedSequencesTest()
7779
{
7880
Assert.True(SimdUnicode.Ascii.IsAscii(sequence), "Expected valid ASCII sequence");
7981
Assert.True(SimdUnicode.Ascii.SIMDIsAscii(sequence), "Expected SIMDIsAscii to validate ASCII sequence");
82+
8083
}
8184

8285
foreach (var sequence in badsequences)
@@ -85,4 +88,40 @@ public void HardCodedSequencesTest()
8588
Assert.False(SimdUnicode.Ascii.SIMDIsAscii(sequence), "Expected SIMDIsAscii to invalidate non-ASCII sequence");
8689
}
8790
}
91+
92+
/// <summary>
/// Checks that a <c>RandomUtf8</c> generator whose only non-zero weight is the
/// one-byte weight produces nothing but ASCII bytes (0x00-0x7F), and exactly as
/// many bytes as requested, for every length from 1 to 255 (1000 trials per length).
/// </summary>
[Fact]
public void Test_random_ASCII_sequences_of_varying_lengths()
{
    const int NumTrials = 1000;
    const int MaxLength = 255;
    RandomUtf8 utf8Generator = new RandomUtf8(0, 100, 0, 0, 0); // Only ASCII/one-bytes

    for (int length = 1; length <= MaxLength; length++)
    {
        // Built once per length so the inner loop does not re-format it on every trial.
        string failureMessage = $"Invalid ASCII sequences were generated for {length}-byte sequences!";

        for (int i = 0; i < NumTrials; i++)
        {
            byte[] sequence = utf8Generator.Generate(length);

            // One-byte characters always fit, so the generator has no reason to stop short.
            Assert.Equal(length, sequence.Length);

            // A byte is never negative, so only the upper bound needs checking.
            Assert.True(sequence.All(b => b <= 0x7F), failureMessage);
        }
    }
}
125+
126+
88127
}

test/helpers/randomutf8.cs

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using System.Linq;
4+
5+
/// <summary>
/// Test helper that produces pseudo-random, well-formed UTF-8 byte sequences.
/// Each generated character is first assigned an encoded width (1 to 4 bytes) according to
/// the relative weights given to the constructor, then a Unicode scalar value that needs
/// exactly that many bytes is drawn and encoded.
/// </summary>
public class RandomUtf8
{
    // Maximum number of bytes a UTF-8 character can be (based on the standard)
    private const int MaxByteLength = 4;

    // First and one-past-last UTF-16 surrogate code points; they are not scalar values
    // and must never be encoded as UTF-8.
    private const int SurrogateStart = 0xD800;
    private const int SurrogateEnd = 0xE000;

    // Relative weight of each encoded width; index 0 is the 1-byte weight, index 3 the 4-byte weight.
    private readonly double[] _weights;

    // Sum of all weights, computed once instead of on every draw.
    private readonly double _totalWeight;

    // Internal random number generator; replaced by the reseeding Generate overload.
    private Random _gen;

    /// <summary>
    /// Creates a generator with the given seed and relative weights for each encoded width.
    /// The weights do not need to add up to 100; only their ratios matter.
    /// </summary>
    /// <param name="seed">Seed for the underlying <see cref="Random"/> (reinterpreted as a signed 32-bit value).</param>
    /// <param name="prob_1byte">Relative weight of 1-byte (ASCII) characters.</param>
    /// <param name="prob_2bytes">Relative weight of 2-byte characters.</param>
    /// <param name="prob_3bytes">Relative weight of 3-byte characters.</param>
    /// <param name="prob_4bytes">Relative weight of 4-byte characters.</param>
    /// <exception cref="ArgumentOutOfRangeException">A weight is negative.</exception>
    /// <exception cref="ArgumentException">All weights are zero.</exception>
    public RandomUtf8(uint seed, int prob_1byte, int prob_2bytes, int prob_3bytes, int prob_4bytes)
    {
        RequireNonNegative(prob_1byte, nameof(prob_1byte));
        RequireNonNegative(prob_2bytes, nameof(prob_2bytes));
        RequireNonNegative(prob_3bytes, nameof(prob_3bytes));
        RequireNonNegative(prob_4bytes, nameof(prob_4bytes));

        _weights = new double[MaxByteLength] { prob_1byte, prob_2bytes, prob_3bytes, prob_4bytes };
        _totalWeight = _weights.Sum();
        if (_totalWeight <= 0)
        {
            throw new ArgumentException("At least one byte-length weight must be greater than zero.");
        }

        _gen = new Random(unchecked((int)seed));
    }

    /// <summary>
    /// Generates well-formed UTF-8 made of whole characters, at most <paramref name="outputBytes"/> bytes long.
    /// Generation stops as soon as the next randomly drawn character would not fit, so the result
    /// can be up to three bytes shorter than requested when multi-byte characters are enabled.
    /// With only 1-byte characters enabled the result is always exactly <paramref name="outputBytes"/> long.
    /// </summary>
    /// <param name="outputBytes">Upper bound on the number of bytes to produce.</param>
    /// <returns>The generated bytes.</returns>
    public byte[] Generate(int outputBytes)
    {
        var result = new List<byte>(outputBytes);
        while (result.Count < outputBytes)
        {
            byte[] encoded = EncodeToUTF8(GenerateCodePoint());

            // Never split a character: stop rather than exceed the requested size.
            if (result.Count + encoded.Length > outputBytes)
            {
                break;
            }

            result.AddRange(encoded);
        }

        return result.ToArray();
    }

    /// <summary>
    /// Same as <see cref="Generate(int)"/>, additionally returning the number of bytes produced.
    /// </summary>
    /// <param name="outputBytes">Upper bound on the number of bytes to produce.</param>
    /// <returns>The generated bytes and their count (the array length).</returns>
    public (byte[] utf8, int count) GenerateCounted(int outputBytes)
    {
        var utf8 = Generate(outputBytes);
        return (utf8, utf8.Length);
    }

    /// <summary>
    /// Reseeds the internal generator and then generates a sequence, so the same
    /// <paramref name="seed"/> and <paramref name="outputBytes"/> always give the same bytes.
    /// </summary>
    /// <param name="outputBytes">Upper bound on the number of bytes to produce.</param>
    /// <param name="seed">New seed; only its low 32 bits are used.</param>
    /// <returns>The generated bytes.</returns>
    public byte[] Generate(int outputBytes, long seed)
    {
        _gen = new Random(unchecked((int)seed));
        return Generate(outputBytes);
    }

    // Throws when a weight is negative; a negative weight has no meaning for a weighted draw.
    private static void RequireNonNegative(int weight, string paramName)
    {
        if (weight < 0)
        {
            throw new ArgumentOutOfRangeException(paramName, weight, "Byte-length weights must not be negative.");
        }
    }

    // Draws a Unicode scalar value whose UTF-8 encoding has the randomly picked width.
    // Random.Next(min, max) excludes max, so every upper bound below is one past the last valid value.
    private uint GenerateCodePoint()
    {
        int byteCount = PickRandomByteCount();

        switch (byteCount)
        {
            case 1:
                return (uint)_gen.Next(0x0000, 0x0080); // U+0000..U+007F
            case 2:
                return (uint)_gen.Next(0x0080, 0x0800); // U+0080..U+07FF
            case 3:
                {
                    // Draw from a range shrunk by the size of the surrogate block, then shift the
                    // upper part past it: uniform over U+0800..U+FFFF without U+D800..U+DFFF.
                    int value = _gen.Next(0x0800, 0x10000 - (SurrogateEnd - SurrogateStart));
                    if (value >= SurrogateStart)
                    {
                        value += SurrogateEnd - SurrogateStart;
                    }

                    return (uint)value;
                }
            case 4:
                return (uint)_gen.Next(0x10000, 0x110000); // U+10000..U+10FFFF
            default:
                throw new InvalidOperationException($"Invalid byte count: {byteCount}"); // Guard clause for invalid byte count
        }
    }

    // Picks an encoded width (1..4) with probability proportional to its weight.
    private int PickRandomByteCount()
    {
        double ticket = _gen.NextDouble() * _totalWeight;
        double cumulative = 0.0;
        int lastEnabled = MaxByteLength;

        for (int i = 0; i < MaxByteLength; i++)
        {
            // A zero weight must never be selected, not even when the ticket is exactly 0.
            if (_weights[i] <= 0)
            {
                continue;
            }

            lastEnabled = i + 1;
            cumulative += _weights[i];
            if (ticket < cumulative)
            {
                return i + 1;
            }
        }

        // Only reachable if floating-point rounding pushed the ticket up to the total weight.
        return lastEnabled;
    }

    // Encodes a Unicode scalar value as UTF-8 using the standard bit layout
    // (0xxxxxxx / 110xxxxx 10xxxxxx / 1110xxxx 10xxxxxx 10xxxxxx / 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx).
    // U+0000 encodes to the single byte 0x00.
    private static byte[] EncodeToUTF8(uint codePoint)
    {
        if (codePoint < 0x80)
        {
            return new[] { (byte)codePoint };
        }

        if (codePoint < 0x800)
        {
            return new[]
            {
                (byte)(0xC0 | (codePoint >> 6)),
                (byte)(0x80 | (codePoint & 0x3F)),
            };
        }

        if (codePoint < 0x10000)
        {
            return new[]
            {
                (byte)(0xE0 | (codePoint >> 12)),
                (byte)(0x80 | ((codePoint >> 6) & 0x3F)),
                (byte)(0x80 | (codePoint & 0x3F)),
            };
        }

        return new[]
        {
            (byte)(0xF0 | (codePoint >> 18)),
            (byte)(0x80 | ((codePoint >> 12) & 0x3F)),
            (byte)(0x80 | ((codePoint >> 6) & 0x3F)),
            (byte)(0x80 | (codePoint & 0x3F)),
        };
    }
}

0 commit comments

Comments
 (0)