@@ -6,10 +6,10 @@ public class RandomUtf8
66{
77 // Internal random number generator
88 private Random gen ;
9-
9+
1010 // Array of probabilities for each UTF-8 byte count (1-byte, 2-bytes, etc.)
1111 private double [ ] probabilities ;
12-
12+
1313 // Maximum number of bytes a UTF-8 character can be (based on the standard)
1414 private const int maxByteLength = 4 ;
1515
@@ -28,11 +28,11 @@ public byte[] Generate(int outputBytes)
2828 {
2929 uint codePoint = GenerateCodePoint ( ) ;
3030 byte [ ] utf8Bytes = EncodeToUTF8 ( codePoint ) ;
31-
31+
3232 // Ensure we don't exceed the desired length
3333 if ( result . Count + utf8Bytes . Length > outputBytes )
3434 break ;
35-
35+
3636 result . AddRange ( utf8Bytes ) ;
3737 }
3838 return result . ToArray ( ) ;
@@ -56,13 +56,13 @@ public byte[] Generate(int outputBytes, long seed)
5656 private uint GenerateCodePoint ( )
5757 {
5858 int byteCount = PickRandomByteCount ( ) ;
59-
59+
6060 // Depending on the byte count, generate an appropriate UTF-8 sequence
6161 switch ( byteCount )
6262 {
6363 // Each case follows UTF-8 encoding rules for 1-byte, 2-byte, 3-byte, and 4-byte sequences
6464 case 1 : return ( uint ) gen . Next ( 0x00 , 0x80 ) ; // 1-byte sequence
65- case 2 : return ( uint ) ( ( gen . Next ( 0xC2 , 0xDF ) << 8 ) | ( 0x80 | gen . Next ( 0x00 , 0x40 ) ) ) ;
65+ case 2 : return ( uint ) ( ( gen . Next ( 0xC2 , 0xDF ) << 8 ) | ( 0x80 | gen . Next ( 0x00 , 0x40 ) ) ) ;
6666 case 3 : return ( uint ) ( ( gen . Next ( 0xE0 , 0xEF ) << 16 ) | ( ( 0x80 | gen . Next ( 0x00 , 0x40 ) ) << 8 ) | ( 0x80 | gen . Next ( 0x00 , 0x40 ) ) ) ;
6767 case 4 : return ( uint ) ( ( gen . Next ( 0xF0 , 0xF4 ) << 24 ) | ( ( 0x80 | gen . Next ( 0x00 , 0x40 ) ) << 16 ) | ( ( 0x80 | gen . Next ( 0x00 , 0x40 ) ) << 8 ) | ( 0x80 | gen . Next ( 0x00 , 0x40 ) ) ) ;
6868 default : throw new InvalidOperationException ( $ "Invalid byte count: { byteCount } ") ; // Guard clause for invalid byte count
@@ -74,30 +74,30 @@ private int PickRandomByteCount()
7474 {
7575 double randomValue = gen . NextDouble ( ) * probabilities . Sum ( ) ;
7676 double cumulative = 0.0 ;
77-
77+
7878 // Check each cumulative probability until the random value is less than the cumulative sum
7979 for ( int i = 0 ; i < maxByteLength ; i ++ )
8080 {
8181 cumulative += probabilities [ i ] ;
8282 if ( randomValue <= cumulative )
8383 return i + 1 ; // Return the byte count
8484 }
85-
85+
8686 return maxByteLength ; // Default to max byte length
8787 }
8888
8989 // Convert the generated code point into a valid UTF-8 sequence
/// <summary>
/// Unpacks a packed UTF-8 sequence into its individual bytes, most significant byte first.
/// </summary>
/// <remarks>
/// Despite its name, <paramref name="codePoint"/> is not a Unicode scalar value. The
/// generator builds it by shifting the lead byte into the high position and OR-ing the
/// continuation bytes below it, so this method only has to split the integer into bytes.
/// </remarks>
/// <param name="codePoint">Packed sequence bytes; the lowest 8 bits hold the last byte.</param>
/// <returns>
/// The bytes of the sequence in stream order. Always at least one byte long: a value of
/// zero yields a single 0x00 byte (the encoding of U+0000) rather than an empty array.
/// </returns>
private byte[] EncodeToUTF8(uint codePoint)
{
    var result = new List<byte>();

    // do/while rather than while: the value 0 is a legitimate 1-byte sequence (NUL),
    // and a pre-tested loop would silently drop it and return no bytes at all.
    do
    {
        result.Add((byte)(codePoint & 0xFF)); // peel off the lowest byte
        codePoint >>= 8;
    }
    while (codePoint != 0);

    // Bytes were collected least-significant first; flip them into stream order.
    result.Reverse();
    return result.ToArray();
}
0 commit comments