From 5d40f6bd4aecc02ad8cf1780cd4ef1cd7f27d910 Mon Sep 17 00:00:00 2001 From: Jan Tojnar Date: Sat, 15 Mar 2025 16:18:00 +0100 Subject: [PATCH] mbstring: Make encoding detection stricter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PHP 8.3 changed how source encoding detection works: https://www.php.net/manual/en/migration83.other-changes.php#migration83.other-changes.functions.mbstring Most locales only consider `ASCII` and `UTF-8` (see `mb_detect_order()`), and when a byte sequence invalid in both tested encodings (such as 0x91 for ‘ in Windows-1252) is encountered, one of them might now be chosen as the most fitting encoding. (This is done using the heuristics introduced in PHP 8.1: https://github.com/php/php-src/commit/28b346bc0678effe7f68339ad942be16d1e1a311) Compare the output of the following script across PHP versions: assertSupported($from); } + } else { + $from = 'auto'; } if ($to) { $this->assertSupported($to, false); } - $handleErrors = !$from || 'auto' === $from; - if ($handleErrors) { - set_error_handler( - function ($no, $warning) use ($string): void { - throw new UndetectableEncodingException($string, $warning); - }, - E_WARNING - ); + if ($from === 'auto') { + $from = mb_detect_encoding($string, 'auto', true); } - try { - $result = mb_convert_encoding( - $string, - $to ?: $this->defaultEncoding, - $from ?: 'auto' - ); - } finally { - if ($handleErrors) { - restore_error_handler(); - } + if ($from === false) { + throw new UndetectableEncodingException($string, 'Unable to detect character encoding'); } + $result = mb_convert_encoding( + $string, + $to ?: $this->defaultEncoding, + $from + ); + + // For PHPStan: We check the encoding is valid. + assert($result !== false); + return $result; } diff --git a/tests/MbTranscoderTest.php b/tests/MbTranscoderTest.php index 422fdbe..97e9077 100644 --- a/tests/MbTranscoderTest.php +++ b/tests/MbTranscoderTest.php @@ -55,7 +55,7 @@ public function testUndetectableEncoding(): void $this->expectException(\Ddeboer\Transcoder\Exception\UndetectableEncodingException::class); $this->expectExceptionMessage('is undetectable'); $result = $this->transcoder->transcode( - '‘curly quotes make this incompatible with 1252’', + '‘Windows-1252 encodes curly quotes as 0x91 and 0x92, which are indistinguishable from any other single-byte encoding’', null, 'windows-1252' );