3535 * @since 6.9.0
3636 * @access private
3737 *
38- * @param string $bytes UTF-8 encoded string which might include invalid spans of bytes.
39- * @param int $at Where to start scanning.
40- * @param int $invalid_length Will be set to how many bytes are to be ignored after `$at`.
41- * @param int|null $max_bytes Stop scanning after this many bytes have been seen.
42- * @param int|null $max_code_points Stop scanning after this many code points have been seen.
38+ * @param string $bytes UTF-8 encoded string which might include invalid spans of bytes.
39+ * @param int $at Where to start scanning.
40+ * @param int $invalid_length Will be set to how many bytes are to be ignored after `$at`.
41+ * @param int|null $max_bytes Stop scanning after this many bytes have been seen.
42+ * @param int|null $max_code_points Stop scanning after this many code points have been seen.
43+ * @param bool $has_noncharacters Set to indicate if scanned string contained noncharacters.
4344 * @return int How many code points were successfully scanned.
4445 */
45- function _wp_scan_utf8 ( string $ bytes , int &$ at , int &$ invalid_length , ?int $ max_bytes = null , ?int $ max_code_points = null ): int {
46+ function _wp_scan_utf8 ( string $ bytes , int &$ at , int &$ invalid_length , ?int $ max_bytes = null , ?int $ max_code_points = null , ? bool & $ has_noncharacters = null ): int {
4647 $ byte_length = strlen ( $ bytes );
4748 $ end = min ( $ byte_length , $ at + ( $ max_bytes ?? PHP_INT_MAX ) );
4849 $ invalid_length = 0 ;
4950 $ count = 0 ;
5051 $ max_count = $ max_code_points ?? PHP_INT_MAX ;
52+ $ has_noncharacters = false ;
5153
5254 for ( $ i = $ at ; $ i < $ end && $ count <= $ max_count ; $ i ++ ) {
5355 /*
@@ -145,6 +147,18 @@ function _wp_scan_utf8( string $bytes, int &$at, int &$invalid_length, ?int $max
145147 ) {
146148 ++$ count ;
147149 $ i += 2 ;
150+
151+ // Covers the range U+FDD0–U+FDEF, U+FFFE, U+FFFF.
152+ $ is_noncharacter = (
153+ 0xEF === $ b1 &&
154+ (
155+ ( 0xB7 === $ b2 && $ b3 >= 0x90 && $ b3 <= 0xAF ) ||
156+ ( 0xBF === $ b2 && ( 0xBE === $ b3 || 0xBF === $ b3 ) )
157+ )
158+ );
159+
160+ $ has_noncharacters = $ has_noncharacters || $ is_noncharacter ;
161+
148162 continue ;
149163 }
150164
@@ -162,6 +176,14 @@ function _wp_scan_utf8( string $bytes, int &$at, int &$invalid_length, ?int $max
162176 ) {
163177 ++$ count ;
164178 $ i += 3 ;
179+
180+ $ is_noncharacter = (
181+ ( 0x0F === ( $ b2 & 0x0F ) ) &&
182+ 0xBF === $ b3 &&
183+ ( 0xBE === $ b4 || 0xBF === $ b4 )
184+ );
185+ $ has_noncharacters = $ has_noncharacters || $ is_noncharacter ;
186+
165187 continue ;
166188 }
167189
0 commit comments