Skip to content

Commit 2a169ab

Browse files
committed
Track whether noncharacters are in a string + wp_has_noncharacters()
1 parent 4892d46 commit 2a169ab

File tree

2 files changed

+60
-11
lines changed

2 files changed

+60
-11
lines changed

src/wp-includes/compat-utf8.php

Lines changed: 33 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -35,19 +35,21 @@
3535
* @since 6.9.0
3636
* @access private
3737
*
38-
* @param string $bytes UTF-8 encoded string which might include invalid spans of bytes.
39-
* @param int $at Where to start scanning.
40-
* @param int $invalid_length Will be set to how many bytes are to be ignored after `$at`.
41-
* @param int|null $max_bytes Stop scanning after this many bytes have been seen.
42-
* @param int|null $max_code_points Stop scanning after this many code points have been seen.
38+
* @param string $bytes UTF-8 encoded string which might include invalid spans of bytes.
39+
* @param int $at Where to start scanning.
40+
* @param int $invalid_length Will be set to how many bytes are to be ignored after `$at`.
41+
* @param int|null $max_bytes Stop scanning after this many bytes have been seen.
42+
* @param int|null $max_code_points Stop scanning after this many code points have been seen.
43+
* @param bool|null $has_noncharacters Set to indicate if scanned string contained noncharacters.
4344
* @return int How many code points were successfully scanned.
4445
*/
45-
function _wp_scan_utf8( string $bytes, int &$at, int &$invalid_length, ?int $max_bytes = null, ?int $max_code_points = null ): int {
46-
$byte_length = strlen( $bytes );
47-
$end = min( $byte_length, $at + ( $max_bytes ?? PHP_INT_MAX ) );
48-
$invalid_length = 0;
49-
$count = 0;
50-
$max_count = $max_code_points ?? PHP_INT_MAX;
46+
function _wp_scan_utf8( string $bytes, int &$at, int &$invalid_length, ?int $max_bytes = null, ?int $max_code_points = null, ?bool &$has_noncharacters = null ): int {
47+
$byte_length = strlen( $bytes );
48+
$end = min( $byte_length, $at + ( $max_bytes ?? PHP_INT_MAX ) );
49+
$invalid_length = 0;
50+
$count = 0;
51+
$max_count = $max_code_points ?? PHP_INT_MAX;
52+
$has_noncharacters = false;
5153

5254
for ( $i = $at; $i < $end && $count <= $max_count; $i++ ) {
5355
/*
@@ -145,6 +147,18 @@ function _wp_scan_utf8( string $bytes, int &$at, int &$invalid_length, ?int $max
145147
) {
146148
++$count;
147149
$i += 2;
150+
151+
// Covers the noncharacters U+FDD0–U+FDEF, U+FFFE, and U+FFFF.
152+
$is_noncharacter = (
153+
0xEF === $b1 &&
154+
(
155+
( 0xB7 === $b2 && $b3 >= 0x90 && $b3 <= 0xAF ) ||
156+
( 0xBF === $b2 && ( 0xBE === $b3 || 0xBF === $b3 ) )
157+
)
158+
);
159+
160+
$has_noncharacters = $has_noncharacters || $is_noncharacter;
161+
148162
continue;
149163
}
150164

@@ -162,6 +176,14 @@ function _wp_scan_utf8( string $bytes, int &$at, int &$invalid_length, ?int $max
162176
) {
163177
++$count;
164178
$i += 3;
179+
180+
$is_noncharacter = (
181+
( 0x0F === ( $b2 & 0x0F ) ) &&
182+
0xBF === $b3 &&
183+
( 0xBE === $b4 || 0xBF === $b4 )
184+
);
185+
$has_noncharacters = $has_noncharacters || $is_noncharacter;
186+
165187
continue;
166188
}
167189

src/wp-includes/utf8.php

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,3 +133,30 @@ function wp_scrub_utf8( $text ) {
133133
return _wp_scrub_utf8_fallback( $text );
134134
}
135135
endif;
136+
137+
/**
 * Returns whether the given string contains Unicode noncharacters.
 *
 * Noncharacters:
 *  - U+FDD0–U+FDEF
 *  - U+FFFE–U+FFFF
 *  - U+1FFFx, U+2FFFx, …, U+FFFFx, U+10FFFx (where x is either E or F)
 *
 * The string is scanned with {@see _wp_scan_utf8()}, which reports
 * noncharacters through its by-reference `$has_noncharacters` parameter.
 *
 * @todo link to Unicode noncharacter spec.
 *
 * @param string $text Are there noncharacters in this string?
 * @return bool Whether noncharacters were found in the string.
 */
function wp_has_noncharacters( string $text ): bool {
	$at                = 0;
	$invalid_length    = 0;
	$has_noncharacters = false;
	$end               = strlen( $text );

	while ( $at < $end && ! $has_noncharacters ) {
		/*
		 * The noncharacter flag is the sixth parameter of the scanner.
		 * `$max_bytes` and `$max_code_points` are both left as `null` so
		 * that no limit is imposed and the flag binds to the by-reference
		 * output parameter rather than to the code point limit.
		 */
		_wp_scan_utf8( $text, $at, $invalid_length, null, null, $has_noncharacters );

		// Step over the bytes the scanner reported as invalid before resuming.
		$at += $invalid_length;
	}

	return $has_noncharacters;
}

0 commit comments

Comments
 (0)