Charset: Rely on new UTF-8 pipeline for mb_strlen() fallback.

dmsnell · dmsnell · commit b6b9cf881b91 · 2025-10-16T21:00:41.000Z
The existing polyfill for `mb_strlen()` contains a number of issues leaving plenty of opportunity for improvement. Specifically, the following are all deficiencies: it relies on Unicode PCRE support, assumes input strings are valid UTF-8, splits input strings into an array of characters to count them (1,000 at a time, iterating until complete), and entirely gives up when the Unicode support is missing. This patch provides an updated polyfill which will reliably count code points in a UTF-8 string, even in the presence of sequences of invalid bytes. It scans through the input with zero allocations. Additionally, the underlying fallback extends the behavior of `mb_strlen()` to provide character counts for substrings within a larger input without extracting the substring (it can count characters within a byte offset and length of a larger string). This change improves the reliability of UTF-8 string length calculations and removes behavioral variability based on the runtime system. Developed in WordPress/wordpress-develop#9828 Discussed in https://core.trac.wordpress.org/ticket/63863 See #63863. Built from https://develop.svn.wordpress.org/trunk@60949 git-svn-id: https://core.svn.wordpress.org/trunk@60285 1a063a9b-81f0-0310-95a4-ce76da25c4cd
diff --git a/wp-includes/compat-utf8.php b/wp-includes/compat-utf8.php
@@ -291,3 +291,49 @@ function _wp_scrub_utf8_fallback( string $bytes ): string {
 
 	return $scrubbed;
 }
+
+/**
+ * Returns how many code points are found in the given UTF-8 string.
+ *
+ * Invalid spans of bytes count as a single code point according
+ * to the maximal subpart rule. This function is a fallback method
+ * for calling `mb_strlen( $text, 'UTF-8' )`.
+ *
+ * When negative values are provided for the byte offset or length,
+ * this will always report zero code points. Passing `null` for either
+ * one selects its default: start at the beginning of the string, and
+ * scan until the end of the string, respectively.
+ *
+ * Example:
+ *
+ *     4  === _wp_utf8_codepoint_count( 'text' );
+ *
+ *     // Groups are 'test', "\x90" as '�', 'wp', "\xE2\x80" as '�', "\xC0" as '�', and 'test'.
+ *     13 === _wp_utf8_codepoint_count( "test\x90wp\xE2\x80\xC0test" );
+ *
+ * @since 6.9.0
+ * @access private
+ *
+ * @param string $text            Count code points in this string.
+ * @param ?int   $byte_offset     Optional. Start counting after this many bytes in `$text`.
+ *                                Default 0. Must not be negative.
+ * @param ?int   $max_byte_length Optional. Stop counting after having scanned past this many bytes.
+ *                                Default is to scan until the end of the string. Must not be negative.
+ * @return int How many code points were found.
+ */
+function _wp_utf8_codepoint_count( string $text, ?int $byte_offset = 0, ?int $max_byte_length = PHP_INT_MAX ): int {
+	/*
+	 * Both limits are nullable, so normalize `null` to the declared defaults.
+	 * Left as `null`, the length limit would win the `min()` comparison below
+	 * and the loop would never run, reporting zero for every input.
+	 */
+	$byte_offset     = $byte_offset ?? 0;
+	$max_byte_length = $max_byte_length ?? PHP_INT_MAX;
+
+	if ( $byte_offset < 0 ) {
+		return 0;
+	}
+
+	$count          = 0;
+	$at             = $byte_offset;
+	$end            = strlen( $text );
+	$invalid_length = 0;
+
+	// Never scan past the end of the string, whatever limit was requested.
+	$max_byte_length = min( $end - $at, $max_byte_length );
+
+	while ( $at < $end && ( $at - $byte_offset ) < $max_byte_length ) {
+		/*
+		 * Count the next run of valid code points within the remaining byte budget.
+		 * NOTE(review): presumably the scanner updates `$at` and `$invalid_length`
+		 * by reference; confirm against `_wp_scan_utf8()`.
+		 */
+		$count += _wp_scan_utf8( $text, $at, $invalid_length, $max_byte_length - ( $at - $byte_offset ) );
+
+		// A reported invalid span counts as exactly one code point and is then skipped.
+		$count += $invalid_length > 0 ? 1 : 0;
+		$at    += $invalid_length;
+	}
+
+	return $count;
+}
diff --git a/wp-includes/compat.php b/wp-includes/compat.php
@@ -228,69 +228,23 @@ function mb_strlen( $string, $encoding = null ) { // phpcs:ignore Universal.Nami
 /**
  * Internal compat function to mimic mb_strlen().
  *
- * Only understands UTF-8 and 8bit. All other character sets will be treated as 8bit.
- * For `$encoding === UTF-8`, the `$str` input is expected to be a valid UTF-8 byte
- * sequence. The behavior of this function for invalid inputs is undefined.
+ * Only supports UTF-8 and non-shifting single-byte encodings. For all other
+ * encodings expect the counts to be wrong. When the given encoding (or the
+ * `blog_charset` if none is provided) isn’t UTF-8 then the function returns
+ * the byte-count of the provided string.
+ *
+ * UTF-8 input is counted by `_wp_utf8_codepoint_count()`, which counts each
+ * invalid span of bytes as a single code point.
  *
  * @ignore
  * @since 4.2.0
  *
  * @param string      $str      The string to retrieve the character length from.
- * @param string|null $encoding Optional. Character encoding to use. Default null.
- * @return int String length of `$str`.
+ * @param string|null $encoding Optional. Count characters according to this encoding.
+ *                              Default is to consult `blog_charset`.
+ * @return int Count of code points if UTF-8, byte length otherwise.
  */
 function _mb_strlen( $str, $encoding = null ) {
-	if ( null === $encoding ) {
-		$encoding = get_option( 'blog_charset' );
-	}
-
-	/*
-	 * The solution below works only for UTF-8, so in case of a different charset
-	 * just use built-in strlen().
-	 */
-	if ( ! _is_utf8_charset( $encoding ) ) {
-		return strlen( $str );
-	}
-
-	if ( _wp_can_use_pcre_u() ) {
-		// Use the regex unicode support to separate the UTF-8 characters into an array.
-		preg_match_all( '/./us', $str, $match );
-		return count( $match[0] );
-	}
-
-	$regex = '/(?:
-		[\x00-\x7F]                  # single-byte sequences   0xxxxxxx
-		| [\xC2-\xDF][\x80-\xBF]       # double-byte sequences   110xxxxx 10xxxxxx
-		| \xE0[\xA0-\xBF][\x80-\xBF]   # triple-byte sequences   1110xxxx 10xxxxxx * 2
-		| [\xE1-\xEC][\x80-\xBF]{2}
-		| \xED[\x80-\x9F][\x80-\xBF]
-		| [\xEE-\xEF][\x80-\xBF]{2}
-		| \xF0[\x90-\xBF][\x80-\xBF]{2} # four-byte sequences   11110xxx 10xxxxxx * 3
-		| [\xF1-\xF3][\x80-\xBF]{3}
-		| \xF4[\x80-\x8F][\x80-\xBF]{2}
-	)/x';
-
-	// Start at 1 instead of 0 since the first thing we do is decrement.
-	$count = 1;
-
-	do {
-		// We had some string left over from the last round, but we counted it in that last round.
-		--$count;
-
-		/*
-		 * Split by UTF-8 character, limit to 1000 characters (last array element will contain
-		 * the rest of the string).
-		 */
-		$pieces = preg_split( $regex, $str, 1000 );
-
-		// Increment.
-		$count += count( $pieces );
-
-		// If there's anything left over, repeat the loop.
-	} while ( $str = array_pop( $pieces ) );
-
-	// Fencepost: preg_split() always returns one extra item in the array.
-	return --$count;
+	return _is_utf8_charset( $encoding ?? get_option( 'blog_charset' ) )
+		? _wp_utf8_codepoint_count( $str )
+		: strlen( $str );
 }
 
 // sodium_crypto_box() was introduced in PHP 7.2.
diff --git a/wp-includes/version.php b/wp-includes/version.php
@@ -16,7 +16,7 @@
  *
  * @global string $wp_version
  */
-$wp_version = '6.9-alpha-60948';
+$wp_version = '6.9-alpha-60949';
 
 /**
  * Holds the WordPress DB revision, increments when changes are made to the WordPress DB schema.

Original file line number	Diff line number	Diff line change
`@@ -16,7 +16,7 @@`
`16`	`16`	`*`
`17`	`17`	`* @global string $wp_version`
`18`	`18`	`*/`
`19`		`-$wp_version = '6.9-alpha-60948';`
	`19`	`+$wp_version = '6.9-alpha-60949';`
`20`	`20`
`21`	`21`	`/**`
`22`	`22`	`* Holds the WordPress DB revision, increments when changes are made to the WordPress DB schema.`