From 71b86cfc7801ae4dc927411bcb79dbcbf22c9142 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Tue, 15 Jul 2025 18:19:28 -0500 Subject: [PATCH] HTML API: Reliably parse HTML in `get_url_in_content()` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Trac ticket: Core-63694 This also decodes the URL whereas the previous code didn’t, so strings like `http&colon;//` will be properly decoded as `http://`. --- src/wp-includes/formatting.php | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/wp-includes/formatting.php b/src/wp-includes/formatting.php index 7f69321a7199c..72fcb4e554c38 100644 --- a/src/wp-includes/formatting.php +++ b/src/wp-includes/formatting.php @@ -5978,16 +5978,20 @@ function wp_unslash( $value ) { * * @since 3.6.0 * - * @param string $content A string which might contain a URL. - * @return string|false The found URL. + * @param string $content A string which might contain an `A` element with a non-empty `href` attribute. + * @return string|false Database-escaped URL via {@see esc_url()} if found, otherwise `false`. */ function get_url_in_content( $content ) { if ( empty( $content ) ) { return false; } - if ( preg_match( '/<a\s[^>]*?href=([\'"])(.+?)\1/is', $content, $matches ) ) { - return sanitize_url( $matches[2] ); + $processor = new WP_HTML_Tag_Processor( $content ); + while ( $processor->next_tag( 'A' ) ) { + $href = $processor->get_attribute( 'href' ); + if ( is_string( $href ) && ! empty( $href ) ) { + return sanitize_url( $href ); + } } return false;