From 71b86cfc7801ae4dc927411bcb79dbcbf22c9142 Mon Sep 17 00:00:00 2001
From: Dennis Snell <dennis.snell@automattic.com>
Date: Tue, 15 Jul 2025 18:19:28 -0500
Subject: [PATCH] HTML API: Reliably parse HTML in `get_url_in_content()`
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Trac ticket: Core-63694

This also decodes HTML character references in the `href` value, whereas
the previous code didn’t, so strings like `http&#x3A;//` will be properly
decoded as `http://`.
---
 src/wp-includes/formatting.php | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/wp-includes/formatting.php b/src/wp-includes/formatting.php
index 7f69321a7199c..72fcb4e554c38 100644
--- a/src/wp-includes/formatting.php
+++ b/src/wp-includes/formatting.php
@@ -5978,16 +5978,20 @@ function wp_unslash( $value ) {
  *
  * @since 3.6.0
  *
- * @param string $content A string which might contain a URL.
- * @return string|false The found URL.
+ * @param string $content A string which might contain an `A` element with a non-empty `href` attribute.
+ * @return string|false Database-escaped URL via {@see sanitize_url()} if found, otherwise `false`.
  */
 function get_url_in_content( $content ) {
 	if ( empty( $content ) ) {
 		return false;
 	}
 
-	if ( preg_match( '/<a\s[^>]*?href=([\'"])(.+?)\1/is', $content, $matches ) ) {
-		return sanitize_url( $matches[2] );
+	$processor = new WP_HTML_Tag_Processor( $content );
+	while ( $processor->next_tag( 'A' ) ) {
+		$href = $processor->get_attribute( 'href' );
+		if ( is_string( $href ) && ! empty( $href ) ) {
+			return sanitize_url( $href );
+		}
 	}
 
 	return false;