@@ -1231,6 +1231,246 @@ public function serialize(): ?string {
12311231 return $ html ;
12321232 }
12331233
/**
 * Converts an HTML fragment to the XML syntax (XHTML).
 *
 * Warning! HTML cannot be fully expressed in the XML syntax and _**no**_ XML/XHTML
 * should be served with the `Content-type: text/html` header - THIS IS DANGEROUS!!!
 *
 * Only call this function when directly embedding an HTML document into an XML
 * document, such as when generating unescaped RSS feeds and WXR exports. Even
 * still, the conversion may fail BECAUSE XML CANNOT REPRESENT ALL HTML DOCUMENTS.
 *
 * You probably want {@see static::serialize()} instead! HTML is _not_ XML - they
 * are separate languages and represent different content models.
 *
 * > Using the XML syntax is not recommended
 *
 * @see https://html.spec.whatwg.org/#the-xhtml-syntax
 *
 * Many aspects of an input HTML fragment may be changed during normalization.
 *
 *  - Attribute values will be double-quoted.
 *  - Boolean attributes will be given their own name as a value.
 *  - Duplicate attributes will be removed.
 *  - Omitted tags will be added.
 *  - Void elements will be written with the self-closing ` />` syntax.
 *  - Tag and attribute name casing will be lower-cased,
 *    except for specific SVG and MathML tags or attributes.
 *  - An `xmlns` attribute will be added to the HTML, SVG, and MATH elements
 *    and to HTML integration points.
 *  - The contents of IFRAME, NOEMBED, and NOFRAMES elements will be dropped.
 *  - Text will be re-encoded, null bytes handled,
 *    and invalid UTF-8 replaced with U+FFFD.
 *  - Any incomplete syntax trailing at the end will be omitted,
 *    for example, an unclosed comment opener will be removed.
 *
 * The conversion is abandoned, a warning is triggered, and `null` is returned when:
 *
 *  - The processor has already started processing its input.
 *  - A comment contains a double dash `--` or would end with a dash `-`.
 *  - A tag name contains a colon `:`.
 *  - An `xmlns` attribute appears on an HTML element other than the HTML element itself.
 *  - The parser reported an error while processing the input.
 *
 * Example:
 *
 *     $processor = WP_HTML_Processor::create_fragment( '<a href=#anchor v=5 href="/" enabled>One</a another v=5><!--' );
 *     echo $processor->serialize_to_xml();
 *     // <a href="#anchor" v="5" enabled="enabled">One</a>
 *
 *     $processor = WP_HTML_Processor::create_fragment( '<div></p>fun<table><td>cell</div>' );
 *     echo $processor->serialize_to_xml();
 *     // <div><p></p>fun<table><tbody><tr><td>cell</td></tr></tbody></table></div>
 *
 *     $processor = WP_HTML_Processor::create_fragment( '<![CDATA[invalid comment]]> syntax < <> "oddities"' );
 *     echo $processor->serialize_to_xml();
 *     // <!--[CDATA[invalid comment]]--> syntax &lt; &lt;&gt; &quot;oddities&quot;
 *
 * @since 6.7.0
 *
 * @return string|null Normalized XML markup represented by processor,
 *                     or `null` if unable to generate serialization.
 */
public function serialize_to_xml(): ?string {
	if ( WP_HTML_Tag_Processor::STATE_READY !== $this->parser_state ) {
		wp_trigger_error(
			__METHOD__,
			"An HTML Processor which has already started processing cannot serialize it's contents. Serialize immediately after creating the instance.",
			E_USER_WARNING
		);
		return null;
	}

	// The XML declaration is only emitted when no context node is set (presumably a full document rather than a fragment).
	$html = isset( $this->context_node ) ? '' : "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n";
	while ( $this->next_token() ) {
		$token_type = $this->get_token_type();

		// @todo Bail when content contains unallowed XML characters.
		switch ( $token_type ) {
			case '#text':
				$html .= htmlspecialchars( $this->get_modifiable_text(), ENT_QUOTES | ENT_SUBSTITUTE | ENT_XML1, 'UTF-8' );
				break;

			// Unlike the `<>` which is interpreted as plaintext, this is ignored entirely.
			case '#presumptuous-tag':
				break;

			case '#funky-comment':
				$text = $this->get_modifiable_text();
				if ( str_contains( $text, '--' ) ) {
					wp_trigger_error(
						__METHOD__,
						"XML cannot encode a comment with a double dash '--' inside of it.",
						E_USER_WARNING
					);
					return null;
				}

				// A trailing dash would produce `--->`, which the XML grammar for comments forbids.
				if ( '-' === substr( $text, -1 ) ) {
					wp_trigger_error(
						__METHOD__,
						"XML cannot encode a comment which ends with a dash '-'.",
						E_USER_WARNING
					);
					return null;
				}

				$html .= "<!--{$text}-->";
				break;

			case '#comment':
				$text = $this->get_modifiable_text();
				if ( str_contains( $text, '--' ) ) {
					wp_trigger_error(
						__METHOD__,
						"XML cannot encode a comment with a double dash '--' inside of it.",
						E_USER_WARNING
					);
					return null;
				}

				switch ( $this->get_comment_type() ) {
					case WP_HTML_Tag_Processor::COMMENT_AS_CDATA_LOOKALIKE:
						$html .= "<!--[CDATA[{$text}]]-->";
						break;

					case WP_HTML_Tag_Processor::COMMENT_AS_PI_NODE_LOOKALIKE:
						$html .= "<!--?{$this->get_tag()}{$text}?-->";
						break;

					default:
						/*
						 * Only the plain form places the text directly before the closing `-->`;
						 * the lookalike forms above wrap it with `]]` or `?` so a trailing dash
						 * is harmless there.
						 */
						if ( '-' === substr( $text, -1 ) ) {
							wp_trigger_error(
								__METHOD__,
								"XML cannot encode a comment which ends with a dash '-'.",
								E_USER_WARNING
							);
							return null;
						}

						$html .= "<!--{$text}-->";
				}
				break;

			case '#cdata-section':
				$html .= "<![CDATA[{$this->get_modifiable_text()}]]>";
				break;
		}

		// Everything below only concerns tag tokens; other tokens were fully handled above.
		if ( '#tag' !== $token_type ) {
			continue;
		}

		$tag_name       = $this->get_tag();
		$in_html        = 'html' === $this->get_namespace();
		$is_void        = $in_html && static::is_void( $tag_name );
		$qualified_name = $in_html ? strtolower( $tag_name ) : $this->get_qualified_tag_name();

		if ( str_contains( $tag_name, ':' ) ) {
			wp_trigger_error(
				__METHOD__,
				"The element '{$tag_name}' cannot be expressed in the XML syntax because colon ':' conflates with the XML namespace separator.",
				E_USER_WARNING
			);
			return null;
		}

		// @todo Check Name production in XML and abort if name doesn't match.

		if ( $this->is_tag_closer() ) {
			$html .= "</{$qualified_name}>";
			continue;
		}

		/*
		 * Foreign-content roots need an explicit namespace declaration in XML.
		 * The attribute is set on the processor and the update flushed so that
		 * it appears in the attribute list gathered below.
		 */
		switch ( $tag_name ) {
			case 'MATH':
				if ( 'math' === $this->get_namespace() ) {
					$this->set_attribute( 'xmlns', 'http://www.w3.org/1998/Math/MathML' );
					$this->get_updated_html();
				}
				break;

			case 'SVG':
				if ( 'svg' === $this->get_namespace() ) {
					$this->set_attribute( 'xmlns', 'http://www.w3.org/2000/svg' );
					$this->get_updated_html();
				}
				break;
		}

		// HTML integration points are given the XHTML namespace.
		if ( $this->is_html_integration_point() ) {
			$this->set_attribute( 'xmlns', 'http://www.w3.org/1999/xhtml' );
			$this->get_updated_html();
		}

		$attribute_names = $this->get_attribute_names_with_prefix( '' );
		if ( ! isset( $attribute_names ) && ! ( $in_html && 'HTML' === $tag_name ) ) {
			// Fast path for attribute-less tags; the HTML element is excluded because it always receives `xmlns` below.
			$html .= $is_void ? "<{$qualified_name} />" : "<{$qualified_name}>";
			continue;
		}

		$html .= "<{$qualified_name}";
		foreach ( $attribute_names ?? array() as $attribute_name ) {
			if ( 'xmlns' === $attribute_name && $in_html && ! in_array( $tag_name, array( 'HTML', 'SVG', 'MATH' ), true ) ) {
				wp_trigger_error(
					__METHOD__,
					"The attribute 'xmlns' cannot be expressed in the XML syntax.",
					E_USER_WARNING
				);
				return null;
			}

			/*
			 * @todo Check all of the other adjusted foreign attributes, e.g. xlink:actuate or xml:lang.
			 *
			 * For example, if a tag contains `xlink:actuate` and also `actuate` then it must fail
			 * the conversion since it's ambiguous which one is valid. The same is true for the
			 * `xml:lang` and `lang` attributes.
			 *
			 * @see https://html.spec.whatwg.org/#adjust-foreign-attributes
			 */

			// Any space in the qualified name is rewritten as the XML namespace separator.
			$html .= ' ' . str_replace( ' ', ':', $this->get_qualified_attribute_name( $attribute_name ) );
			$value = $this->get_attribute( $attribute_name );

			if ( is_string( $value ) ) {
				$html .= '="' . htmlspecialchars( $value, ENT_QUOTES | ENT_SUBSTITUTE | ENT_XML1, 'UTF-8' ) . '"';
			} else {
				// XML has no value-less attributes, so non-string values repeat the attribute name as the value.
				$html .= "=\"{$attribute_name}\"";
			}
		}

		// The HTML node is often virtual, so it's not possible to `set_attribute()` on it.
		if ( $in_html && 'HTML' === $tag_name && ! is_string( $this->get_attribute( 'xmlns' ) ) ) {
			$html .= ' xmlns="http://www.w3.org/1999/xhtml"';
		}

		if ( $is_void || ( ! $in_html && $this->has_self_closing_flag() ) ) {
			$html .= ' /';
		}

		$html .= '>';

		// Flush out self-contained elements.
		if ( $in_html && in_array( $tag_name, array( 'IFRAME', 'NOEMBED', 'NOFRAMES', 'SCRIPT', 'STYLE', 'TEXTAREA', 'TITLE', 'XMP' ), true ) ) {
			$text = $this->get_modifiable_text();

			switch ( $tag_name ) {
				// The contents of these elements are dropped from the output.
				case 'IFRAME':
				case 'NOEMBED':
				case 'NOFRAMES':
					$text = '';
					break;

				default:
					$text = htmlspecialchars( $text, ENT_QUOTES | ENT_SUBSTITUTE | ENT_XML1, 'UTF-8' );
			}

			$html .= "{$text}</{$qualified_name}>";
		}
	}

	// The token loop also stops when the parser fails; never return partial output in that case.
	if ( null !== $this->get_last_error() ) {
		wp_trigger_error(
			__METHOD__,
			"Cannot serialize HTML Processor with parsing error: {$this->get_last_error()}.",
			E_USER_WARNING
		);
		return null;
	}

	return $html;
}
1473+
12341474 /**
12351475 * Parses next element in the 'initial' insertion mode.
12361476 *
0 commit comments