Skip to content

Commit f914c3f

Browse files
committed
Add an XML serializer.
1 parent 18b5005 commit f914c3f

1 file changed

Lines changed: 240 additions & 0 deletions

File tree

src/wp-includes/html-api/class-wp-html-processor.php

Lines changed: 240 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1231,6 +1231,246 @@ public function serialize(): ?string {
12311231
return $html;
12321232
}
12331233

1234+
/**
 * Converts an HTML fragment to the XML syntax (XHTML).
 *
 * Warning! HTML cannot be fully expressed in the XML syntax and _**no**_ XML/XHTML
 * should be served with the `Content-type: text/html` header - THIS IS DANGEROUS!!!
 *
 * Only call this function when directly embedding an HTML document into an XML
 * document, such as when generating unescaped RSS feeds and WXR exports. Even
 * still, the conversion may fail BECAUSE XML CANNOT REPRESENT ALL HTML DOCUMENTS.
 *
 * You probably want {@see static::serialize()} instead! HTML is _not_ XML - they
 * are separate languages and represent different content models.
 *
 * > Using the XML syntax is not recommended
 *
 * @see https://html.spec.whatwg.org/#the-xhtml-syntax
 *
 * Many aspects of an input HTML fragment may be changed during normalization.
 *
 *  - Attribute values will be double-quoted.
 *  - Attributes without a value will be expanded to `name="name"`.
 *  - Duplicate attributes will be removed.
 *  - Omitted tags will be added.
 *  - Void HTML elements will be written with a trailing ` />`.
 *  - Tag and attribute name casing will be lower-cased,
 *    except for specific SVG and MathML tags or attributes.
 *  - Text will be re-encoded, null bytes handled,
 *    and invalid UTF-8 replaced with U+FFFD.
 *  - Any incomplete syntax trailing at the end will be omitted,
 *    for example, an unclosed comment opener will be removed.
 *  - An `xmlns` declaration is added to the HTML element, to SVG and MathML
 *    roots, and to HTML integration points.
 *  - The contents of IFRAME, NOEMBED, and NOFRAMES elements are dropped.
 *  - When the processor has no context node, an XML declaration is prepended.
 *
 * The conversion is abandoned (a warning is triggered and `null` is returned) when:
 *
 *  - The processor has already started processing its input.
 *  - A comment contains a double dash `--`, or a plain comment ends with a dash `-`.
 *  - A tag name contains a colon `:`.
 *  - An `xmlns` attribute appears on an HTML-namespace element other than HTML, SVG, or MATH.
 *  - The processor reports a parsing error once all tokens have been visited.
 *
 * Example:
 *
 *     $processor = WP_HTML_Processor::create_fragment( '<a href=#anchor v=5 href="/" enabled>One</a another v=5><!--' );
 *     echo $processor->serialize_to_xml();
 *     // <a href="#anchor" v="5" enabled="enabled">One</a>
 *
 *     $processor = WP_HTML_Processor::create_fragment( '<div></p>fun<table><td>cell</div>' );
 *     echo $processor->serialize_to_xml();
 *     // <div><p></p>fun<table><tbody><tr><td>cell</td></tr></tbody></table></div>
 *
 *     $processor = WP_HTML_Processor::create_fragment( '<![CDATA[invalid comment]]> syntax < <> "oddities"' );
 *     echo $processor->serialize_to_xml();
 *     // <!--[CDATA[invalid comment]]--> syntax &lt; &lt;&gt; &quot;oddities&quot;
 *
 * @since 6.7.0
 *
 * @return string|null Normalized XML markup represented by processor,
 *                     or `null` if unable to generate serialization.
 */
public function serialize_to_xml(): ?string {
	if ( WP_HTML_Tag_Processor::STATE_READY !== $this->parser_state ) {
		wp_trigger_error(
			__METHOD__,
			'An HTML Processor which has already started processing cannot serialize its contents. Serialize immediately after creating the instance.',
			E_USER_WARNING
		);
		return null;
	}

	// Only a processor without a context node receives the XML declaration.
	$html = isset( $this->context_node ) ? '' : "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n";
	while ( $this->next_token() ) {
		$token_type = $this->get_token_type();

		// @todo Bail when content contains unallowed XML characters.
		switch ( $token_type ) {
			case '#text':
				$html .= htmlspecialchars( $this->get_modifiable_text(), ENT_QUOTES | ENT_SUBSTITUTE | ENT_XML1, 'UTF-8' );
				break;

			// Unlike the `<>` which is interpreted as plaintext, this is ignored entirely.
			case '#presumptuous-tag':
				break;

			case '#funky-comment':
			case '#comment':
				$text = $this->get_modifiable_text();
				if ( str_contains( $text, '--' ) ) {
					wp_trigger_error(
						__METHOD__,
						"XML cannot encode a comment with a double dash '--' inside of it.",
						E_USER_WARNING
					);
					return null;
				}

				// Funky comments have no sub-type; they are always written as plain comments.
				$comment_type = '#comment' === $token_type ? $this->get_comment_type() : null;

				if ( WP_HTML_Tag_Processor::COMMENT_AS_CDATA_LOOKALIKE === $comment_type ) {
					$html .= "<!--[CDATA[{$text}]]-->";
				} elseif ( WP_HTML_Tag_Processor::COMMENT_AS_PI_NODE_LOOKALIKE === $comment_type ) {
					$html .= "<!--?{$this->get_tag()}{$text}?-->";
				} elseif ( str_ends_with( $text, '-' ) ) {
					/*
					 * Here the text sits directly before the `-->` terminator, so a trailing
					 * dash would produce `--->`, which the XML comment grammar forbids.
					 * The look-alike forms above are safe since they append their own suffix.
					 */
					wp_trigger_error(
						__METHOD__,
						"XML cannot encode a comment whose content ends with a dash '-'.",
						E_USER_WARNING
					);
					return null;
				} else {
					$html .= "<!--{$text}-->";
				}
				break;

			case '#cdata-section':
				$html .= "<![CDATA[{$this->get_modifiable_text()}]]>";
				break;
		}

		// Every other token type has been fully handled (or intentionally ignored) above.
		if ( '#tag' !== $token_type ) {
			continue;
		}

		$tag_name        = $this->get_tag();
		$in_html         = 'html' === $this->get_namespace();
		$is_void         = $in_html && static::is_void( $tag_name );
		$is_html_element = $in_html && 'HTML' === $tag_name;
		$qualified_name  = $in_html ? strtolower( $tag_name ) : $this->get_qualified_tag_name();

		if ( str_contains( $tag_name, ':' ) ) {
			wp_trigger_error(
				__METHOD__,
				"The element '{$tag_name}' cannot be expressed in the XML syntax because colon ':' conflates with the XML namespace separator.",
				E_USER_WARNING
			);
			return null;
		}

		// @todo Check Name production in XML and abort if name doesn't match.

		if ( $this->is_tag_closer() ) {
			$html .= "</{$qualified_name}>";
			continue;
		}

		// Foreign-content roots must declare their namespace explicitly in XML.
		switch ( $tag_name ) {
			case 'MATH':
				if ( 'math' === $this->get_namespace() ) {
					$this->set_attribute( 'xmlns', 'http://www.w3.org/1998/Math/MathML' );
					$this->get_updated_html();
				}
				break;

			case 'SVG':
				if ( 'svg' === $this->get_namespace() ) {
					$this->set_attribute( 'xmlns', 'http://www.w3.org/2000/svg' );
					$this->get_updated_html();
				}
				break;
		}

		if ( $this->is_html_integration_point() ) {
			$this->set_attribute( 'xmlns', 'http://www.w3.org/1999/xhtml' );
			$this->get_updated_html();
		}

		// Shortcut for tokens reporting no attribute list; the HTML element still needs its `xmlns`.
		$attribute_names = $this->get_attribute_names_with_prefix( '' );
		if ( ! isset( $attribute_names ) && ! $is_html_element ) {
			$html .= $is_void ? "<{$qualified_name} />" : "<{$qualified_name}>";
			continue;
		}

		$html .= "<{$qualified_name}";
		foreach ( $attribute_names ?? array() as $attribute_name ) {
			if ( 'xmlns' === $attribute_name && $in_html && ! in_array( $tag_name, array( 'HTML', 'SVG', 'MATH' ), true ) ) {
				wp_trigger_error(
					__METHOD__,
					"The attribute 'xmlns' cannot be expressed in the XML syntax.",
					E_USER_WARNING
				);
				return null;
			}

			/*
			 * @todo Check all of the other adjusted foreign attributes, e.g. xlink:actuate or xml:lang.
			 *
			 * For example, if a tag contains `xlink:actuate` and also `actuate` then it must fail
			 * the conversion since it's ambiguous which one is valid. The same is true for the
			 * `xml:lang` and `lang` attributes.
			 *
			 * @see https://html.spec.whatwg.org/#adjust-foreign-attributes
			 */

			$value = $this->get_attribute( $attribute_name );

			/*
			 * A valueless `xmlns` on the HTML element is replaced by the XHTML namespace
			 * declaration appended after this loop. Emitting it here as well would write
			 * the attribute twice, which is not well-formed XML.
			 */
			if ( 'xmlns' === $attribute_name && $is_html_element && ! is_string( $value ) ) {
				continue;
			}

			$html .= ' ' . str_replace( ' ', ':', $this->get_qualified_attribute_name( $attribute_name ) );

			if ( is_string( $value ) ) {
				$html .= '="' . htmlspecialchars( $value, ENT_QUOTES | ENT_SUBSTITUTE | ENT_XML1, 'UTF-8' ) . '"';
			} else {
				// XML has no valueless attributes; repeat the name as the value.
				$html .= "=\"{$attribute_name}\"";
			}
		}

		// The HTML node is often virtual, so it's not possible to `set_attribute()` on it.
		if ( $is_html_element && ! is_string( $this->get_attribute( 'xmlns' ) ) ) {
			$html .= ' xmlns="http://www.w3.org/1999/xhtml"';
		}

		if ( $is_void || ( ! $in_html && $this->has_self_closing_flag() ) ) {
			$html .= ' /';
		}

		$html .= '>';

		// Flush out self-contained elements.
		if ( $in_html && in_array( $tag_name, array( 'IFRAME', 'NOEMBED', 'NOFRAMES', 'SCRIPT', 'STYLE', 'TEXTAREA', 'TITLE', 'XMP' ), true ) ) {
			$text = $this->get_modifiable_text();

			switch ( $tag_name ) {
				// The contents of these elements are dropped from the output.
				case 'IFRAME':
				case 'NOEMBED':
				case 'NOFRAMES':
					$text = '';
					break;

				default:
					$text = htmlspecialchars( $text, ENT_QUOTES | ENT_SUBSTITUTE | ENT_XML1, 'UTF-8' );
			}

			$html .= "{$text}</{$qualified_name}>";
		}
	}

	// A parsing error leaves the output incomplete, so discard it.
	if ( null !== $this->get_last_error() ) {
		wp_trigger_error(
			__METHOD__,
			"Cannot serialize HTML Processor with parsing error: {$this->get_last_error()}.",
			E_USER_WARNING
		);
		return null;
	}

	return $html;
}
1473+
12341474
/**
12351475
* Parses next element in the 'initial' insertion mode.
12361476
*

0 commit comments

Comments
 (0)