11import IDocument from '../nodes/document/IDocument.js' ;
22import * as PropertySymbol from '../PropertySymbol.js' ;
3- import HTMLElementVoid from '../config/HTMLElementVoid.js' ;
4- import HTMLElementUnnestable from '../config/HTMLElementUnnestable.js' ;
53import NamespaceURI from '../config/NamespaceURI.js' ;
64import HTMLScriptElement from '../nodes/html-script-element/HTMLScriptElement.js' ;
75import IElement from '../nodes/element/IElement.js' ;
86import HTMLLinkElement from '../nodes/html-link-element/HTMLLinkElement.js' ;
9- import HTMLElementPlainText from '../config/HTMLElementPlainText.js' ;
107import IDocumentType from '../nodes/document-type/IDocumentType.js' ;
118import INode from '../nodes/node/INode.js' ;
129import IDocumentFragment from '../nodes/document-fragment/IDocumentFragment.js' ;
10+ import HTMLElementConfig from '../config/HTMLElementConfig.js' ;
1311import * as Entities from 'entities' ;
12+ import HTMLElementConfigContentModelEnum from '../config/HTMLElementConfigContentModelEnum.js' ;
1413
1514/**
1615 * Markup RegExp.
@@ -58,6 +57,8 @@ const DOCUMENT_TYPE_ATTRIBUTE_REGEXP = /"([^"]+)"/gm;
5857
5958/**
6059 * XML parser.
60+ *
61+ * @see https://html.spec.whatwg.org/multipage/indices.html
6162 */
6263export default class XMLParser {
6364 /**
@@ -77,12 +78,11 @@ export default class XMLParser {
7778 ) : IElement | IDocumentFragment | IDocument {
7879 const root = options && options . rootNode ? options . rootNode : document . createDocumentFragment ( ) ;
7980 const stack : INode [ ] = [ root ] ;
81+ const stackTagNames : string [ ] = [ ] ;
8082 const markupRegexp = new RegExp ( MARKUP_REGEXP , 'gm' ) ;
8183 const { evaluateScripts = false } = options || { } ;
82- const unnestableTagNames : string [ ] = [ ] ;
8384 let currentNode : INode | null = root ;
8485 let match : RegExpExecArray ;
85- let plainTextTagName : string | null = null ;
8686 let readState : MarkupReadStateEnum = MarkupReadStateEnum . startOrEndTag ;
8787 let startTagIndex = 0 ;
8888 let lastIndex = 0 ;
@@ -108,19 +108,31 @@ export default class XMLParser {
108108 // Start tag.
109109 const tagName = match [ 1 ] . toUpperCase ( ) ;
110110 const localName = tagName === 'SVG' ? 'svg' : match [ 1 ] ;
111+ const config = HTMLElementConfig [ localName ] ;
111112
112113 // Some elements are not allowed to be nested (e.g. "<a><a></a></a>" is not allowed).
113114 // Therefore we need to auto-close the tag, so that it becomes valid (e.g. "<a></a><a></a>").
114- const unnestableTagNameIndex = unnestableTagNames . indexOf ( tagName ) ;
115- if ( unnestableTagNameIndex !== - 1 ) {
116- unnestableTagNames . splice ( unnestableTagNameIndex , 1 ) ;
115+ if (
116+ config ?. contentModel ===
117+ HTMLElementConfigContentModelEnum . noFirsLevelSelfDescendants &&
118+ stackTagNames [ stackTagNames . length - 1 ] === tagName
119+ ) {
120+ stack . pop ( ) ;
121+ stackTagNames . pop ( ) ;
122+ currentNode = stack [ stack . length - 1 ] || root ;
123+ } else if (
124+ config ?. contentModel === HTMLElementConfigContentModelEnum . noSelfDescendants &&
125+ stackTagNames . includes ( tagName )
126+ ) {
117127 while ( currentNode !== root ) {
118128 if ( ( < IElement > currentNode ) [ PropertySymbol . tagName ] . toUpperCase ( ) === tagName ) {
119129 stack . pop ( ) ;
130+ stackTagNames . pop ( ) ;
120131 currentNode = stack [ stack . length - 1 ] || root ;
121132 break ;
122133 }
123134 stack . pop ( ) ;
135+ stackTagNames . pop ( ) ;
124136 currentNode = stack [ stack . length - 1 ] || root ;
125137 }
126138 }
@@ -136,25 +148,18 @@ export default class XMLParser {
136148 currentNode . appendChild ( newElement ) ;
137149 currentNode = newElement ;
138150 stack . push ( currentNode ) ;
151+ stackTagNames . push ( tagName ) ;
139152 readState = MarkupReadStateEnum . insideStartTag ;
140153 startTagIndex = markupRegexp . lastIndex ;
141154 } else if ( match [ 2 ] ) {
142155 // End tag.
143156
144157 if (
145158 match [ 2 ] . toUpperCase ( ) ===
146- ( < IElement > currentNode ) [ PropertySymbol . tagName ] . toUpperCase ( )
159+ ( < IElement > currentNode ) [ PropertySymbol . tagName ] ? .toUpperCase ( )
147160 ) {
148- // Some elements are not allowed to be nested (e.g. "<a><a></a></a>" is not allowed.).
149- // Therefore we need to auto-close the tag, so that it become valid (e.g. "<a></a><a></a>").
150- const unnestableTagNameIndex = unnestableTagNames . indexOf (
151- ( < IElement > currentNode ) [ PropertySymbol . tagName ] . toUpperCase ( )
152- ) ;
153- if ( unnestableTagNameIndex !== - 1 ) {
154- unnestableTagNames . splice ( unnestableTagNameIndex , 1 ) ;
155- }
156-
157161 stack . pop ( ) ;
162+ stackTagNames . pop ( ) ;
158163 currentNode = stack [ stack . length - 1 ] || root ;
159164 }
160165 } else if (
@@ -201,8 +206,6 @@ export default class XMLParser {
201206 case MarkupReadStateEnum . insideStartTag :
202207 // End of start tag
203208 if ( match [ 7 ] || match [ 8 ] ) {
204- // End of start tag.
205-
206209 // Attribute name and value.
207210
208211 const attributeString = xml . substring ( startTagIndex , match . index ) ;
@@ -257,33 +260,27 @@ export default class XMLParser {
257260 // We need to check if the attribute string is read completely.
258261 // The attribute string can potentially contain "/>" or ">".
259262 if ( hasAttributeStringEnded ) {
263+ const config = HTMLElementConfig [ ( < IElement > currentNode ) [ PropertySymbol . localName ] ] ;
264+
260265 // Checks if the tag is a self closing tag (ends with "/>") or void element.
261266 // When it is a self closing tag or void element it should be closed immediately.
262267 // Self closing tags are not allowed in the HTML namespace, but the parser should still allow them for void elements.
263268 // Self closing tags are supported in the SVG namespace.
264269 if (
265- HTMLElementVoid [ ( < IElement > currentNode ) [ PropertySymbol . tagName ] ] ||
270+ config ?. contentModel === HTMLElementConfigContentModelEnum . noDescendants ||
271+ // SVG tag is self closing (<svg/>).
266272 ( match [ 7 ] &&
267273 ( < IElement > currentNode ) [ PropertySymbol . namespaceURI ] === NamespaceURI . svg )
268274 ) {
269275 stack . pop ( ) ;
276+ stackTagNames . pop ( ) ;
270277 currentNode = stack [ stack . length - 1 ] || root ;
271278 readState = MarkupReadStateEnum . startOrEndTag ;
272279 } else {
273- // Plain text elements such as <script> and <style> should only contain text.
274- plainTextTagName = HTMLElementPlainText [
275- ( < IElement > currentNode ) [ PropertySymbol . tagName ]
276- ]
277- ? ( < IElement > currentNode ) [ PropertySymbol . tagName ]
278- : null ;
279-
280- readState = ! ! plainTextTagName
281- ? MarkupReadStateEnum . plainTextContent
282- : MarkupReadStateEnum . startOrEndTag ;
283-
284- if ( HTMLElementUnnestable [ ( < IElement > currentNode ) [ PropertySymbol . tagName ] ] ) {
285- unnestableTagNames . push ( ( < IElement > currentNode ) [ PropertySymbol . tagName ] ) ;
286- }
280+ readState =
281+ config ?. contentModel === HTMLElementConfigContentModelEnum . rawText
282+ ? MarkupReadStateEnum . plainTextContent
283+ : MarkupReadStateEnum . startOrEndTag ;
287284 }
288285
289286 startTagIndex = markupRegexp . lastIndex ;
@@ -292,15 +289,17 @@ export default class XMLParser {
292289
293290 break ;
294291 case MarkupReadStateEnum . plainTextContent :
295- if ( match [ 2 ] && match [ 2 ] . toUpperCase ( ) === plainTextTagName ) {
292+ const tagName = currentNode [ PropertySymbol . tagName ] ;
293+
294+ if ( tagName && match [ 2 ] && match [ 2 ] . toUpperCase ( ) === tagName ) {
296295 // End of plain text tag.
297296
298297 // Scripts are not allowed to be executed when they are parsed using innerHTML, outerHTML, replaceWith() etc.
299298 // However, they are allowed to be executed when document.write() is used.
300299 // See: https://developer.mozilla.org/en-US/docs/Web/API/HTMLScriptElement
301- if ( plainTextTagName === 'SCRIPT' ) {
300+ if ( tagName === 'SCRIPT' ) {
302301 ( < HTMLScriptElement > currentNode ) [ PropertySymbol . evaluateScript ] = evaluateScripts ;
303- } else if ( plainTextTagName === 'LINK' ) {
302+ } else if ( tagName === 'LINK' ) {
304303 // An assumption that the same rule should be applied for the HTMLLinkElement is made here.
305304 ( < HTMLLinkElement > currentNode ) [ PropertySymbol . evaluateCSS ] = evaluateScripts ;
306305 }
@@ -313,8 +312,8 @@ export default class XMLParser {
313312 ) ;
314313
315314 stack . pop ( ) ;
315+ stackTagNames . pop ( ) ;
316316 currentNode = stack [ stack . length - 1 ] || root ;
317- plainTextTagName = null ;
318317 readState = MarkupReadStateEnum . startOrEndTag ;
319318 }
320319
0 commit comments