Skip to content

Commit 713aa3c

Browse files
authored
Merge pull request #1304 from capricorn86/1039-can-not-parse-correctly-html-with-nested-ul-and-li-tags
1039 can not parse correctly html with nested ul and li tags
2 parents 4970c69 + 23e9616 commit 713aa3c

File tree

11 files changed

+815
-200
lines changed

11 files changed

+815
-200
lines changed

packages/happy-dom/src/config/HTMLElementConfig.ts

Lines changed: 710 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
/**
 * Content model of an HTML element.
 *
 * Each value tells the markup parser how the element may be nested and
 * when an open element has to be closed automatically.
 */
enum HTMLElementConfigContentModelEnum {
	/**
	 * The content is read as plain text until the matching end tag is found
	 * (the parser switches to its plain text read state after the start tag).
	 */
	rawText = 'rawText',
	/**
	 * The element may not contain itself at any depth.
	 * When a start tag with the same tag name as an already open element is found,
	 * the parser closes open elements up to and including that element
	 * (e.g. "<a><a></a></a>" is treated as "<a></a><a></a>").
	 */
	noSelfDescendants = 'noSelfDescendants',
	/**
	 * The element may not be a direct child of itself.
	 * When the innermost open element has the same tag name as the new start tag,
	 * the parser closes that element before opening the new one.
	 *
	 * NOTE: "Firs" is a misspelling of "First". The name and value are kept as-is,
	 * as they are referenced by other modules.
	 */
	noFirsLevelSelfDescendants = 'noFirsLevelSelfDescendants',
	/**
	 * The element can not have any content (void element).
	 * The parser closes it immediately after its start tag.
	 */
	noDescendants = 'noDescendants',
	/**
	 * No nesting restriction is associated with this value.
	 */
	anyDescendants = 'anyDescendants'
}
8+
9+
export default HTMLElementConfigContentModelEnum;

packages/happy-dom/src/config/HTMLElementLocalNameToClass.ts

Lines changed: 0 additions & 119 deletions
This file was deleted.

packages/happy-dom/src/config/HTMLElementPlainText.ts

Lines changed: 0 additions & 4 deletions
This file was deleted.

packages/happy-dom/src/config/HTMLElementUnnestable.ts

Lines changed: 0 additions & 18 deletions
This file was deleted.

packages/happy-dom/src/config/HTMLElementVoid.ts

Lines changed: 0 additions & 16 deletions
This file was deleted.
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
import HTMLElementConfigContentModelEnum from './HTMLElementConfigContentModelEnum.js';
2+
3+
/**
 * Configuration entry describing a single HTML element.
 */
export default interface IHTMLElementConfigEntity {
	/**
	 * Name of the element class.
	 * It is used as a property key on the owner window to resolve the class of the element.
	 */
	className: string;
	/**
	 * Local name of the element.
	 * NOTE(review): presumably the lower-case name that the configuration is keyed by - confirm against HTMLElementConfig.
	 */
	localName: string;
	/**
	 * Tag name of the element.
	 * NOTE(review): presumably the upper-case form of the local name - confirm against HTMLElementConfig.
	 */
	tagName: string;
	/**
	 * Content model deciding how the markup parser nests and auto-closes the element.
	 */
	contentModel: HTMLElementConfigContentModelEnum;
}

packages/happy-dom/src/nodes/document/Document.ts

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@ import DocumentFragment from '../document-fragment/DocumentFragment.js';
88
import XMLParser from '../../xml-parser/XMLParser.js';
99
import Event from '../../event/Event.js';
1010
import DOMImplementation from '../../dom-implementation/DOMImplementation.js';
11-
import HTMLElementLocalNameToClass from '../../config/HTMLElementLocalNameToClass.js';
1211
import INodeFilter from '../../tree-walker/INodeFilter.js';
1312
import NamespaceURI from '../../config/NamespaceURI.js';
1413
import DocumentType from '../document-type/DocumentType.js';
@@ -51,6 +50,7 @@ import ISVGElementTagNameMap from '../../config/ISVGElementTagNameMap.js';
5150
import ISVGElement from '../svg-element/ISVGElement.js';
5251
import IHTMLFormElement from '../html-form-element/IHTMLFormElement.js';
5352
import IHTMLAnchorElement from '../html-anchor-element/IHTMLAnchorElement.js';
53+
import HTMLElementConfig from '../../config/HTMLElementConfig.js';
5454

5555
const PROCESSING_INSTRUCTION_TARGET_REGEXP = /^[a-z][a-z0-9-]+$/;
5656

@@ -1131,7 +1131,9 @@ export default class Document extends Node implements IDocument {
11311131
}
11321132

11331133
const localName = qualifiedName.toLowerCase();
1134-
const elementClass = this[PropertySymbol.ownerWindow][HTMLElementLocalNameToClass[localName]];
1134+
const elementClass = HTMLElementConfig[localName]
1135+
? this[PropertySymbol.ownerWindow][HTMLElementConfig[localName].className]
1136+
: null;
11351137

11361138
// Known HTML element
11371139
if (elementClass) {

packages/happy-dom/src/xml-parser/XMLParser.ts

Lines changed: 38 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,15 @@
11
import IDocument from '../nodes/document/IDocument.js';
22
import * as PropertySymbol from '../PropertySymbol.js';
3-
import HTMLElementVoid from '../config/HTMLElementVoid.js';
4-
import HTMLElementUnnestable from '../config/HTMLElementUnnestable.js';
53
import NamespaceURI from '../config/NamespaceURI.js';
64
import HTMLScriptElement from '../nodes/html-script-element/HTMLScriptElement.js';
75
import IElement from '../nodes/element/IElement.js';
86
import HTMLLinkElement from '../nodes/html-link-element/HTMLLinkElement.js';
9-
import HTMLElementPlainText from '../config/HTMLElementPlainText.js';
107
import IDocumentType from '../nodes/document-type/IDocumentType.js';
118
import INode from '../nodes/node/INode.js';
129
import IDocumentFragment from '../nodes/document-fragment/IDocumentFragment.js';
10+
import HTMLElementConfig from '../config/HTMLElementConfig.js';
1311
import * as Entities from 'entities';
12+
import HTMLElementConfigContentModelEnum from '../config/HTMLElementConfigContentModelEnum.js';
1413

1514
/**
1615
* Markup RegExp.
@@ -58,6 +57,8 @@ const DOCUMENT_TYPE_ATTRIBUTE_REGEXP = /"([^"]+)"/gm;
5857

5958
/**
6059
* XML parser.
60+
*
61+
* @see https://html.spec.whatwg.org/multipage/indices.html
6162
*/
6263
export default class XMLParser {
6364
/**
@@ -77,12 +78,11 @@ export default class XMLParser {
7778
): IElement | IDocumentFragment | IDocument {
7879
const root = options && options.rootNode ? options.rootNode : document.createDocumentFragment();
7980
const stack: INode[] = [root];
81+
const stackTagNames: string[] = [];
8082
const markupRegexp = new RegExp(MARKUP_REGEXP, 'gm');
8183
const { evaluateScripts = false } = options || {};
82-
const unnestableTagNames: string[] = [];
8384
let currentNode: INode | null = root;
8485
let match: RegExpExecArray;
85-
let plainTextTagName: string | null = null;
8686
let readState: MarkupReadStateEnum = MarkupReadStateEnum.startOrEndTag;
8787
let startTagIndex = 0;
8888
let lastIndex = 0;
@@ -108,19 +108,31 @@ export default class XMLParser {
108108
// Start tag.
109109
const tagName = match[1].toUpperCase();
110110
const localName = tagName === 'SVG' ? 'svg' : match[1];
111+
const config = HTMLElementConfig[localName];
111112

112113
// Some elements are not allowed to be nested (e.g. "<a><a></a></a>" is not allowed).
113114
// Therefore we need to auto-close the tag, so that it becomes valid (e.g. "<a></a><a></a>").
114-
const unnestableTagNameIndex = unnestableTagNames.indexOf(tagName);
115-
if (unnestableTagNameIndex !== -1) {
116-
unnestableTagNames.splice(unnestableTagNameIndex, 1);
115+
if (
116+
config?.contentModel ===
117+
HTMLElementConfigContentModelEnum.noFirsLevelSelfDescendants &&
118+
stackTagNames[stackTagNames.length - 1] === tagName
119+
) {
120+
stack.pop();
121+
stackTagNames.pop();
122+
currentNode = stack[stack.length - 1] || root;
123+
} else if (
124+
config?.contentModel === HTMLElementConfigContentModelEnum.noSelfDescendants &&
125+
stackTagNames.includes(tagName)
126+
) {
117127
while (currentNode !== root) {
118128
if ((<IElement>currentNode)[PropertySymbol.tagName].toUpperCase() === tagName) {
119129
stack.pop();
130+
stackTagNames.pop();
120131
currentNode = stack[stack.length - 1] || root;
121132
break;
122133
}
123134
stack.pop();
135+
stackTagNames.pop();
124136
currentNode = stack[stack.length - 1] || root;
125137
}
126138
}
@@ -136,25 +148,18 @@ export default class XMLParser {
136148
currentNode.appendChild(newElement);
137149
currentNode = newElement;
138150
stack.push(currentNode);
151+
stackTagNames.push(tagName);
139152
readState = MarkupReadStateEnum.insideStartTag;
140153
startTagIndex = markupRegexp.lastIndex;
141154
} else if (match[2]) {
142155
// End tag.
143156

144157
if (
145158
match[2].toUpperCase() ===
146-
(<IElement>currentNode)[PropertySymbol.tagName].toUpperCase()
159+
(<IElement>currentNode)[PropertySymbol.tagName]?.toUpperCase()
147160
) {
148-
// Some elements are not allowed to be nested (e.g. "<a><a></a></a>" is not allowed.).
149-
// Therefore we need to auto-close the tag, so that it become valid (e.g. "<a></a><a></a>").
150-
const unnestableTagNameIndex = unnestableTagNames.indexOf(
151-
(<IElement>currentNode)[PropertySymbol.tagName].toUpperCase()
152-
);
153-
if (unnestableTagNameIndex !== -1) {
154-
unnestableTagNames.splice(unnestableTagNameIndex, 1);
155-
}
156-
157161
stack.pop();
162+
stackTagNames.pop();
158163
currentNode = stack[stack.length - 1] || root;
159164
}
160165
} else if (
@@ -201,8 +206,6 @@ export default class XMLParser {
201206
case MarkupReadStateEnum.insideStartTag:
202207
// End of start tag
203208
if (match[7] || match[8]) {
204-
// End of start tag.
205-
206209
// Attribute name and value.
207210

208211
const attributeString = xml.substring(startTagIndex, match.index);
@@ -257,33 +260,27 @@ export default class XMLParser {
257260
// We need to check if the attribute string is read completely.
258261
// The attribute string can potentially contain "/>" or ">".
259262
if (hasAttributeStringEnded) {
263+
const config = HTMLElementConfig[(<IElement>currentNode)[PropertySymbol.localName]];
264+
260265
// Checks if the tag is a self closing tag (ends with "/>") or void element.
261266
// When it is a self closing tag or void element it should be closed immediately.
262267
// Self closing tags are not allowed in the HTML namespace, but the parser should still allow it for void elements.
263268
// Self-closing tags are supported in the SVG namespace.
264269
if (
265-
HTMLElementVoid[(<IElement>currentNode)[PropertySymbol.tagName]] ||
270+
config?.contentModel === HTMLElementConfigContentModelEnum.noDescendants ||
271+
// SVG tag is self closing (<svg/>).
266272
(match[7] &&
267273
(<IElement>currentNode)[PropertySymbol.namespaceURI] === NamespaceURI.svg)
268274
) {
269275
stack.pop();
276+
stackTagNames.pop();
270277
currentNode = stack[stack.length - 1] || root;
271278
readState = MarkupReadStateEnum.startOrEndTag;
272279
} else {
273-
// Plain text elements such as <script> and <style> should only contain text.
274-
plainTextTagName = HTMLElementPlainText[
275-
(<IElement>currentNode)[PropertySymbol.tagName]
276-
]
277-
? (<IElement>currentNode)[PropertySymbol.tagName]
278-
: null;
279-
280-
readState = !!plainTextTagName
281-
? MarkupReadStateEnum.plainTextContent
282-
: MarkupReadStateEnum.startOrEndTag;
283-
284-
if (HTMLElementUnnestable[(<IElement>currentNode)[PropertySymbol.tagName]]) {
285-
unnestableTagNames.push((<IElement>currentNode)[PropertySymbol.tagName]);
286-
}
280+
readState =
281+
config?.contentModel === HTMLElementConfigContentModelEnum.rawText
282+
? MarkupReadStateEnum.plainTextContent
283+
: MarkupReadStateEnum.startOrEndTag;
287284
}
288285

289286
startTagIndex = markupRegexp.lastIndex;
@@ -292,15 +289,17 @@ export default class XMLParser {
292289

293290
break;
294291
case MarkupReadStateEnum.plainTextContent:
295-
if (match[2] && match[2].toUpperCase() === plainTextTagName) {
292+
const tagName = currentNode[PropertySymbol.tagName];
293+
294+
if (tagName && match[2] && match[2].toUpperCase() === tagName) {
296295
// End of plain text tag.
297296

298297
// Scripts are not allowed to be executed when they are parsed using innerHTML, outerHTML, replaceWith() etc.
299298
// However, they are allowed to be executed when document.write() is used.
300299
// See: https://developer.mozilla.org/en-US/docs/Web/API/HTMLScriptElement
301-
if (plainTextTagName === 'SCRIPT') {
300+
if (tagName === 'SCRIPT') {
302301
(<HTMLScriptElement>currentNode)[PropertySymbol.evaluateScript] = evaluateScripts;
303-
} else if (plainTextTagName === 'LINK') {
302+
} else if (tagName === 'LINK') {
304303
// An assumption that the same rule should be applied for the HTMLLinkElement is made here.
305304
(<HTMLLinkElement>currentNode)[PropertySymbol.evaluateCSS] = evaluateScripts;
306305
}
@@ -313,8 +312,8 @@ export default class XMLParser {
313312
);
314313

315314
stack.pop();
315+
stackTagNames.pop();
316316
currentNode = stack[stack.length - 1] || root;
317-
plainTextTagName = null;
318317
readState = MarkupReadStateEnum.startOrEndTag;
319318
}
320319

0 commit comments

Comments
 (0)