Skip to content

Commit acacf10

Browse files
committed
[AutoFill Debugging] Add basic support for markdown as an output format
https://bugs.webkit.org/show_bug.cgi?id=303032 rdar://165316390 Reviewed by Richard Robinson. Add basic support for markdown as an output format. This covers support for `a` and `img` tags in `[]()` notation, along with unordered lists prefixed with `-`. Test: fast/text-extraction/debug-text-extraction-markdown.html * LayoutTests/fast/text-extraction/debug-text-extraction-markdown-expected.txt: Added. * LayoutTests/fast/text-extraction/debug-text-extraction-markdown.html: Added. * Source/WebKit/Shared/TextExtractionToStringConversion.cpp: (WebKit::escapeStringForMarkdown): (WebKit::TextExtractionAggregator::~TextExtractionAggregator): (WebKit::TextExtractionAggregator::addResult): (WebKit::TextExtractionAggregator::useMarkdownOutput const): (WebKit::TextExtractionAggregator::pushURLString): Add a mechanism to keep track of the current URL string when traversing into a link item. This allows us to insert markdown links as [text](http://url) when appending text inside of the link. (WebKit::TextExtractionAggregator::currentURLString const): (WebKit::TextExtractionAggregator::popURLString): (WebKit::TextExtractionAggregator::addLineForVersionNumberIfNeeded): (WebKit::addPartsForItem): (WebKit::addTextRepresentationRecursive): * Source/WebKit/Shared/TextExtractionToStringConversion.h: * Source/WebKit/UIProcess/API/Cocoa/WKWebView.mm: (textExtractionOutputFormat): * Source/WebKit/UIProcess/API/Cocoa/_WKTextExtraction.h: * Tools/WebKitTestRunner/cocoa/UIScriptControllerCocoa.mm: (WTR::createTextExtractionConfiguration): Canonical link: https://commits.webkit.org/303514@main
1 parent d4f46e5 commit acacf10

File tree

7 files changed

+155
-18
lines changed

7 files changed

+155
-18
lines changed
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
Here’s to the crazy ones. The misfits. The rebels. The troublemakers. The round pegs in the square holes.
2+
[T\[e\]st link](https://www.apple.com/)
3+
You can quote them, disagree with them, glorify or vilify them. About the only thing you can’t do is ignore them. Because they change things.
4+
![Purposefully broken image](file:///fake/image.png)
5+
This is a list:
6+
- foo
7+
- bar
8+
- baz
9+
<!-- version=2 -->
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
<!-- webkit-test-runner [ useFlexibleViewport=true textExtractionEnabled=true ] -->
2+
<!DOCTYPE html>
3+
<html>
4+
<meta name="viewport" content="width=device-width, initial-scale=1">
5+
<meta charset="utf-8">
6+
<head>
7+
<style>
8+
body {
9+
white-space: pre-wrap;
10+
}
11+
</style>
12+
<script src="../../resources/ui-helper.js"></script>
13+
</head>
14+
<body>
15+
<p>Here’s to the crazy ones. The misfits. The rebels. The troublemakers. The round pegs in the square holes.</p>
16+
<a href="https://www.apple.com">T[e]st link</a>
17+
<div>You can quote them, disagree with them, glorify or vilify them. About the only thing you can’t do is ignore them. Because they change things.</div>
18+
<img src="file:///fake/image.png" alt="Purposefully broken image"></img>
19+
<p>This is a list:</p>
20+
<ul>
21+
<li>foo</li>
22+
<li>bar</li>
23+
<li>baz</li>
24+
</ul>
25+
<script>
26+
addEventListener("load", async () => {
27+
if (!window.testRunner)
28+
return;
29+
30+
testRunner.dumpAsText();
31+
testRunner.waitUntilDone();
32+
33+
document.body.textContent = await UIHelper.requestDebugText({ outputFormat: "markdown" });
34+
35+
testRunner.notifyDone();
36+
});
37+
</script>
38+
</body>
39+
</html>

Source/WebKit/Shared/TextExtractionToStringConversion.cpp

Lines changed: 99 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
#include <WebCore/HTMLNames.h>
3030
#include <WebCore/TextExtractionTypes.h>
3131
#include <wtf/EnumSet.h>
32+
#include <wtf/Scope.h>
3233
#include <wtf/TZoneMallocInlines.h>
3334
#include <wtf/text/MakeString.h>
3435
#include <wtf/text/StringBuilder.h>
@@ -82,6 +83,17 @@ static String escapeStringForHTML(const String& string)
8283
return result;
8384
}
8485

86+
static String escapeStringForMarkdown(const String& string)
87+
{
88+
auto result = string;
89+
result = makeStringByReplacingAll(result, '\\', "\\\\"_s);
90+
result = makeStringByReplacingAll(result, '[', "\\["_s);
91+
result = makeStringByReplacingAll(result, ']', "\\]"_s);
92+
result = makeStringByReplacingAll(result, '(', "\\("_s);
93+
result = makeStringByReplacingAll(result, ')', "\\)"_s);
94+
return result;
95+
}
96+
8597
static String normalizedURLString(const URL& url)
8698
{
8799
static constexpr auto maxURLStringLength = 150;
@@ -109,7 +121,9 @@ class TextExtractionAggregator : public RefCounted<TextExtractionAggregator> {
109121
{
110122
addLineForNativeMenuItemsIfNeeded();
111123
addLineForVersionNumberIfNeeded();
112-
124+
m_lines.removeAllMatching([](auto& line) {
125+
return line.isEmpty();
126+
});
113127
m_completion({ makeStringByJoining(WTFMove(m_lines), "\n"_s), m_filteredOutAnyText });
114128
}
115129

@@ -129,12 +143,11 @@ class TextExtractionAggregator : public RefCounted<TextExtractionAggregator> {
129143
return;
130144
}
131145

132-
auto separator = useHTMLOutput() ? " "_s : ","_s;
146+
auto separator = (useMarkdownOutput() || useHTMLOutput()) ? " "_s : ","_s;
133147
auto text = makeStringByJoining(WTFMove(components), separator);
134148

135149
if (!m_lines[lineIndex].isEmpty()) {
136-
auto joiner = useHTMLOutput() ? ' ' : ',';
137-
m_lines[lineIndex] = makeString(m_lines[lineIndex], joiner, WTFMove(text));
150+
m_lines[lineIndex] = makeString(m_lines[lineIndex], separator, WTFMove(text));
138151
return;
139152
}
140153

@@ -144,9 +157,13 @@ class TextExtractionAggregator : public RefCounted<TextExtractionAggregator> {
144157
}
145158

146159
StringBuilder indentation;
147-
indentation.reserveCapacity(indentLevel);
148-
for (unsigned i = 0; i < indentLevel; ++i)
149-
indentation.append('\t');
160+
161+
if (!useMarkdownOutput()) {
162+
indentation.reserveCapacity(indentLevel);
163+
for (unsigned i = 0; i < indentLevel; ++i)
164+
indentation.append('\t');
165+
}
166+
150167
m_lines[lineIndex] = makeString(indentation.toString(), WTFMove(text));
151168
}
152169

@@ -182,6 +199,11 @@ class TextExtractionAggregator : public RefCounted<TextExtractionAggregator> {
182199
return m_options.outputFormat == TextExtractionOutputFormat::HTMLMarkup;
183200
}
184201

202+
bool useMarkdownOutput() const
203+
{
204+
return m_options.outputFormat == TextExtractionOutputFormat::Markdown;
205+
}
206+
185207
RefPtr<TextExtractionFilterPromise> filter(const String& text, const std::optional<WebCore::NodeIdentifier>& identifier)
186208
{
187209
if (m_options.filterCallbacks.isEmpty())
@@ -212,6 +234,29 @@ class TextExtractionAggregator : public RefCounted<TextExtractionAggregator> {
212234
m_lines[lineIndex] = makeString(m_lines[lineIndex], text);
213235
}
214236

237+
void pushURLString(String&& urlString)
238+
{
239+
m_urlStringStack.append(WTFMove(urlString));
240+
}
241+
242+
std::optional<String> currentURLString() const
243+
{
244+
if (m_urlStringStack.isEmpty())
245+
return std::nullopt;
246+
247+
return { m_urlStringStack.last() };
248+
}
249+
250+
void popURLString()
251+
{
252+
if (m_urlStringStack.isEmpty()) {
253+
ASSERT_NOT_REACHED();
254+
return;
255+
}
256+
257+
m_urlStringStack.removeLast();
258+
}
259+
215260
private:
216261
void filterRecursive(const String& originalText, const std::optional<WebCore::NodeIdentifier>& identifier, size_t index, CompletionHandler<void(String&&)>&& completion)
217262
{
@@ -250,7 +295,7 @@ class TextExtractionAggregator : public RefCounted<TextExtractionAggregator> {
250295
if (onlyIncludeText())
251296
return;
252297

253-
auto versionText = useHTMLOutput() ? makeString("<!-- version="_s, version(), " -->"_s) : makeString("version="_s, version());
298+
auto versionText = (useHTMLOutput() || useMarkdownOutput()) ? makeString("<!-- version="_s, version(), " -->"_s) : makeString("version="_s, version());
254299
addResult({ advanceToNextLine(), 0 }, { WTFMove(versionText) });
255300
}
256301

@@ -261,6 +306,7 @@ class TextExtractionAggregator : public RefCounted<TextExtractionAggregator> {
261306

262307
const TextExtractionOptions m_options;
263308
Vector<String> m_lines;
309+
Vector<String, 1> m_urlStringStack;
264310
unsigned m_nextLineIndex { 0 };
265311
CompletionHandler<void(TextExtractionResult&&)> m_completion;
266312
TextExtractionVersionBehaviors m_versionBehaviors;
@@ -325,9 +371,10 @@ static Vector<String> partsForItem(const TextExtraction::Item& item, const TextE
325371

326372
static void addPartsForText(const TextExtraction::TextItemData& textItem, Vector<String>&& itemParts, std::optional<NodeIdentifier>&& enclosingNode, const TextExtractionLine& line, Ref<TextExtractionAggregator>&& aggregator, const String& closingTag = { })
327373
{
328-
auto completion = [itemParts = WTFMove(itemParts), selectedRange = textItem.selectedRange, aggregator, line, closingTag](String&& filteredText) mutable {
374+
auto completion = [itemParts = WTFMove(itemParts), selectedRange = textItem.selectedRange, aggregator, line, closingTag, urlString = aggregator->currentURLString()](String&& filteredText) mutable {
329375
Vector<String> textParts;
330376
auto currentLine = line;
377+
bool includeSelectionAsAttribute = !aggregator->useHTMLOutput() && !aggregator->useMarkdownOutput();
331378
if (!filteredText.isEmpty()) {
332379
// Apply replacements only after filtering, so any filtering steps that rely on comparing DOM text against
333380
// visual data (e.g. recognized text) won't result in false positives.
@@ -343,7 +390,7 @@ static void addPartsForText(const TextExtraction::TextItemData& textItem, Vector
343390
});
344391

345392
if (startIndex == notFound) {
346-
if (!aggregator->useHTMLOutput()) {
393+
if (includeSelectionAsAttribute) {
347394
textParts.append("''"_s);
348395
textParts.append("selected=[0,0]"_s);
349396
}
@@ -363,10 +410,15 @@ static void addPartsForText(const TextExtraction::TextItemData& textItem, Vector
363410
return;
364411
}
365412
textParts.append(escapeStringForHTML(trimmedContent));
413+
} else if (aggregator->useMarkdownOutput()) {
414+
if (urlString)
415+
textParts.append(makeString('[', escapeStringForMarkdown(trimmedContent), "]("_s, WTFMove(*urlString), ')'));
416+
else
417+
textParts.append(trimmedContent);
366418
} else
367419
textParts.append(makeString('\'', escapeString(trimmedContent), '\''));
368420

369-
if (!aggregator->useHTMLOutput() && selectedRange && selectedRange->length > 0) {
421+
if (includeSelectionAsAttribute && selectedRange && selectedRange->length > 0) {
370422
if (!trimmedContent.isEmpty()) {
371423
int newLocation = std::max(0, static_cast<int>(selectedRange->location) - static_cast<int>(startIndex));
372424
int maxLength = static_cast<int>(trimmedContent.length()) - newLocation;
@@ -379,7 +431,7 @@ static void addPartsForText(const TextExtraction::TextItemData& textItem, Vector
379431
textParts.append("selected=[0,0]"_s);
380432
}
381433
}
382-
} else if (!aggregator->useHTMLOutput() && selectedRange)
434+
} else if (includeSelectionAsAttribute && selectedRange)
383435
textParts.append("selected=[0,0]"_s);
384436

385437
textParts.appendVector(WTFMove(itemParts));
@@ -461,6 +513,13 @@ static void addPartsForItem(const TextExtraction::Item& item, std::optional<Node
461513
else
462514
parts.append(makeString('<', tagName, ' ', makeStringByJoining(attributes, " "_s), '>'));
463515
}
516+
} else if (aggregator.useMarkdownOutput()) {
517+
if (containerType == TextExtraction::ContainerType::BlockQuote)
518+
parts.append(">"_s);
519+
else if (containerType == TextExtraction::ContainerType::ListItem) {
520+
// FIXME: Convert ordered lists into 1., 2., 3. etc.
521+
parts.append("-"_s);
522+
}
464523
} else {
465524
if (!containerString.isEmpty())
466525
parts.append(WTFMove(containerString));
@@ -484,7 +543,7 @@ static void addPartsForItem(const TextExtraction::Item& item, std::optional<Node
484543
parts.append("contenteditable='plaintext-only'"_s);
485544
else
486545
parts.append("contenteditable"_s);
487-
} else {
546+
} else if (!aggregator.useMarkdownOutput()) {
488547
parts.append("contentEditable"_s);
489548
parts.appendVector(partsForItem(item, aggregator));
490549

@@ -519,7 +578,7 @@ static void addPartsForItem(const TextExtraction::Item& item, std::optional<Node
519578
parts.append(makeString('<', tagName, '>'));
520579
else
521580
parts.append(makeString('<', tagName, ' ', makeStringByJoining(attributes, " "_s), '>'));
522-
} else {
581+
} else if (!aggregator.useMarkdownOutput()) {
523582
parts.append(WTFMove(tagName));
524583
parts.appendVector(partsForItem(item, aggregator));
525584

@@ -564,7 +623,7 @@ static void addPartsForItem(const TextExtraction::Item& item, std::optional<Node
564623
parts.append(makeString('<', item.nodeName.convertToASCIILowercase(), '>'));
565624
else
566625
parts.append(makeString('<', item.nodeName.convertToASCIILowercase(), ' ', makeStringByJoining(attributes, " "_s), '>'));
567-
} else {
626+
} else if (!aggregator.useMarkdownOutput()) {
568627
parts.append("link"_s);
569628
parts.appendVector(partsForItem(item, aggregator));
570629

@@ -581,7 +640,7 @@ static void addPartsForItem(const TextExtraction::Item& item, std::optional<Node
581640
parts.append(makeString('<', item.nodeName.convertToASCIILowercase(), '>'));
582641
else
583642
parts.append(makeString('<', item.nodeName.convertToASCIILowercase(), ' ', makeStringByJoining(attributes, " "_s), '>'));
584-
} else {
643+
} else if (!aggregator.useMarkdownOutput()) {
585644
parts.append("scrollable"_s);
586645
parts.appendVector(partsForItem(item, aggregator));
587646
parts.append(makeString("contentSize=["_s, scrollableData.contentSize.width(), 'x', scrollableData.contentSize.height(), ']'));
@@ -603,7 +662,7 @@ static void addPartsForItem(const TextExtraction::Item& item, std::optional<Node
603662
parts.append(makeString('<', item.nodeName.convertToASCIILowercase(), '>'));
604663
else
605664
parts.append(makeString('<', item.nodeName.convertToASCIILowercase(), ' ', makeStringByJoining(attributes, " "_s), '>'));
606-
} else {
665+
} else if (!aggregator.useMarkdownOutput()) {
607666
parts.append("select"_s);
608667
parts.appendVector(partsForItem(item, aggregator));
609668

@@ -634,6 +693,13 @@ static void addPartsForItem(const TextExtraction::Item& item, std::optional<Node
634693
parts.append(makeString('<', item.nodeName.convertToASCIILowercase(), '>'));
635694
else
636695
parts.append(makeString('<', item.nodeName.convertToASCIILowercase(), ' ', makeStringByJoining(attributes, " "_s), '>'));
696+
} else if (aggregator.useMarkdownOutput()) {
697+
String imageSource;
698+
if (auto attributeFromClient = item.clientAttributes.get("src"_s); !attributeFromClient.isEmpty())
699+
imageSource = WTFMove(attributeFromClient);
700+
else
701+
imageSource = normalizedURLString(imageData.completedSource);
702+
parts.append(makeString("!["_s, escapeStringForMarkdown(imageData.altText), "]("_s, WTFMove(imageSource), ')'));
637703
} else {
638704
parts.append("image"_s);
639705
parts.appendVector(partsForItem(item, aggregator));
@@ -685,6 +751,22 @@ static void addTextRepresentationRecursive(const TextExtraction::Item& item, std
685751
return;
686752
}
687753

754+
bool isLink = false;
755+
if (auto link = item.dataAs<TextExtraction::LinkItemData>()) {
756+
String linkURLString;
757+
if (auto attributeFromClient = item.clientAttributes.get("href"_s); !attributeFromClient.isEmpty())
758+
linkURLString = WTFMove(attributeFromClient);
759+
else
760+
linkURLString = normalizedURLString(link->completedURL);
761+
aggregator.pushURLString(WTFMove(linkURLString));
762+
isLink = true;
763+
}
764+
765+
auto popURLScope = makeScopeExit([isLink, &aggregator] {
766+
if (isLink)
767+
aggregator.popURLString();
768+
});
769+
688770
TextExtractionLine line { aggregator.advanceToNextLine(), depth };
689771
addPartsForItem(item, std::optional { identifier }, line, aggregator);
690772

Source/WebKit/Shared/TextExtractionToStringConversion.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,8 @@ enum class TextExtractionOptionFlag : uint8_t {
5656

5757
enum class TextExtractionOutputFormat : uint8_t {
5858
TextTree,
59-
HTMLMarkup
59+
HTMLMarkup,
60+
Markdown
6061
};
6162

6263
using TextExtractionOptionFlags = OptionSet<TextExtractionOptionFlag>;

Source/WebKit/UIProcess/API/Cocoa/WKWebView.mm

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6620,6 +6620,8 @@ - (void)_debugTextWithConfiguration:(_WKTextExtractionConfiguration *)configurat
66206620
return WebKit::TextExtractionOutputFormat::TextTree;
66216621
case _WKTextExtractionOutputFormatHTML:
66226622
return WebKit::TextExtractionOutputFormat::HTMLMarkup;
6623+
case _WKTextExtractionOutputFormatMarkdown:
6624+
return WebKit::TextExtractionOutputFormat::Markdown;
66236625
default:
66246626
ASSERT_NOT_REACHED();
66256627
return WebKit::TextExtractionOutputFormat::TextTree;

Source/WebKit/UIProcess/API/Cocoa/_WKTextExtraction.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ typedef NS_ENUM(NSInteger, _WKTextExtractionNodeIdentifierInclusion) {
4848
typedef NS_ENUM(NSInteger, _WKTextExtractionOutputFormat) {
4949
_WKTextExtractionOutputFormatTextTree = 0,
5050
_WKTextExtractionOutputFormatHTML,
51+
_WKTextExtractionOutputFormatMarkdown,
5152
} WK_API_AVAILABLE(macos(WK_MAC_TBA), ios(WK_IOS_TBA), visionos(WK_XROS_TBA));
5253

5354
WK_CLASS_AVAILABLE(macos(WK_MAC_TBA), ios(WK_IOS_TBA), visionos(WK_XROS_TBA))

Tools/WebKitTestRunner/cocoa/UIScriptControllerCocoa.mm

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -370,6 +370,9 @@ - (void)paste:(id)sender;
370370
if (equalLettersIgnoringASCIICase(outputFormat, "html"_s))
371371
return _WKTextExtractionOutputFormatHTML;
372372

373+
if (equalLettersIgnoringASCIICase(outputFormat, "markdown"_s))
374+
return _WKTextExtractionOutputFormatMarkdown;
375+
373376
if (equalLettersIgnoringASCIICase(outputFormat, "texttree"_s))
374377
return _WKTextExtractionOutputFormatTextTree;
375378

0 commit comments

Comments
 (0)