2929#include < WebCore/HTMLNames.h>
3030#include < WebCore/TextExtractionTypes.h>
3131#include < wtf/EnumSet.h>
32+ #include < wtf/Scope.h>
3233#include < wtf/TZoneMallocInlines.h>
3334#include < wtf/text/MakeString.h>
3435#include < wtf/text/StringBuilder.h>
@@ -82,6 +83,17 @@ static String escapeStringForHTML(const String& string)
8283 return result;
8384}
8485
86+ static String escapeStringForMarkdown (const String& string)
87+ {
88+ auto result = string;
89+ result = makeStringByReplacingAll (result, ' \\ ' , " \\\\ " _s);
90+ result = makeStringByReplacingAll (result, ' [' , " \\ [" _s);
91+ result = makeStringByReplacingAll (result, ' ]' , " \\ ]" _s);
92+ result = makeStringByReplacingAll (result, ' (' , " \\ (" _s);
93+ result = makeStringByReplacingAll (result, ' )' , " \\ )" _s);
94+ return result;
95+ }
96+
8597static String normalizedURLString (const URL& url)
8698{
8799 static constexpr auto maxURLStringLength = 150 ;
@@ -109,7 +121,9 @@ class TextExtractionAggregator : public RefCounted<TextExtractionAggregator> {
109121 {
110122 addLineForNativeMenuItemsIfNeeded ();
111123 addLineForVersionNumberIfNeeded ();
112-
124+ m_lines.removeAllMatching ([](auto & line) {
125+ return line.isEmpty ();
126+ });
113127 m_completion ({ makeStringByJoining (WTFMove (m_lines), " \n " _s), m_filteredOutAnyText });
114128 }
115129
@@ -129,12 +143,11 @@ class TextExtractionAggregator : public RefCounted<TextExtractionAggregator> {
129143 return ;
130144 }
131145
132- auto separator = useHTMLOutput () ? " " _s : " ," _s;
146+ auto separator = ( useMarkdownOutput () || useHTMLOutput () ) ? " " _s : " ," _s;
133147 auto text = makeStringByJoining (WTFMove (components), separator);
134148
135149 if (!m_lines[lineIndex].isEmpty ()) {
136- auto joiner = useHTMLOutput () ? ' ' : ' ,' ;
137- m_lines[lineIndex] = makeString (m_lines[lineIndex], joiner, WTFMove (text));
150+ m_lines[lineIndex] = makeString (m_lines[lineIndex], separator, WTFMove (text));
138151 return ;
139152 }
140153
@@ -144,9 +157,13 @@ class TextExtractionAggregator : public RefCounted<TextExtractionAggregator> {
144157 }
145158
146159 StringBuilder indentation;
147- indentation.reserveCapacity (indentLevel);
148- for (unsigned i = 0 ; i < indentLevel; ++i)
149- indentation.append (' \t ' );
160+
161+ if (!useMarkdownOutput ()) {
162+ indentation.reserveCapacity (indentLevel);
163+ for (unsigned i = 0 ; i < indentLevel; ++i)
164+ indentation.append (' \t ' );
165+ }
166+
150167 m_lines[lineIndex] = makeString (indentation.toString (), WTFMove (text));
151168 }
152169
@@ -182,6 +199,11 @@ class TextExtractionAggregator : public RefCounted<TextExtractionAggregator> {
182199 return m_options.outputFormat == TextExtractionOutputFormat::HTMLMarkup;
183200 }
184201
202+ bool useMarkdownOutput () const
203+ {
204+ return m_options.outputFormat == TextExtractionOutputFormat::Markdown;
205+ }
206+
185207 RefPtr<TextExtractionFilterPromise> filter (const String& text, const std::optional<WebCore::NodeIdentifier>& identifier)
186208 {
187209 if (m_options.filterCallbacks .isEmpty ())
@@ -212,6 +234,29 @@ class TextExtractionAggregator : public RefCounted<TextExtractionAggregator> {
212234 m_lines[lineIndex] = makeString (m_lines[lineIndex], text);
213235 }
214236
237+ void pushURLString (String&& urlString)
238+ {
239+ m_urlStringStack.append (WTFMove (urlString));
240+ }
241+
242+ std::optional<String> currentURLString () const
243+ {
244+ if (m_urlStringStack.isEmpty ())
245+ return std::nullopt ;
246+
247+ return { m_urlStringStack.last () };
248+ }
249+
250+ void popURLString ()
251+ {
252+ if (m_urlStringStack.isEmpty ()) {
253+ ASSERT_NOT_REACHED ();
254+ return ;
255+ }
256+
257+ m_urlStringStack.removeLast ();
258+ }
259+
215260private:
216261 void filterRecursive (const String& originalText, const std::optional<WebCore::NodeIdentifier>& identifier, size_t index, CompletionHandler<void (String&&)>&& completion)
217262 {
@@ -250,7 +295,7 @@ class TextExtractionAggregator : public RefCounted<TextExtractionAggregator> {
250295 if (onlyIncludeText ())
251296 return ;
252297
253- auto versionText = useHTMLOutput () ? makeString (" <!-- version=" _s, version (), " -->" _s) : makeString (" version=" _s, version ());
298+ auto versionText = ( useHTMLOutput () || useMarkdownOutput () ) ? makeString (" <!-- version=" _s, version (), " -->" _s) : makeString (" version=" _s, version ());
254299 addResult ({ advanceToNextLine (), 0 }, { WTFMove (versionText) });
255300 }
256301
@@ -261,6 +306,7 @@ class TextExtractionAggregator : public RefCounted<TextExtractionAggregator> {
261306
262307 const TextExtractionOptions m_options;
263308 Vector<String> m_lines;
309+ Vector<String, 1 > m_urlStringStack;
264310 unsigned m_nextLineIndex { 0 };
265311 CompletionHandler<void (TextExtractionResult&&)> m_completion;
266312 TextExtractionVersionBehaviors m_versionBehaviors;
@@ -325,9 +371,10 @@ static Vector<String> partsForItem(const TextExtraction::Item& item, const TextE
325371
326372static void addPartsForText (const TextExtraction::TextItemData& textItem, Vector<String>&& itemParts, std::optional<NodeIdentifier>&& enclosingNode, const TextExtractionLine& line, Ref<TextExtractionAggregator>&& aggregator, const String& closingTag = { })
327373{
328- auto completion = [itemParts = WTFMove (itemParts), selectedRange = textItem.selectedRange , aggregator, line, closingTag](String&& filteredText) mutable {
374+ auto completion = [itemParts = WTFMove (itemParts), selectedRange = textItem.selectedRange , aggregator, line, closingTag, urlString = aggregator-> currentURLString () ](String&& filteredText) mutable {
329375 Vector<String> textParts;
330376 auto currentLine = line;
377+ bool includeSelectionAsAttribute = !aggregator->useHTMLOutput () && !aggregator->useMarkdownOutput ();
331378 if (!filteredText.isEmpty ()) {
332379 // Apply replacements only after filtering, so any filtering steps that rely on comparing DOM text against
333380 // visual data (e.g. recognized text) won't result in false positives.
@@ -343,7 +390,7 @@ static void addPartsForText(const TextExtraction::TextItemData& textItem, Vector
343390 });
344391
345392 if (startIndex == notFound) {
346- if (!aggregator-> useHTMLOutput () ) {
393+ if (includeSelectionAsAttribute ) {
347394 textParts.append (" ''" _s);
348395 textParts.append (" selected=[0,0]" _s);
349396 }
@@ -363,10 +410,15 @@ static void addPartsForText(const TextExtraction::TextItemData& textItem, Vector
363410 return ;
364411 }
365412 textParts.append (escapeStringForHTML (trimmedContent));
413+ } else if (aggregator->useMarkdownOutput ()) {
414+ if (urlString)
415+ textParts.append (makeString (' [' , escapeStringForMarkdown (trimmedContent), " ](" _s, WTFMove (*urlString), ' )' ));
416+ else
417+ textParts.append (trimmedContent);
366418 } else
367419 textParts.append (makeString (' \' ' , escapeString (trimmedContent), ' \' ' ));
368420
369- if (!aggregator-> useHTMLOutput () && selectedRange && selectedRange->length > 0 ) {
421+ if (includeSelectionAsAttribute && selectedRange && selectedRange->length > 0 ) {
370422 if (!trimmedContent.isEmpty ()) {
371423 int newLocation = std::max (0 , static_cast <int >(selectedRange->location ) - static_cast <int >(startIndex));
372424 int maxLength = static_cast <int >(trimmedContent.length ()) - newLocation;
@@ -379,7 +431,7 @@ static void addPartsForText(const TextExtraction::TextItemData& textItem, Vector
379431 textParts.append (" selected=[0,0]" _s);
380432 }
381433 }
382- } else if (!aggregator-> useHTMLOutput () && selectedRange)
434+ } else if (includeSelectionAsAttribute && selectedRange)
383435 textParts.append (" selected=[0,0]" _s);
384436
385437 textParts.appendVector (WTFMove (itemParts));
@@ -461,6 +513,13 @@ static void addPartsForItem(const TextExtraction::Item& item, std::optional<Node
461513 else
462514 parts.append (makeString (' <' , tagName, ' ' , makeStringByJoining (attributes, " " _s), ' >' ));
463515 }
516+ } else if (aggregator.useMarkdownOutput ()) {
517+ if (containerType == TextExtraction::ContainerType::BlockQuote)
518+ parts.append (" >" _s);
519+ else if (containerType == TextExtraction::ContainerType::ListItem) {
520+ // FIXME: Convert ordered lists into 1., 2., 3. etc.
521+ parts.append (" -" _s);
522+ }
464523 } else {
465524 if (!containerString.isEmpty ())
466525 parts.append (WTFMove (containerString));
@@ -484,7 +543,7 @@ static void addPartsForItem(const TextExtraction::Item& item, std::optional<Node
484543 parts.append (" contenteditable='plaintext-only'" _s);
485544 else
486545 parts.append (" contenteditable" _s);
487- } else {
546+ } else if (!aggregator. useMarkdownOutput ()) {
488547 parts.append (" contentEditable" _s);
489548 parts.appendVector (partsForItem (item, aggregator));
490549
@@ -519,7 +578,7 @@ static void addPartsForItem(const TextExtraction::Item& item, std::optional<Node
519578 parts.append (makeString (' <' , tagName, ' >' ));
520579 else
521580 parts.append (makeString (' <' , tagName, ' ' , makeStringByJoining (attributes, " " _s), ' >' ));
522- } else {
581+ } else if (!aggregator. useMarkdownOutput ()) {
523582 parts.append (WTFMove (tagName));
524583 parts.appendVector (partsForItem (item, aggregator));
525584
@@ -564,7 +623,7 @@ static void addPartsForItem(const TextExtraction::Item& item, std::optional<Node
564623 parts.append (makeString (' <' , item.nodeName .convertToASCIILowercase (), ' >' ));
565624 else
566625 parts.append (makeString (' <' , item.nodeName .convertToASCIILowercase (), ' ' , makeStringByJoining (attributes, " " _s), ' >' ));
567- } else {
626+ } else if (!aggregator. useMarkdownOutput ()) {
568627 parts.append (" link" _s);
569628 parts.appendVector (partsForItem (item, aggregator));
570629
@@ -581,7 +640,7 @@ static void addPartsForItem(const TextExtraction::Item& item, std::optional<Node
581640 parts.append (makeString (' <' , item.nodeName .convertToASCIILowercase (), ' >' ));
582641 else
583642 parts.append (makeString (' <' , item.nodeName .convertToASCIILowercase (), ' ' , makeStringByJoining (attributes, " " _s), ' >' ));
584- } else {
643+ } else if (!aggregator. useMarkdownOutput ()) {
585644 parts.append (" scrollable" _s);
586645 parts.appendVector (partsForItem (item, aggregator));
587646 parts.append (makeString (" contentSize=[" _s, scrollableData.contentSize .width (), ' x' , scrollableData.contentSize .height (), ' ]' ));
@@ -603,7 +662,7 @@ static void addPartsForItem(const TextExtraction::Item& item, std::optional<Node
603662 parts.append (makeString (' <' , item.nodeName .convertToASCIILowercase (), ' >' ));
604663 else
605664 parts.append (makeString (' <' , item.nodeName .convertToASCIILowercase (), ' ' , makeStringByJoining (attributes, " " _s), ' >' ));
606- } else {
665+ } else if (!aggregator. useMarkdownOutput ()) {
607666 parts.append (" select" _s);
608667 parts.appendVector (partsForItem (item, aggregator));
609668
@@ -634,6 +693,13 @@ static void addPartsForItem(const TextExtraction::Item& item, std::optional<Node
634693 parts.append (makeString (' <' , item.nodeName .convertToASCIILowercase (), ' >' ));
635694 else
636695 parts.append (makeString (' <' , item.nodeName .convertToASCIILowercase (), ' ' , makeStringByJoining (attributes, " " _s), ' >' ));
696+ } else if (aggregator.useMarkdownOutput ()) {
697+ String imageSource;
698+ if (auto attributeFromClient = item.clientAttributes .get (" src" _s); !attributeFromClient.isEmpty ())
699+ imageSource = WTFMove (attributeFromClient);
700+ else
701+ imageSource = normalizedURLString (imageData.completedSource );
702+ parts.append (makeString (" , ' )' ));
637703 } else {
638704 parts.append (" image" _s);
639705 parts.appendVector (partsForItem (item, aggregator));
@@ -685,6 +751,22 @@ static void addTextRepresentationRecursive(const TextExtraction::Item& item, std
685751 return ;
686752 }
687753
754+ bool isLink = false ;
755+ if (auto link = item.dataAs <TextExtraction::LinkItemData>()) {
756+ String linkURLString;
757+ if (auto attributeFromClient = item.clientAttributes .get (" href" _s); !attributeFromClient.isEmpty ())
758+ linkURLString = WTFMove (attributeFromClient);
759+ else
760+ linkURLString = normalizedURLString (link->completedURL );
761+ aggregator.pushURLString (WTFMove (linkURLString));
762+ isLink = true ;
763+ }
764+
765+ auto popURLScope = makeScopeExit ([isLink, &aggregator] {
766+ if (isLink)
767+ aggregator.popURLString ();
768+ });
769+
688770 TextExtractionLine line { aggregator.advanceToNextLine (), depth };
689771 addPartsForItem (item, std::optional { identifier }, line, aggregator);
690772
0 commit comments