From 33a774f4b5a2cbc76efc492cdf2b07e23e0eeb88 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Sat, 13 Jul 2024 08:06:37 +0900 Subject: [PATCH 1/2] fix typo --- lib/rexml/parsers/streamparser.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/rexml/parsers/streamparser.rb b/lib/rexml/parsers/streamparser.rb index 9e0eb0b3..fa3ac496 100644 --- a/lib/rexml/parsers/streamparser.rb +++ b/lib/rexml/parsers/streamparser.rb @@ -36,8 +36,8 @@ def parse @listener.tag_end( event[1] ) @tag_stack.pop when :text - normalized = @parser.unnormalize( event[1] ) - @listener.text( normalized ) + unnormalized = @parser.unnormalize( event[1] ) + @listener.text( unnormalized ) when :processing_instruction @listener.instruction( *event[1,2] ) when :start_doctype From 13d2c869433e0ddf166ac4370ae7579495a7a323 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Sat, 13 Jul 2024 07:49:36 +0900 Subject: [PATCH 2/2] Fixed a problem with sax2 where text was not unnormalized ## Why? :characters is not normalized in sax2. ## Change - text_unnormalized.rb ``` require 'rexml/document' require 'rexml/parsers/sax2parser' require 'rexml/parsers/pullparser' require 'rexml/parsers/streamparser' xml = < <P> <I> <B> Text </B> </I> EOS class Listener def method_missing(name, *args) p [name, *args] end end puts "REXML(DOM)" REXML::Document.new(xml).elements.each("/root/A") {|element| puts element.text} puts "" puts "REXML(Pull)" parser = REXML::Parsers::PullParser.new(xml) while parser.has_next? res = parser.pull p res end puts "" puts "REXML(Stream)" parser = REXML::Parsers::StreamParser.new(xml, Listener.new).parse puts "" puts "REXML(SAX)" parser = REXML::Parsers::SAX2Parser.new(xml) parser.listen(Listener.new) parser.parse ``` ## Before (master) ``` $ ruby text_unnormalized.rb REXML(DOM) Text REXML(Pull) start_element: ["root", {}] text: ["\n ", "\n "] start_element: ["A", {}] text: ["<P> <I> <B> Text </B> </I>", "

\r Text "] end_element: ["A"] text: ["\n", "\n"] end_element: ["root"] end_document: [] REXML(Stream) [:tag_start, "root", {}] [:text, "\n "] [:tag_start, "A", {}] [:text, "

\r Text "] [:tag_end, "A"] [:text, "\n"] [:tag_end, "root"] REXML(SAX) [:start_document] [:start_element, nil, "root", "root", {}] [:progress, 6] [:characters, "\n "] [:progress, 9] [:start_element, nil, "A", "A", {}] [:progress, 12] [:characters, "<P>\r <I> <B> Text </B> </I>"] #<= This [:progress, 74] [:end_element, nil, "A", "A"] [:progress, 78] [:characters, "\n"] [:progress, 79] [:end_element, nil, "root", "root"] [:progress, 86] [:end_document] ``` ## After(This PR) ``` $ ruby text_unnormalized.rb REXML(SAX) [:start_document] [:start_element, nil, "root", "root", {}] [:progress, 6] [:characters, "\n "] [:progress, 9] [:start_element, nil, "A", "A", {}] [:progress, 12] [:characters, "

\r Text "] [:progress, 74] [:end_element, nil, "A", "A"] [:progress, 78] [:characters, "\n"] [:progress, 79] [:end_element, nil, "root", "root"] [:progress, 86] [:end_document] ``` --- lib/rexml/parsers/sax2parser.rb | 21 ++------------------- test/test_pullparser.rb | 16 ++++++++++++++++ test/test_sax.rb | 11 +++++++++++ 3 files changed, 29 insertions(+), 19 deletions(-) diff --git a/lib/rexml/parsers/sax2parser.rb b/lib/rexml/parsers/sax2parser.rb index 6a24ce22..36f98c2a 100644 --- a/lib/rexml/parsers/sax2parser.rb +++ b/lib/rexml/parsers/sax2parser.rb @@ -157,25 +157,8 @@ def parse end end when :text - #normalized = @parser.normalize( event[1] ) - #handle( :characters, normalized ) - copy = event[1].clone - - esub = proc { |match| - if @entities.has_key?($1) - @entities[$1].gsub(Text::REFERENCE, &esub) - else - match - end - } - - copy.gsub!( Text::REFERENCE, &esub ) - copy.gsub!( Text::NUMERICENTITY ) {|m| - m=$1 - m = "0#{m}" if m[0] == ?x - [Integer(m)].pack('U*') - } - handle( :characters, copy ) + unnormalized = @parser.unnormalize( event[1], @entities ) + handle( :characters, unnormalized ) when :entitydecl handle_entitydecl( event ) when :processing_instruction, :comment, :attlistdecl, diff --git a/test/test_pullparser.rb b/test/test_pullparser.rb index 0aca46be..096e8b7f 100644 --- a/test/test_pullparser.rb +++ b/test/test_pullparser.rb @@ -82,6 +82,22 @@ def test_character_references assert_equal("B", events['b']) end + def test_text_entity_references + source = '<P> <I> <B> Text </B> </I>' + parser = REXML::Parsers::PullParser.new( source ) + + events = [] + while parser.has_next? + event = parser.pull + case event.event_type + when :text + events << event[1] + end + end + + assert_equal(["

Text "], events) + end + def test_text_content_with_line_breaks source = "AB\nC\r\n" parser = REXML::Parsers::PullParser.new( source ) diff --git a/test/test_sax.rb b/test/test_sax.rb index 8e905f2e..5a3f5e4e 100644 --- a/test/test_sax.rb +++ b/test/test_sax.rb @@ -31,6 +31,17 @@ def test_entity_replacement assert_equal '--1234--', results[1] end + def test_characters_predefined_entities + source = '<P> <I> <B> Text </B> </I>' + + sax = Parsers::SAX2Parser.new( source ) + results = [] + sax.listen(:characters) {|x| results << x } + sax.parse + + assert_equal(["

Text "], results) + end + def test_sax2 File.open(fixture_path("documentation.xml")) do |f| parser = Parsers::SAX2Parser.new( f )