Class: REXML::Parsers::BaseParser
Relationships & Source Files | |
Inherits: | Object |
Defined in: | lib/rexml/parsers/baseparser.rb |
Overview
Using the Pull Parser
This API is experimental, and subject to change.
parser = PullParser.new( "<a>text<b att='val'/>txet</a>" )
while parser.has_next?
res = parser.next
puts res[1]['att'] if res.start_tag? and res[0] == 'b'
end
See the PullEvent class for information on the content of the results. The data is identical to the arguments passed for the various events to the ::REXML::StreamListener API.
Notice that parse errors are also delivered as events; check error? and raise them yourself:
parser = PullParser.new( "<a>BAD DOCUMENT" )
while parser.has_next?
res = parser.next
raise res[1] if res.error?
end
Nat Price gave me some good ideas for the API.
Constant Summary
-
ATTDEF =
# File 'lib/rexml/parsers/baseparser.rb', line 80"\\s+#{NAME}\\s+#{ATTTYPE}\\s+#{DEFAULTDECL}"
-
ATTDEF_RE =
# File 'lib/rexml/parsers/baseparser.rb', line 81/#{ATTDEF}/
-
ATTLISTDECL_PATTERN =
# File 'lib/rexml/parsers/baseparser.rb', line 83/\A\s*<!ATTLIST\s+#{NAME}(?:#{ATTDEF})*\s*>/um
-
ATTLISTDECL_START =
# File 'lib/rexml/parsers/baseparser.rb', line 82/\A\s*<!ATTLIST/um
-
ATTRIBUTE_PATTERN =
# File 'lib/rexml/parsers/baseparser.rb', line 53/\s*(#{QNAME_STR})\s*=\s*(["'])(.*?)\4/um
-
ATTTYPE =
# File 'lib/rexml/parsers/baseparser.rb', line 77"(CDATA|ID|IDREF|IDREFS|ENTITY|ENTITIES|NMTOKEN|NMTOKENS|#{ENUMERATEDTYPE})"
-
ATTVALUE =
# File 'lib/rexml/parsers/baseparser.rb', line 78"(?:\"((?:[^<&\"]|#{REFERENCE})*)\")|(?:'((?:[^<&']|#{REFERENCE})*)')"
-
CDATA_END =
# File 'lib/rexml/parsers/baseparser.rb', line 57/\A\s*\]\s*>/um
-
CDATA_PATTERN =
# File 'lib/rexml/parsers/baseparser.rb', line 58/<!\[CDATA\[(.*?)\]\]>/um
-
CDATA_START =
# File 'lib/rexml/parsers/baseparser.rb', line 56/\A<!\[CDATA\[/u
-
CLOSE_MATCH =
# File 'lib/rexml/parsers/baseparser.rb', line 64/\A\s*<\/(#{QNAME_STR})\s*>/um
-
COMBININGCHAR =
TODO
''
-
COMMENT_PATTERN =
# File 'lib/rexml/parsers/baseparser.rb', line 55/<!--(.*?)-->/um
-
COMMENT_START =
# File 'lib/rexml/parsers/baseparser.rb', line 54/\A<!--/u
-
DEFAULTDECL =
# File 'lib/rexml/parsers/baseparser.rb', line 79"(#REQUIRED|#IMPLIED|(?:(#FIXED\\s+)?#{ATTVALUE}))"
-
DEFAULT_ENTITIES =
# File 'lib/rexml/parsers/baseparser.rb', line 108{ 'gt' => [/&gt;/, '&gt;', '>', />/], 'lt' => [/&lt;/, '&lt;', '<', /</], 'quot' => [/&quot;/, '&quot;', '"', /"/], "apos" => [/&apos;/, "&apos;", "'", /'/] }
-
DIGIT =
# File 'lib/rexml/parsers/baseparser.rb', line 31'[:digit:]'
-
DOCTYPE_END =
# File 'lib/rexml/parsers/baseparser.rb', line 52/\A\s*\]\s*>/um
-
DOCTYPE_START =
# File 'lib/rexml/parsers/baseparser.rb', line 51/\A\s*<!DOCTYPE\s/um
-
ELEMENTDECL_PATTERN =
# File 'lib/rexml/parsers/baseparser.rb', line 72/\A\s*(<!ELEMENT.*?)>/um
-
ELEMENTDECL_START =
# File 'lib/rexml/parsers/baseparser.rb', line 71/\A\s*<!ELEMENT/um
-
ENCODING =
# File 'lib/rexml/parsers/baseparser.rb', line 67/\bencoding\s*=\s*["'](.*?)['"]/um
-
ENTITYDECL =
# File 'lib/rexml/parsers/baseparser.rb', line 99/\s*(?:#{GEDECL})|(?:#{PEDECL})/um
-
ENTITYDEF =
# File 'lib/rexml/parsers/baseparser.rb', line 96"(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
-
ENTITYVALUE =
# File 'lib/rexml/parsers/baseparser.rb', line 94%Q{((?:"(?:[^%&"]|#{PEREFERENCE}|#{REFERENCE})*")|(?:'([^%&']|#{PEREFERENCE}|#{REFERENCE})*'))}
-
ENTITY_START =
# File 'lib/rexml/parsers/baseparser.rb', line 70/\A\s*<!ENTITY/
-
ENUMERATEDTYPE =
# File 'lib/rexml/parsers/baseparser.rb', line 76"(?:(?:#{NOTATIONTYPE})|(?:#{ENUMERATION}))"
-
ENUMERATION =
# File 'lib/rexml/parsers/baseparser.rb', line 74"\\(\\s*#{NMTOKEN}(?:\\s*\\|\\s*#{NMTOKEN})*\\s*\\)"
-
EREFERENCE =
# File 'lib/rexml/parsers/baseparser.rb', line 106/&(?!#{NAME};)/
-
EXTENDER =
TODO
''
-
EXTERNALID =
# File 'lib/rexml/parsers/baseparser.rb', line 91"(?:(?:(SYSTEM)\\s+#{SYSTEMLITERAL})|(?:(PUBLIC)\\s+#{PUBIDLITERAL}\\s+#{SYSTEMLITERAL}))"
-
EXTERNAL_ID_PUBLIC =
# File 'lib/rexml/parsers/baseparser.rb', line 102/\A\s*PUBLIC\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}\s*/um
-
EXTERNAL_ID_SYSTEM =
# File 'lib/rexml/parsers/baseparser.rb', line 103/\A\s*SYSTEM\s+#{SYSTEMLITERAL}\s*/um
-
GEDECL =
# File 'lib/rexml/parsers/baseparser.rb', line 98"<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
-
INSTRUCTION_PATTERN =
# File 'lib/rexml/parsers/baseparser.rb', line 62/<\?#{NAME}(\s+.*?)?\?>/um
-
INSTRUCTION_START =
# File 'lib/rexml/parsers/baseparser.rb', line 61/\A<\?/u
-
LETTER =
# File 'lib/rexml/parsers/baseparser.rb', line 30'[:alpha:]'
-
NAME =
# File 'lib/rexml/parsers/baseparser.rb', line 45"([\\w:]#{NAMECHAR}*)"
-
NAMECHAR =
# File 'lib/rexml/parsers/baseparser.rb', line 44'[\-\w\.:]'
-
NCNAME_STR =
# File 'lib/rexml/parsers/baseparser.rb', line 36"[#{LETTER}_][-[:alnum:]._#{COMBININGCHAR}#{EXTENDER}]*"
-
NDATADECL =
# File 'lib/rexml/parsers/baseparser.rb', line 92"\\s+NDATA\\s+#{NAME}"
-
NMTOKEN =
# File 'lib/rexml/parsers/baseparser.rb', line 46"(?:#{NAMECHAR})+"
-
NMTOKENS =
# File 'lib/rexml/parsers/baseparser.rb', line 47"#{NMTOKEN}(\\s+#{NMTOKEN})*"
-
NOTATIONDECL_START =
# File 'lib/rexml/parsers/baseparser.rb', line 101/\A\s*<!NOTATION/um
-
NOTATIONTYPE =
# File 'lib/rexml/parsers/baseparser.rb', line 75"NOTATION\\s+\\(\\s*#{NAME}(?:\\s*\\|\\s*#{NAME})*\\s*\\)"
-
PEDECL =
# File 'lib/rexml/parsers/baseparser.rb', line 97"<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
-
PEDEF =
# File 'lib/rexml/parsers/baseparser.rb', line 95"(?:#{ENTITYVALUE}|#{EXTERNALID})"
-
PEREFERENCE =
# File 'lib/rexml/parsers/baseparser.rb', line 93"%#{NAME};"
-
PUBIDCHAR =
Entity constants.
"\x20\x0D\x0Aa-zA-Z0-9\\-()+,./:=?;!*@$_%#"
-
PUBIDLITERAL =
# File 'lib/rexml/parsers/baseparser.rb', line 90%Q{("[#{PUBIDCHAR}']*"|'[#{PUBIDCHAR}]*')}
-
PUBLIC_ID =
# File 'lib/rexml/parsers/baseparser.rb', line 104/\A\s*PUBLIC\s+#{PUBIDLITERAL}\s*/um
-
QNAME =
# File 'lib/rexml/parsers/baseparser.rb', line 38/(#{QNAME_STR})/
-
QNAME_STR =
# File 'lib/rexml/parsers/baseparser.rb', line 37"(?:(#{NCNAME_STR}):)?(#{NCNAME_STR})"
-
REFERENCE =
# File 'lib/rexml/parsers/baseparser.rb', line 48"&(?:#{NAME};|&#\\d+;|&#x[0-9a-fA-F]+;)"
-
REFERENCE_RE =
# File 'lib/rexml/parsers/baseparser.rb', line 49/#{REFERENCE}/
-
STANDALONE =
# File 'lib/rexml/parsers/baseparser.rb', line 68/\bstandalone\s*=\s*["'](.*?)['"]/um
-
SYSTEMENTITY =
# File 'lib/rexml/parsers/baseparser.rb', line 73/\A\s*(%.*?;)\s*$/um
-
SYSTEMLITERAL =
# File 'lib/rexml/parsers/baseparser.rb', line 89%Q{((?:"[^"]*")|(?:'[^']*'))}
-
TAG_MATCH =
# File 'lib/rexml/parsers/baseparser.rb', line 63/\A<((?>#{QNAME_STR}))/um
-
TEXT_PATTERN =
# File 'lib/rexml/parsers/baseparser.rb', line 85/\A([^<]*)/um
-
UNAME_STR =
Just for backward compatibility. For example, kramdown uses this. It’s not used in ::REXML.
"(?:#{NCNAME_STR}:)?#{NCNAME_STR}"
-
VERSION =
# File 'lib/rexml/parsers/baseparser.rb', line 66/\bversion\s*=\s*["'](.*?)['"]/um
-
XMLDECL_PATTERN =
# File 'lib/rexml/parsers/baseparser.rb', line 60/<\?xml\s+(.*?)\?>/um
-
XMLDECL_START =
# File 'lib/rexml/parsers/baseparser.rb', line 59/\A<\?xml\s/u
Class Method Summary
- .new(source) ⇒ BaseParser constructor
Instance Attribute Summary
-
#empty? ⇒ Boolean
readonly
Returns true if there are no more events.
-
#has_next? ⇒ Boolean
readonly
Returns true if there are more events.
- #source readonly
- #stream=(source) writeonly
Instance Method Summary
- #add_listener(listener)
- #entity(reference, entities)
-
#normalize(input, entities = nil, entity_filter = nil)
Escapes all possible entities.
-
#peek(depth = 0)
Peek at the depth event in the stack.
- #position
-
#pull
Returns the next event.
-
#unnormalize(string, entities = nil, filter = nil)
Unescapes all possible entities.
-
#unshift(token)
Push an event back on the head of the stream.
- #need_source_encoding_update?(xml_declaration_encoding) ⇒ Boolean private
- #parse_attributes(prefixes, curr_ns) private
- #parse_id(base_error_message, accept_external_id:, accept_public_id:) private
- #parse_id_invalid_details(accept_external_id:, accept_public_id:) private
- #parse_name(base_error_message) private
- #process_instruction private
- #pull_event private
Constructor Details
.new(source) ⇒ BaseParser
Instance Attribute Details
#empty? ⇒ Boolean
(readonly)
Returns true if there are no more events
# File 'lib/rexml/parsers/baseparser.rb', line 146
# Returns true if there are no more events, i.e. both the source and the
# stack of pushed-back/queued events report empty.
def empty?
  @source.empty? && @stack.empty?
end
#has_next? ⇒ Boolean
(readonly)
Returns true if there are more events. Synonymous with !empty?
#source (readonly)
[ GitHub ]# File 'lib/rexml/parsers/baseparser.rb', line 124
# Read-only accessor for @source, the object #stream= builds via SourceFactory.create_from.
attr_reader :source
#stream=(source) (writeonly)
[ GitHub ]# File 'lib/rexml/parsers/baseparser.rb', line 126
# Points the parser at a new input and resets every piece of parsing state.
#
# source:: whatever SourceFactory.create_from accepts; its return value is
#          stored in @source.
def stream=(source)
  @source = SourceFactory.create_from(source)
  @closed = nil           # name of a self-closed tag awaiting its :end_element
  @document_status = nil  # nil until the prolog has been examined
  @tags = []              # names of currently open elements
  @stack = []             # events queued or pushed back with #unshift
  @entities = []
  @nsstack = []           # one set of declared namespace prefixes per open scope
end
Instance Method Details
#add_listener(listener)
[ GitHub ]# File 'lib/rexml/parsers/baseparser.rb', line 120
# Appends +listener+ to @listeners. Every object in that list is sent
# +receive(event)+ by #pull for each event pulled.
def add_listener(listener)
  @listeners << listener
end
#entity(reference, entities)
[ GitHub ]# File 'lib/rexml/parsers/baseparser.rb', line 438
# Resolves an entity name to its replacement text.
#
# reference:: the entity name (without the surrounding "&" and ";").
# entities::  optional lookup of user-defined entities, indexed by name; may be nil.
#
# A value found in +entities+ wins; otherwise the third element of the
# matching DEFAULT_ENTITIES entry is used. The value is passed through
# #unnormalize before being returned. Returns nil when the name is unknown.
def entity(reference, entities)
  value = entities[reference] if entities
  unless value
    default = DEFAULT_ENTITIES[reference]
    value = default[2] if default
  end
  unnormalize(value, entities) if value
end
#need_source_encoding_update?(xml_declaration_encoding) ⇒ Boolean
(private)
# File 'lib/rexml/parsers/baseparser.rb', line 495
# Decides whether the encoding named in the XML declaration should be
# applied to the source.
#
# Returns false when no encoding was declared or when the declared encoding
# is exactly "UTF-16" (case-insensitive); true for anything else.
def need_source_encoding_update?(xml_declaration_encoding)
  return false if xml_declaration_encoding.nil?
  return false if /\AUTF-16\z/i =~ xml_declaration_encoding

  true
end
#normalize(input, entities = nil, entity_filter = nil)
Escapes all possible entities
# File 'lib/rexml/parsers/baseparser.rb', line 449
# Escapes all possible entities.
#
# input::         the string to escape; it is cloned, never modified.
# entities::      optional name => value pairs; each occurrence of a value is
#                 replaced by "&name;".
# entity_filter:: optional collection of entity names that must NOT be
#                 substituted from +entities+.
#
# Returns the escaped copy.
def normalize(input, entities = nil, entity_filter = nil)
  copy = input.clone
  # Doing it like this rather than in a loop improves the speed
  # Escape every "&" that does not already start a "&name;" reference.
  copy.gsub!(EREFERENCE, '&amp;')
  if entities
    entities.each do |key, value|
      # The filter is checked against the entity *name* of this pair.
      next if entity_filter && entity_filter.include?(key)

      copy.gsub!(value, "&#{key};")
    end
  end
  copy.gsub!(EREFERENCE, '&amp;')
  DEFAULT_ENTITIES.each_value do |value|
    # value[3] matches the raw character, value[1] is its escaped form.
    copy.gsub!(value[3], value[1])
  end
  copy
end
#parse_attributes(prefixes, curr_ns) (private)
[ GitHub ]# File 'lib/rexml/parsers/baseparser.rb', line 589
# Parses the attribute list of a start tag whose name has already been
# consumed from @source.
#
# prefixes:: collection that receives every namespace prefix used by an
#            attribute name (except "xml").
# curr_ns::  collection that receives every prefix declared via xmlns:prefix.
#
# Returns [attributes, closed]: a Hash of attribute name => raw value, and
# whether the tag ended with "/>".
#
# Raises REXML::ParseException for an unterminated tag, a malformed
# attribute, an illegal xml/xmlns prefix binding, or a duplicate attribute.
def parse_attributes(prefixes, curr_ns)
  attributes = {}
  closed = false
  match_data = @source.match(/^(.*?)(\/)?>/um, true)
  if match_data.nil?
    message = "Start tag isn't ended"
    raise REXML::ParseException.new(message, @source)
  end

  raw_attributes = match_data[1]
  closed = !match_data[2].nil?
  return attributes, closed if raw_attributes.nil?
  return attributes, closed if raw_attributes.empty?

  scanner = StringScanner.new(raw_attributes)
  until scanner.eos?
    if scanner.scan(/\s+/)
      break if scanner.eos?
    end

    pos = scanner.pos
    loop do
      break if scanner.scan(ATTRIBUTE_PATTERN)

      # The fast path failed: scan piece by piece to report what is wrong.
      unless scanner.scan(QNAME)
        message = "Invalid attribute name: <#{scanner.rest}>"
        raise REXML::ParseException.new(message, @source)
      end
      name = scanner[0]
      unless scanner.scan(/\s*=\s*/um)
        message = "Missing attribute equal: <#{name}>"
        raise REXML::ParseException.new(message, @source)
      end
      quote = scanner.scan(/['"]/)
      unless quote
        message = "Missing attribute value start quote: <#{name}>"
        raise REXML::ParseException.new(message, @source)
      end
      unless scanner.scan(/.*#{Regexp.escape(quote)}/um)
        # The value contained a ">", so the tag was cut short. Pull the next
        # chunk up to ">" from the source, append it, and rescan this
        # attribute from its start.
        match_data = @source.match(/^(.*?)(\/)?>/um, true)
        if match_data
          scanner << "/" if closed
          scanner << ">"
          scanner << match_data[1]
          scanner.pos = pos
          closed = !match_data[2].nil?
          next
        end
        message = "Missing attribute value end quote: <#{name}>: <#{quote}>"
        raise REXML::ParseException.new(message, @source)
      end
    end
    # Capture groups of ATTRIBUTE_PATTERN: 1 = qualified name, 2 = prefix,
    # 3 = local part, 4 = quote character, 5 = value.
    name = scanner[1]
    prefix = scanner[2]
    local_part = scanner[3]
    # quote = scanner[4]
    value = scanner[5]
    if prefix == "xmlns"
      if local_part == "xml"
        if value != "http://www.w3.org/XML/1998/namespace"
          msg = "The 'xml' prefix must not be bound to any other namespace "+
            "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
          raise REXML::ParseException.new( msg, @source, self )
        end
      elsif local_part == "xmlns"
        msg = "The 'xmlns' prefix must not be declared "+
          "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
        raise REXML::ParseException.new( msg, @source, self)
      end
      curr_ns << local_part
    elsif prefix
      prefixes << prefix unless prefix == "xml"
    end

    if attributes.has_key?(name)
      msg = "Duplicate attribute #{name.inspect}"
      raise REXML::ParseException.new(msg, @source, self)
    end

    attributes[name] = value
  end
  return attributes, closed
end
#parse_id(base_error_message, accept_external_id:, accept_public_id:) (private)
[ GitHub ]# File 'lib/rexml/parsers/baseparser.rb', line 514
# Parses an external or public identifier at the head of @source and
# consumes it.
#
# base_error_message:: prefix for the exception message on failure.
# accept_external_id:: allow `PUBLIC "pubid" "system"` and `SYSTEM "system"`.
# accept_public_id::   allow `PUBLIC "pubid"` with no system literal.
#
# Returns ["PUBLIC", pubid, system], ["PUBLIC", pubid, nil] or
# ["SYSTEM", nil, system], with the surrounding quotes stripped.
#
# Raises REXML::ParseException, with details from
# #parse_id_invalid_details, when none of the accepted forms matches.
def parse_id(base_error_message,
             accept_external_id:,
             accept_public_id:)
  if accept_external_id and (md = @source.match(EXTERNAL_ID_PUBLIC, true))
    pubid = system = nil
    pubid_literal = md[1]
    pubid = pubid_literal[1..-2] if pubid_literal # Remove quote
    system_literal = md[2]
    system = system_literal[1..-2] if system_literal # Remove quote
    ["PUBLIC", pubid, system]
  elsif accept_public_id and (md = @source.match(PUBLIC_ID, true))
    pubid = nil
    pubid_literal = md[1]
    pubid = pubid_literal[1..-2] if pubid_literal # Remove quote
    ["PUBLIC", pubid, nil]
  elsif accept_external_id and (md = @source.match(EXTERNAL_ID_SYSTEM, true))
    system = nil
    system_literal = md[1]
    system = system_literal[1..-2] if system_literal # Remove quote
    ["SYSTEM", nil, system]
  else
    details = parse_id_invalid_details(accept_external_id: accept_external_id,
                                       accept_public_id: accept_public_id)
    message = "#{base_error_message}: #{details}"
    raise REXML::ParseException.new(message, @source)
  end
end
#parse_id_invalid_details(accept_external_id:, accept_public_id:) (private)
[ GitHub ]# File 'lib/rexml/parsers/baseparser.rb', line 542
# Works out why the identifier at the head of @source could not be parsed
# and returns a short human-readable reason. Nothing is consumed: every
# @source.match call here is made without the consume flag.
#
# accept_external_id:: whether PUBLIC+SYSTEM / SYSTEM forms were acceptable.
# accept_public_id::   whether a lone PUBLIC form was acceptable.
def parse_id_invalid_details(accept_external_id:,
                             accept_public_id:)
  # Named *_start so the locals do not shadow the `public` / `system` methods.
  public_start = /\A\s*PUBLIC/um
  system_start = /\A\s*SYSTEM/um
  if (accept_external_id or accept_public_id) and @source.match(/#{public_start}/um)
    if @source.match(/#{public_start}(?:\s+[^'"]|\s*[\[>])/um)
      return "public ID literal is missing"
    end
    unless @source.match(/#{public_start}\s+#{PUBIDLITERAL}/um)
      return "invalid public ID literal"
    end
    if accept_public_id
      if @source.match(/#{public_start}\s+#{PUBIDLITERAL}\s+[^'"]/um)
        return "system ID literal is missing"
      end
      unless @source.match(/#{public_start}\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}/um)
        return "invalid system literal"
      end
      "garbage after system literal"
    else
      "garbage after public ID literal"
    end
  elsif accept_external_id and @source.match(/#{system_start}/um)
    if @source.match(/#{system_start}(?:\s+[^'"]|\s*[\[>])/um)
      return "system literal is missing"
    end
    unless @source.match(/#{system_start}\s+#{SYSTEMLITERAL}/um)
      return "invalid system literal"
    end
    "garbage after system literal"
  else
    unless @source.match(/\A\s*(?:PUBLIC|SYSTEM)\s/um)
      return "invalid ID type"
    end
    "ID type is missing"
  end
end
#parse_name(base_error_message) (private)
[ GitHub ]# File 'lib/rexml/parsers/baseparser.rb', line 501
# Consumes optional leading whitespace followed by a NAME from @source and
# returns the name.
#
# base_error_message:: prefix for the exception message on failure.
#
# Raises REXML::ParseException with ": invalid name" when some other
# non-whitespace text is present, or ": name is missing" when nothing but
# whitespace (or nothing at all) is left.
def parse_name(base_error_message)
  md = @source.match(/\A\s*#{NAME}/um, true)
  unless md
    if @source.match(/\A\s*\S/um)
      message = "#{base_error_message}: invalid name"
    else
      message = "#{base_error_message}: name is missing"
    end
    raise REXML::ParseException.new(message, @source)
  end
  md[1]
end
#peek(depth = 0)
Peek at the depth event in the stack. The first element on the stack is at depth 0. If depth is -1, will parse to the end of the input stream and return the last event, which is always :end_document. Be aware that this causes the stream to be parsed up to the depth event, so you can effectively pre-parse the entire document (pull the entire thing into memory) using this method.
#position
[ GitHub ]# File 'lib/rexml/parsers/baseparser.rb', line 136
# Returns @source.position when the source implements #position, and 0
# otherwise.
def position
  if @source.respond_to? :position
    @source.position
  else
    # FIXME
    0
  end
end
#process_instruction (private)
[ GitHub ]# File 'lib/rexml/parsers/baseparser.rb', line 580
# Consumes a processing instruction from @source.
#
# Returns [:processing_instruction, target, content], where target and
# content are capture groups 1 and 2 of INSTRUCTION_PATTERN.
#
# Raises REXML::ParseException when INSTRUCTION_PATTERN does not match.
def process_instruction
  match_data = @source.match(INSTRUCTION_PATTERN, true)
  unless match_data
    message = "Invalid processing instruction node"
    raise REXML::ParseException.new(message, @source)
  end
  [:processing_instruction, match_data[1], match_data[2]]
end
#pull
Returns the next event. This is a PullEvent
object.
# File 'lib/rexml/parsers/baseparser.rb', line 182
# Returns the next event.
#
# The event comes from #pull_event; before it is returned, every object in
# @listeners is sent +receive(event)+, in registration order.
def pull
  pull_event.tap do |event|
    @listeners.each do |listener|
      listener.receive event
    end
  end
end
#pull_event (private)
[ GitHub ]# File 'lib/rexml/parsers/baseparser.rb', line 190
# Produces the next parse event as an Array whose first element is the event
# type symbol.
#
# Order of precedence:
# 1. a pending :end_element for a self-closed tag (@closed),
# 2. :end_document when #empty?,
# 3. an event queued on @stack,
# 4. otherwise whatever is parsed next from @source, driven by
#    @document_status (nil -> prolog, :in_doctype -> internal DTD subset,
#    :after_doctype / :in_element -> document content).
#
# Raises REXML::ParseException (or REXML::UndefinedNamespaceException) for
# malformed input; any other error raised while parsing content is wrapped
# in a REXML::ParseException.
def pull_event
  if @closed
    x, @closed = @closed, nil
    return [ :end_element, x ]
  end
  return [ :end_document ] if empty?
  return @stack.shift if @stack.size > 0
  if @document_status == nil
    # Peek (without consuming) at leading whitespace or the first <...> chunk.
    word = @source.match( /\A((?:\s+)|(?:<[^>]*>))/um )
    word = word[1] unless word.nil?
    case word
    when COMMENT_START
      return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ]
    when XMLDECL_START
      results = @source.match( XMLDECL_PATTERN, true )[1]
      version = VERSION.match( results )
      version = version[1] unless version.nil?
      encoding = ENCODING.match(results)
      encoding = encoding[1] unless encoding.nil?
      if need_source_encoding_update?(encoding)
        @source.encoding = encoding
      end
      if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
        encoding = "UTF-16"
      end
      standalone = STANDALONE.match(results)
      standalone = standalone[1] unless standalone.nil?
      return [ :xmldecl, version, encoding, standalone ]
    when INSTRUCTION_START
      return process_instruction
    when DOCTYPE_START
      base_error_message = "Malformed DOCTYPE"
      @source.match(DOCTYPE_START, true)
      @nsstack.unshift(curr_ns=Set.new)
      name = parse_name(base_error_message)
      if @source.match(/\A\s*\[/um, true)
        id = [nil, nil, nil]
        @document_status = :in_doctype
      elsif @source.match(/\A\s*>/um, true)
        id = [nil, nil, nil]
        @document_status = :after_doctype
      else
        id = parse_id(base_error_message,
                      accept_external_id: true,
                      accept_public_id: false)
        if id[0] == "SYSTEM"
          # For backward compatibility
          id[1], id[2] = id[2], nil
        end
        if @source.match(/\A\s*\[/um, true)
          @document_status = :in_doctype
        elsif @source.match(/\A\s*>/um, true)
          @document_status = :after_doctype
        else
          message = "#{base_error_message}: garbage after external ID"
          raise REXML::ParseException.new(message, @source)
        end
      end
      args = [:start_doctype, name, *id]
      if @document_status == :after_doctype
        # No internal subset: queue the matching :end_doctype right away.
        @source.match(/\A\s*/um, true)
        @stack << [ :end_doctype ]
      end
      return args
    when /\A\s+/
      # Leading whitespace only: stay in the prolog state and fall through.
    else
      @document_status = :after_doctype
      if @source.encoding == "UTF-8"
        @source.buffer.force_encoding(::Encoding::UTF_8)
      end
    end
  end
  if @document_status == :in_doctype
    md = @source.match(/\A\s*(.*?>)/um)
    case md[1]
    when SYSTEMENTITY
      match = @source.match( SYSTEMENTITY, true )[1]
      return [ :externalentity, match ]

    when ELEMENTDECL_START
      return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ]

    when ENTITY_START
      match = @source.match( ENTITYDECL, true ).to_a.compact
      match[0] = :entitydecl
      ref = false
      if match[1] == '%'
        ref = true
        match.delete_at 1
      end
      # Now we have to sort out what kind of entity reference this is
      if match[2] == 'SYSTEM'
        # External reference
        match[3] = match[3][1..-2] # PUBID
        match.delete_at(4) if match.size > 4 # Chop out NDATA decl
        # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
      elsif match[2] == 'PUBLIC'
        # External reference
        match[3] = match[3][1..-2] # PUBID
        match[4] = match[4][1..-2] # HREF
        match.delete_at(5) if match.size > 5 # Chop out NDATA decl
        # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
      else
        match[2] = match[2][1..-2]
        match.pop if match.size == 4
        # match is [ :entity, name, value ]
      end
      match << '%' if ref
      return match
    when ATTLISTDECL_START
      md = @source.match( ATTLISTDECL_PATTERN, true )
      raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
      element = md[1]
      contents = md[0]

      pairs = {}
      values = md[0].scan( ATTDEF_RE )
      values.each do |attdef|
        unless attdef[3] == "#IMPLIED"
          attdef.compact!
          val = attdef[3]
          val = attdef[4] if val == "#FIXED "
          pairs[attdef[0]] = val
          if attdef[0] =~ /^xmlns:(.*)/
            @nsstack[0] << $1
          end
        end
      end
      return [ :attlistdecl, element, pairs, contents ]
    when NOTATIONDECL_START
      base_error_message = "Malformed notation declaration"
      unless @source.match(/\A\s*<!NOTATION\s+/um, true)
        if @source.match(/\A\s*<!NOTATION\s*>/um)
          message = "#{base_error_message}: name is missing"
        else
          message = "#{base_error_message}: invalid declaration name"
        end
        raise REXML::ParseException.new(message, @source)
      end
      name = parse_name(base_error_message)
      id = parse_id(base_error_message,
                    accept_external_id: true,
                    accept_public_id: true)
      unless @source.match(/\A\s*>/um, true)
        message = "#{base_error_message}: garbage before end >"
        raise REXML::ParseException.new(message, @source)
      end
      return [:notationdecl, name, *id]
    when DOCTYPE_END
      @document_status = :after_doctype
      @source.match( DOCTYPE_END, true )
      return [ :end_doctype ]
    end
  end
  if @document_status == :after_doctype
    @source.match(/\A\s*/um, true)
  end
  begin
    @source.read if @source.buffer.size<2
    if @source.buffer[0] == ?<
      if @source.buffer[1] == ?/
        @nsstack.shift
        last_tag = @tags.pop
        md = @source.match( CLOSE_MATCH, true )
        if md and !last_tag
          message = "Unexpected top-level end tag (got '#{md[1]}')"
          raise REXML::ParseException.new(message, @source)
        end
        if md.nil? or last_tag != md[1]
          message = "Missing end tag for '#{last_tag}'"
          message << " (got '#{md[1]}')" if md
          raise REXML::ParseException.new(message, @source)
        end
        return [ :end_element, last_tag ]
      elsif @source.buffer[1] == ?!
        md = @source.match(/\A(\s*[^>]*>)/um)
        raise REXML::ParseException.new("Malformed node", @source) unless md
        if md[0][2] == ?-
          md = @source.match( COMMENT_PATTERN, true )

          case md[1]
          when /--/, /-\z/
            raise REXML::ParseException.new("Malformed comment", @source)
          end

          return [ :comment, md[1] ] if md
        else
          md = @source.match( CDATA_PATTERN, true )
          return [ :cdata, md[1] ] if md
        end
        raise REXML::ParseException.new( "Declarations can only occur "+
          "in the doctype declaration.", @source)
      elsif @source.buffer[1] == ??
        return process_instruction
      else
        # Get the next tag
        md = @source.match(TAG_MATCH, true)
        unless md
          raise REXML::ParseException.new("malformed XML: missing tag start", @source)
        end
        @document_status = :in_element
        prefixes = Set.new
        prefixes << md[2] if md[2]
        @nsstack.unshift(curr_ns=Set.new)
        attributes, closed = parse_attributes(prefixes, curr_ns)
        # Verify that all of the prefixes have been defined
        prefixes.each do |prefix|
          unless @nsstack.find{|k| k.member?(prefix)}
            raise UndefinedNamespaceException.new(prefix,@source,self)
          end
        end

        if closed
          # Self-closing tag: remember the name so the next call emits
          # the matching :end_element.
          @closed = md[1]
          @nsstack.shift
        else
          @tags.push( md[1] )
        end
        return [ :start_element, md[1], attributes ]
      end
    else
      md = @source.match( TEXT_PATTERN, true )
      if md[0].length == 0
        @source.match( /(\s+)/, true )
      end
      return [ :text, md[1] ]
    end
  rescue REXML::UndefinedNamespaceException
    raise
  rescue REXML::ParseException
    raise
  rescue => error
    raise REXML::ParseException.new( "Exception parsing",
      @source, self, (error ? error : $!) )
  end
  return [ :dummy ]
end
#unnormalize(string, entities = nil, filter = nil)
Unescapes all possible entities
# File 'lib/rexml/parsers/baseparser.rb', line 465
# Unescapes all possible entities.
#
# string::   the text to unescape; it is cloned, never modified.
# entities:: optional lookup of user-defined entities, passed on to #entity.
# filter::   optional collection of entity names that must be left escaped.
#
# Line endings ("\r\n" and lone "\r") are normalized to "\n" first. If the
# text contains no match for REFERENCE_RE it is returned at that point.
# Otherwise numeric character references are decoded, named references are
# resolved through #entity (falling back to DEFAULT_ENTITIES), and finally
# "&amp;" becomes "&".
def unnormalize(string, entities = nil, filter = nil)
  rv = string.clone
  rv.gsub!(/\r\n?/, "\n")
  matches = rv.scan(REFERENCE_RE)
  return rv if matches.empty?

  # Decimal (&#65;) and hexadecimal (&#x41;) character references. Leading
  # zeros are skipped so Integer() does not read the digits as octal.
  rv.gsub!(/&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/) {
    m = $1
    m = "0#{m}" if m[0] == ?x
    [Integer(m)].pack('U*')
  }
  # Keep the first capture group of each match (the entity name); matches
  # that captured no name are dropped.
  names = matches.map { |match| match[0] }.compact
  unless names.empty?
    names.each do |entity_reference|
      next if filter and filter.include?(entity_reference)

      entity_value = entity(entity_reference, entities)
      if entity_value
        re = /&#{entity_reference};/
        rv.gsub!(re, entity_value)
      else
        er = DEFAULT_ENTITIES[entity_reference]
        rv.gsub!(er[0], er[2]) if er
      end
    end
    # Done last so a decoded "&" cannot form a new reference.
    rv.gsub!(/&amp;/, '&')
  end
  rv
end
#unshift(token)
Push an event back on the head of the stream. This method has (theoretically) infinite depth.
# File 'lib/rexml/parsers/baseparser.rb', line 157
# Push an event back on the head of the stream by placing +token+ at the
# front of @stack.
def unshift(token)
  @stack.unshift(token)
end