123456789_123456789_123456789_123456789_123456789_

Class: Nokogiri::XML::Reader

Relationships & Source Files
Super Chains via Extension / Inclusion / Inheritance
Instance Chain:
self, Enumerable
Inherits: Object
Defined in: lib/nokogiri/xml/reader.rb,
ext/nokogiri/xml_node.c,
ext/nokogiri/xml_reader.c

Overview

The Reader parser allows you to effectively pull parse an XML document. Once instantiated, call #each to iterate over each node.

Reader parses an XML document similar to the way a cursor would move. The Reader is given an XML document, and yields nodes to an each block.

The Reader parser might be good for when you need the speed and low memory usage of a SAX parser, but do not want to write a SAX::Document handler.

Here is an example of usage:

reader = Nokogiri::XML::Reader.new <<~XML
  <x xmlns:tenderlove='http://tenderlovemaking.com/'>
    <tenderlove:foo awesome='true'>snuggles!</tenderlove:foo>
  </x>
XML

reader.each do |node|
  # node is an instance of Nokogiri::XML::Reader
  puts node.name
end

#each can only be called once! Once the cursor moves through the entire document, you must parse the document again. It may be better to capture all information you need during a single iteration.

⚠ libxml2 does not support error recovery in the Reader parser. The RECOVER ParseOption is ignored. If a syntax error is encountered during parsing, an exception will be raised.

Constant Summary

Class Method Summary

Instance Attribute Summary

Instance Method Summary

Constructor Details

.new(input) { |options| ... } ⇒ Reader
.new(input, url:, encoding:, options:) { |options| ... } ⇒ Reader

Create a new Reader to parse an XML document.

Required Parameters
  • input (String | IO): The XML document to parse.

Optional Parameters
  • url: (String) The base URL of the document.

  • encoding: (String) The name of the encoding of the document.

  • options: (Integer | ParseOptions) Options to control the parser behavior. Defaults to ParseOptions::STRICT.

Yields

If present, the block will be passed a ParseOptions object to modify before the document is parsed. See ParseOptions for more information.

Yields:

  • (options)
[ GitHub ]

  
# File 'lib/nokogiri/xml/reader.rb', line 99

# Build a Reader for +string_or_io+.
#
# +url+, +encoding+ and +options+ may be given positionally or as keywords;
# a keyword takes the positional value as its default. An Integer +options+
# is wrapped in a ParseOptions before being yielded to the optional block,
# so the block can adjust it. Input that responds to +read+ is handed to
# Reader.from_io, anything else to Reader.from_memory.
def self.new(
  string_or_io,
  url_ = nil, encoding_ = nil, options_ = ParseOptions::STRICT,
  url: url_, encoding: encoding_, options: options_
)
  options = Nokogiri::XML::ParseOptions.new(options) if options.is_a?(Integer)
  yield(options) if block_given?

  if string_or_io.respond_to?(:read)
    Reader.from_io(string_or_io, url, encoding, options.to_i)
  else
    Reader.from_memory(string_or_io, url, encoding, options.to_i)
  end
end

Class Method Details

.from_io(io, url = nil, encoding = nil, options = 0)

Create a new Reader to parse an IO stream.

[ GitHub ]

  
# File 'ext/nokogiri/xml_reader.c', line 660

/*
 * Reader.from_io(io, url = nil, encoding = nil, options = 0)
 *
 * Create a Reader that pulls its input from +io+ through the noko_io_read /
 * noko_io_close callbacks.
 *
 * Raises ArgumentError when +io+ is nil or false, and RuntimeError when
 * libxml2 cannot create the text reader.
 */
static VALUE
from_io(int argc, VALUE *argv, VALUE klass)
{
  /* TODO: deprecate this method, since Reader.new can handle both memory and IO. It can then
   * become private. */
  VALUE rb_io, rb_url, encoding, rb_options;
  xmlTextReaderPtr reader;
  const char *c_url      = NULL;
  const char *c_encoding = NULL;
  int c_options          = 0;
  VALUE rb_reader, args[3];

  /* one required argument (the io) followed by up to three optional ones */
  rb_scan_args(argc, argv, "13", &rb_io, &rb_url, &encoding, &rb_options);

  if (!RTEST(rb_io)) { rb_raise(rb_eArgError, "io cannot be nil"); }
  if (RTEST(rb_url)) { c_url = StringValueCStr(rb_url); }
  if (RTEST(encoding)) { c_encoding = StringValueCStr(encoding); }
  if (RTEST(rb_options)) { c_options = NUM2INT(rb_options); }

  reader = xmlReaderForIO(
             (xmlInputReadCallback)noko_io_read,
             (xmlInputCloseCallback)noko_io_close,
             (void *)rb_io,
             c_url,
             c_encoding,
             c_options
           );

  if (reader == NULL) {
    /* no reader was allocated, so there is nothing to free before raising */
    rb_raise(rb_eRuntimeError, "couldn't create a parser");
  }

  /* wrap the native reader, then run the Ruby-level #initialize(io, url, encoding) */
  rb_reader = TypedData_Wrap_Struct(klass, &xml_text_reader_type, reader);
  args[0] = rb_io;
  args[1] = rb_url;
  args[2] = encoding;
  rb_obj_call_init(rb_reader, 3, args);

  return rb_reader;
}

.from_memory(string, url = nil, encoding = nil, options = 0)

Create a new Reader to parse a String.

[ GitHub ]

  
# File 'ext/nokogiri/xml_reader.c', line 613

/*
 * Reader.from_memory(string, url = nil, encoding = nil, options = 0)
 *
 * Create a Reader that parses the bytes of +string+.
 *
 * Raises ArgumentError when +string+ is nil or false, and RuntimeError when
 * libxml2 cannot create the text reader.
 */
static VALUE
from_memory(int argc, VALUE *argv, VALUE klass)
{
  /* TODO: deprecate this method, since Reader.new can handle both memory and IO. It can then
   * become private. */
  VALUE rb_buffer, rb_url, encoding, rb_options;
  xmlTextReaderPtr reader;
  const char *c_url      = NULL;
  const char *c_encoding = NULL;
  int c_options          = 0;
  VALUE rb_reader, args[3];

  /* one required argument (the string) followed by up to three optional ones */
  rb_scan_args(argc, argv, "13", &rb_buffer, &rb_url, &encoding, &rb_options);

  if (!RTEST(rb_buffer)) { rb_raise(rb_eArgError, "string cannot be nil"); }
  if (RTEST(rb_url)) { c_url = StringValueCStr(rb_url); }
  if (RTEST(encoding)) { c_encoding = StringValueCStr(encoding); }
  if (RTEST(rb_options)) { c_options = NUM2INT(rb_options); }

  /* StringValuePtr runs first so rb_buffer is a String before its length is read */
  reader = xmlReaderForMemory(
             StringValuePtr(rb_buffer),
             (int)RSTRING_LEN(rb_buffer),
             c_url,
             c_encoding,
             c_options
           );

  if (reader == NULL) {
    /* no reader was allocated, so there is nothing to free before raising */
    rb_raise(rb_eRuntimeError, "couldn't create a parser");
  }

  /* wrap the native reader, then run the Ruby-level #initialize(string, url, encoding) */
  rb_reader = TypedData_Wrap_Struct(klass, &xml_text_reader_type, reader);
  args[0] = rb_buffer;
  args[1] = rb_url;
  args[2] = encoding;
  rb_obj_call_init(rb_reader, 3, args);

  return rb_reader;
}

Instance Attribute Details

#attributes? ⇒ Boolean (readonly)

Does this node have attributes?

[ GitHub ]

  
# File 'ext/nokogiri/xml_reader.c', line 121

/*
 * Does the node under the cursor carry attributes?
 *
 * Translates the has_attributes() result: 0 => false, 1 => true,
 * anything else => nil.
 */
static VALUE
attributes_eh(VALUE self)
{
  xmlTextReaderPtr c_reader;

  TypedData_Get_Struct(self, xmlTextReader, &xml_text_reader_type, c_reader);

  switch (has_attributes(c_reader)) {
    case 0:
      return Qfalse;
    case 1:
      return Qtrue;
    default:
      return Qnil;
  }
}

#default? ⇒ Boolean (readonly)

Was an attribute generated from the default value in the DTD or schema?

[ GitHub ]

  
# File 'ext/nokogiri/xml_reader.c', line 81

/*
 * Was the current attribute generated from a default value?
 *
 * Translates the xmlTextReaderIsDefault() result: 0 => false, 1 => true,
 * anything else => nil.
 */
static VALUE
default_eh(VALUE self)
{
  xmlTextReaderPtr c_reader;

  TypedData_Get_Struct(self, xmlTextReader, &xml_text_reader_type, c_reader);

  switch (xmlTextReaderIsDefault(c_reader)) {
    case 0:
      return Qfalse;
    case 1:
      return Qtrue;
    default:
      return Qnil;
  }
}

#empty_element? ⇒ Boolean (readonly) Also known as: #self_closing?

Returns true if the current node is empty, otherwise false.

[ GitHub ]

  
# File 'ext/nokogiri/xml_reader.c', line 708

/*
 * Returns true if the current node is an empty element, otherwise false.
 *
 * xmlTextReaderIsEmptyElement() answers 1 for an empty element, 0 for a
 * non-empty one and -1 on error. Only 1 is reported as true; a bare
 * truthiness test would turn the -1 error result into true.
 */
static VALUE
empty_element_p(VALUE self)
{
  xmlTextReaderPtr reader;

  TypedData_Get_Struct(self, xmlTextReader, &xml_text_reader_type, reader);

  if (xmlTextReaderIsEmptyElement(reader) == 1) {
    return Qtrue;
  }

  return Qfalse;
}

#errors (rw)

A list of errors encountered while parsing

[ GitHub ]

  
# File 'lib/nokogiri/xml/reader.rb', line 74

# The list of errors encountered while parsing; exposed for reading and writing.
attr_accessor :errors

#self_closing? (readonly)

Alias for #empty_element?.

[ GitHub ]

  
# File 'lib/nokogiri/xml/reader.rb', line 79

# Make #self_closing? a second name for #empty_element?.
alias_method :self_closing?, :empty_element?

#source (readonly)

The XML source

[ GitHub ]

  
# File 'lib/nokogiri/xml/reader.rb', line 77

# The XML source this reader was created with (read-only).
attr_reader :source

#value (readonly)

Get the text value of the node if present. Returns a utf-8 encoded string.

[ GitHub ]

  
# File 'ext/nokogiri/xml_reader.c', line 368

/*
 * Get the text value of the current node as a Ruby String, or nil when
 * xmlTextReaderConstValue() reports no value.
 */
static VALUE
value(VALUE self)
{
  xmlTextReaderPtr c_reader;
  const char *c_text;

  TypedData_Get_Struct(self, xmlTextReader, &xml_text_reader_type, c_reader);

  c_text = (const char *)xmlTextReaderConstValue(c_reader);

  return (c_text != NULL) ? NOKOGIRI_STR_NEW2(c_text) : Qnil;
}

#value? ⇒ Boolean (readonly)

Does this node have a text value?

[ GitHub ]

  
# File 'ext/nokogiri/xml_reader.c', line 101

/*
 * Does the current node have a text value?
 *
 * Translates the xmlTextReaderHasValue() result: 0 => false, 1 => true,
 * anything else => nil.
 */
static VALUE
value_eh(VALUE self)
{
  xmlTextReaderPtr c_reader;

  TypedData_Get_Struct(self, xmlTextReader, &xml_text_reader_type, c_reader);

  switch (xmlTextReaderHasValue(c_reader)) {
    case 0:
      return Qfalse;
    case 1:
      return Qtrue;
    default:
      return Qnil;
  }
}

Instance Method Details

#attribute(name)

Get the value of the attribute named name

[ GitHub ]

  
# File 'ext/nokogiri/xml_reader.c', line 266

/*
 * Get the value of the attribute called +name+ on the current node.
 *
 * Returns nil when +name+ is nil or when libxml2 finds no such attribute.
 * The string returned by xmlTextReaderGetAttribute() is copied into a Ruby
 * String and then released with xmlFree().
 */
static VALUE
reader_attribute(VALUE self, VALUE name)
{
  xmlTextReaderPtr c_reader;
  xmlChar *c_value;
  VALUE rb_result = Qnil;

  TypedData_Get_Struct(self, xmlTextReader, &xml_text_reader_type, c_reader);

  if (NIL_P(name)) { return rb_result; }

  name = StringValue(name);
  c_value = xmlTextReaderGetAttribute(c_reader, (xmlChar *)StringValueCStr(name));

  if (c_value != NULL) {
    rb_result = NOKOGIRI_STR_NEW2(c_value);
    xmlFree(c_value);
  }

  return rb_result;
}

#attribute_at(index)

Get the value of attribute at index

[ GitHub ]

  
# File 'ext/nokogiri/xml_reader.c', line 237

/*
 * Get the value of the attribute at position +index+ on the current node.
 *
 * Returns nil when +index+ is nil or when libxml2 returns no value. The
 * index is coerced with rb_Integer() first. The string returned by
 * xmlTextReaderGetAttributeNo() is copied into a Ruby String and then
 * released with xmlFree().
 */
static VALUE
attribute_at(VALUE self, VALUE index)
{
  xmlTextReaderPtr c_reader;
  xmlChar *c_value;
  VALUE rb_result = Qnil;

  TypedData_Get_Struct(self, xmlTextReader, &xml_text_reader_type, c_reader);

  if (NIL_P(index)) { return rb_result; }

  index = rb_Integer(index);
  c_value = xmlTextReaderGetAttributeNo(c_reader, (int)NUM2INT(index));

  if (c_value != NULL) {
    rb_result = NOKOGIRI_STR_NEW2(c_value);
    xmlFree(c_value);
  }

  return rb_result;
}

#attribute_count

Get the number of attributes for the current node

[ GitHub ]

  
# File 'ext/nokogiri/xml_reader.c', line 292

/*
 * Get the number of attributes on the current node, or nil when
 * xmlTextReaderAttributeCount() answers -1.
 */
static VALUE
attribute_count(VALUE self)
{
  xmlTextReaderPtr c_reader;
  int c_count;

  TypedData_Get_Struct(self, xmlTextReader, &xml_text_reader_type, c_reader);

  c_count = xmlTextReaderAttributeCount(c_reader);

  return (c_count == -1) ? Qnil : INT2NUM(c_count);
}

#attribute_hash ⇒ Hash<String ⇒ String>

Get the attributes of the current node as a Hash of names and values.

See related: #attributes and #namespaces

[ GitHub ]

  
# File 'ext/nokogiri/xml_reader.c', line 182

/*
 * Get the attributes of the current node as a Hash of names and values.
 *
 * Returns an empty Hash when the node has no attributes. The node is
 * expanded with xmlTextReaderExpand(); errors reported during the expansion
 * are pushed onto the reader's +errors+ array. If expansion fails, the first
 * collected error is raised as a SyntaxError, or nil is returned when no
 * error was collected.
 */
static VALUE
rb_xml_reader_attribute_hash(VALUE rb_reader)
{
  VALUE rb_attributes = rb_hash_new();
  xmlTextReaderPtr c_reader;
  xmlNodePtr c_node;
  xmlAttrPtr c_property;
  VALUE rb_errors;
  libxmlStructuredErrorHandlerState handler_state;

  TypedData_Get_Struct(rb_reader, xmlTextReader, &xml_text_reader_type, c_reader);

  if (!has_attributes(c_reader)) {
    return rb_attributes;
  }

  rb_errors = rb_funcall(rb_reader, rb_intern("errors"), 0);

  /* Save the current structured error handler and put it back afterwards,
   * rather than unconditionally resetting it to NULL. */
  noko__structured_error_func_save_and_set(&handler_state, (void *)rb_errors, noko__error_array_pusher);
  c_node = xmlTextReaderExpand(c_reader);
  noko__structured_error_func_restore(&handler_state);

  if (c_node == NULL) {
    if (RARRAY_LEN(rb_errors) > 0) {
      VALUE rb_error = rb_ary_entry(rb_errors, 0);
      VALUE exception_message = rb_funcall(rb_error, rb_intern("to_s"), 0);
      rb_exc_raise(rb_class_new_instance(1, &exception_message, cNokogiriXmlSyntaxError));
    }
    return Qnil;
  }

  /* walk the expanded node's property list, copying each name/value pair */
  for (c_property = c_node->properties; c_property != NULL; c_property = c_property->next) {
    VALUE rb_name = NOKOGIRI_STR_NEW2(c_property->name);
    VALUE rb_value = Qnil;
    xmlChar *c_value = xmlNodeGetContent((xmlNode *)c_property);

    if (c_value) {
      rb_value = NOKOGIRI_STR_NEW2(c_value);
      xmlFree(c_value); /* the content string is released once copied */
    }

    rb_hash_aset(rb_attributes, rb_name, rb_value);
  }

  return rb_attributes;
}

#attributes (readonly)

Get the attributes and namespaces of the current node as a Hash.

This is the union of #attribute_hash and #namespaces

Returns

(Hash<String, String>) Attribute names and values, and namespace prefixes and hrefs.

[ GitHub ]

  
# File 'lib/nokogiri/xml/reader.rb', line 126

# Combine the current node's attributes and namespaces into one Hash.
# On a key collision the entry from #namespaces wins.
def attributes
  attrs = attribute_hash
  attrs.merge(namespaces)
end

#base_uri

Get the xml:base of the node

[ GitHub ]

  
# File 'ext/nokogiri/xml_reader.c', line 463

/*
 * Get the base URI of the current node as a Ruby String, or nil when
 * xmlTextReaderBaseUri() returns NULL. The libxml2 string is released with
 * xmlFree() after it has been copied.
 */
static VALUE
rb_xml_reader_base_uri(VALUE rb_reader)
{
  xmlTextReaderPtr c_reader;
  xmlChar *c_uri;
  VALUE rb_uri = Qnil;

  TypedData_Get_Struct(rb_reader, xmlTextReader, &xml_text_reader_type, c_reader);

  c_uri = xmlTextReaderBaseUri(c_reader);
  if (c_uri != NULL) {
    rb_uri = NOKOGIRI_STR_NEW2(c_uri);
    xmlFree(c_uri);
  }

  return rb_uri;
}

#depth

Get the depth of the node

[ GitHub ]

  
# File 'ext/nokogiri/xml_reader.c', line 311

/*
 * Get the depth of the current node, or nil when xmlTextReaderDepth()
 * answers -1.
 */
static VALUE
depth(VALUE self)
{
  xmlTextReaderPtr c_reader;
  int c_depth;

  TypedData_Get_Struct(self, xmlTextReader, &xml_text_reader_type, c_reader);

  c_depth = xmlTextReaderDepth(c_reader);

  return (c_depth == -1) ? Qnil : INT2NUM(c_depth);
}

#each

Move the cursor through the document yielding the cursor to the block

[ GitHub ]

  
# File 'lib/nokogiri/xml/reader.rb', line 132

# Move the cursor through the document, yielding the cursor to the block
# after every successful #read. Iteration stops when #read returns a falsy
# value.
#
# Without a block an Enumerator over the same sequence is returned instead
# of raising LocalJumpError on the first yield.
def each
  return enum_for(:each) unless block_given?

  while (cursor = read)
    yield cursor
  end
end

#encoding

[ GitHub ]

  
# File 'ext/nokogiri/xml_reader.c', line 722

/*
 * Get the encoding of the document being read.
 *
 * The encoding reported by xmlTextReaderConstEncoding() is preferred. When
 * there is none, the reader's @encoding instance variable is returned if it
 * is truthy; otherwise nil.
 */
static VALUE
rb_xml_reader_encoding(VALUE rb_reader)
{
  xmlTextReaderPtr c_reader;
  const char *c_encoding;
  VALUE rb_fallback;

  TypedData_Get_Struct(rb_reader, xmlTextReader, &xml_text_reader_type, c_reader);

  c_encoding = (const char *)xmlTextReaderConstEncoding(c_reader);
  if (c_encoding != NULL) {
    return NOKOGIRI_STR_NEW2(c_encoding);
  }

  rb_fallback = rb_iv_get(rb_reader, "@encoding");

  return RTEST(rb_fallback) ? rb_fallback : Qnil;
}

#initialize(source, url = nil, encoding = nil) ⇒ Reader (private)

This method is for internal use only.

Returns:

  • (Reader)

    a new instance of Reader

[ GitHub ]

  
# File 'lib/nokogiri/xml/reader.rb', line 114

# Internal constructor: remembers the input and the requested encoding and
# starts with an empty error list. +url+ is accepted but not stored here.
private def initialize(source, url = nil, encoding = nil) # :nodoc:
  @source = source
  @errors = []
  @encoding = encoding
end

#inner_xml

Read the contents of the current node, including child nodes and markup. Returns a utf-8 encoded string.

[ GitHub ]

  
# File 'ext/nokogiri/xml_reader.c', line 562

/*
 * Read the contents of the current node, including child nodes and markup,
 * as a Ruby String. Returns nil when xmlTextReaderReadInnerXml() yields
 * nothing. The libxml2 buffer is released with xmlFree() once copied.
 */
static VALUE
inner_xml(VALUE self)
{
  xmlTextReaderPtr c_reader;
  xmlChar *c_xml;
  VALUE rb_xml;

  TypedData_Get_Struct(self, xmlTextReader, &xml_text_reader_type, c_reader);

  c_xml = xmlTextReaderReadInnerXml(c_reader);
  if (c_xml == NULL) { return Qnil; }

  rb_xml = NOKOGIRI_STR_NEW2((char *)c_xml);
  xmlFree(c_xml);

  return rb_xml;
}

#lang

Get the xml:lang scope within which the node resides.

[ GitHub ]

  
# File 'ext/nokogiri/xml_reader.c', line 349

/*
 * Get the xml:lang scope within which the current node resides, or nil
 * when xmlTextReaderConstXmlLang() returns NULL.
 */
static VALUE
lang(VALUE self)
{
  xmlTextReaderPtr c_reader;
  const char *c_lang;

  TypedData_Get_Struct(self, xmlTextReader, &xml_text_reader_type, c_reader);

  c_lang = (const char *)xmlTextReaderConstXmlLang(c_reader);

  return (c_lang != NULL) ? NOKOGIRI_STR_NEW2(c_lang) : Qnil;
}

#local_name

Get the local name of the node

[ GitHub ]

  
# File 'ext/nokogiri/xml_reader.c', line 425

/*
 * Get the local name of the current node, or nil when
 * xmlTextReaderConstLocalName() returns NULL.
 */
static VALUE
local_name(VALUE self)
{
  xmlTextReaderPtr c_reader;
  const char *c_name;

  TypedData_Get_Struct(self, xmlTextReader, &xml_text_reader_type, c_reader);

  c_name = (const char *)xmlTextReaderConstLocalName(c_reader);

  return (c_name != NULL) ? NOKOGIRI_STR_NEW2(c_name) : Qnil;
}

#name

Get the name of the node. Returns a utf-8 encoded string.

[ GitHub ]

  
# File 'ext/nokogiri/xml_reader.c', line 444

/*
 * Get the name of the current node as a Ruby String, or nil when
 * xmlTextReaderConstName() returns NULL.
 */
static VALUE
name(VALUE self)
{
  xmlTextReaderPtr c_reader;
  const char *c_name;

  TypedData_Get_Struct(self, xmlTextReader, &xml_text_reader_type, c_reader);

  c_name = (const char *)xmlTextReaderConstName(c_reader);

  return (c_name != NULL) ? NOKOGIRI_STR_NEW2(c_name) : Qnil;
}

#namespace_uri

Get the URI defining the namespace associated with the node

[ GitHub ]

  
# File 'ext/nokogiri/xml_reader.c', line 406

/*
 * Get the URI of the namespace associated with the current node, or nil
 * when xmlTextReaderConstNamespaceUri() returns NULL.
 */
static VALUE
namespace_uri(VALUE self)
{
  xmlTextReaderPtr c_reader;
  const char *c_uri;

  TypedData_Get_Struct(self, xmlTextReader, &xml_text_reader_type, c_reader);

  c_uri = (const char *)xmlTextReaderConstNamespaceUri(c_reader);

  return (c_uri != NULL) ? NOKOGIRI_STR_NEW2(c_uri) : Qnil;
}

#namespaces

Get a hash of namespaces for this Node

[ GitHub ]

  
# File 'ext/nokogiri/xml_reader.c', line 141

/*
 * Get a Hash of the namespaces declared on the current node.
 *
 * Returns an empty Hash when the node has no attributes. The node is
 * expanded with xmlTextReaderExpand(); errors reported during the expansion
 * are pushed onto the reader's +errors+ array. If expansion fails, the first
 * collected error is raised as a SyntaxError, or nil is returned when no
 * error was collected.
 */
static VALUE
rb_xml_reader_namespaces(VALUE rb_reader)
{
  VALUE rb_namespaces = rb_hash_new();
  xmlTextReaderPtr c_reader;
  xmlNodePtr c_node;
  VALUE rb_errors;
  libxmlStructuredErrorHandlerState handler_state;

  TypedData_Get_Struct(rb_reader, xmlTextReader, &xml_text_reader_type, c_reader);

  if (!has_attributes(c_reader)) {
    return rb_namespaces;
  }

  rb_errors = rb_funcall(rb_reader, rb_intern("errors"), 0);

  /* Save the current structured error handler and put it back afterwards,
   * rather than unconditionally resetting it to NULL. */
  noko__structured_error_func_save_and_set(&handler_state, (void *)rb_errors, noko__error_array_pusher);
  c_node = xmlTextReaderExpand(c_reader);
  noko__structured_error_func_restore(&handler_state);

  if (c_node == NULL) {
    if (RARRAY_LEN(rb_errors) > 0) {
      VALUE rb_error = rb_ary_entry(rb_errors, 0);
      VALUE exception_message = rb_funcall(rb_error, rb_intern("to_s"), 0);
      rb_exc_raise(rb_class_new_instance(1, &exception_message, cNokogiriXmlSyntaxError));
    }
    return Qnil;
  }

  /* the shared node helper fills rb_namespaces from the expanded node */
  Nokogiri_xml_node_namespaces(c_node, rb_namespaces);

  return rb_namespaces;
}

#node_type

Alias for Node#node_type.

#outer_xml

Read the current node and its contents, including child nodes and markup. Returns a utf-8 encoded string.

[ GitHub ]

  
# File 'ext/nokogiri/xml_reader.c', line 589

/*
 * Read the current node and its contents, including child nodes and markup,
 * as a Ruby String. Returns nil when xmlTextReaderReadOuterXml() yields
 * nothing. The libxml2 buffer is released with xmlFree() once copied.
 */
static VALUE
outer_xml(VALUE self)
{
  xmlTextReaderPtr c_reader;
  xmlChar *c_xml;
  VALUE rb_xml;

  TypedData_Get_Struct(self, xmlTextReader, &xml_text_reader_type, c_reader);

  c_xml = xmlTextReaderReadOuterXml(c_reader);
  if (c_xml == NULL) { return Qnil; }

  rb_xml = NOKOGIRI_STR_NEW2((char *)c_xml);
  xmlFree(c_xml);

  return rb_xml;
}

#prefix

Get the shorthand reference to the namespace associated with the node.

[ GitHub ]

  
# File 'ext/nokogiri/xml_reader.c', line 387

/*
 * Get the shorthand reference (prefix) of the namespace associated with the
 * current node, or nil when xmlTextReaderConstPrefix() returns NULL.
 */
static VALUE
prefix(VALUE self)
{
  xmlTextReaderPtr c_reader;
  const char *c_prefix;

  TypedData_Get_Struct(self, xmlTextReader, &xml_text_reader_type, c_reader);

  c_prefix = (const char *)xmlTextReaderConstPrefix(c_reader);

  return (c_prefix != NULL) ? NOKOGIRI_STR_NEW2(c_prefix) : Qnil;
}

#read

Move the Reader forward through the XML document.

[ GitHub ]

  
# File 'ext/nokogiri/xml_reader.c', line 517

/*
 * Advance the reader with a single xmlTextReaderRead() call.
 *
 * Returns the reader itself when the read status is 1 and nil when it is 0.
 * Any other status raises: the exception produced by SyntaxError.aggregate
 * from the collected errors if there is one, otherwise a RuntimeError that
 * carries the status code.
 */
static VALUE
read_more(VALUE rb_reader)
{
  xmlTextReaderPtr c_reader;
  libxmlStructuredErrorHandlerState handler_state;

  TypedData_Get_Struct(rb_reader, xmlTextReader, &xml_text_reader_type, c_reader);

  /* Route structured errors raised during the read into the reader's
   * `errors` array, saving the previous handler so it can be restored. */
  VALUE rb_errors = rb_funcall(rb_reader, rb_intern("errors"), 0);
  noko__structured_error_func_save_and_set(&handler_state, (void *)rb_errors, noko__error_array_pusher);

  int status = xmlTextReaderRead(c_reader);

  noko__structured_error_func_restore(&handler_state);

  /* If the current document has no encoding yet, fill it in from the
   * reader's @encoding; when that is unset too, default both to UTF-8. */
  xmlDocPtr c_document = xmlTextReaderCurrentDoc(c_reader);
  if (c_document && c_document->encoding == NULL) {
    VALUE constructor_encoding = rb_iv_get(rb_reader, "@encoding");
    if (RTEST(constructor_encoding)) {
      c_document->encoding = xmlStrdup(BAD_CAST StringValueCStr(constructor_encoding));
    } else {
      rb_iv_set(rb_reader, "@encoding", NOKOGIRI_STR_NEW2("UTF-8"));
      c_document->encoding = xmlStrdup(BAD_CAST "UTF-8");
    }
  }

  /* 1: a node was read; 0: nothing more to read */
  if (status == 1) { return rb_reader; }
  if (status == 0) { return Qnil; }

  /* if we're here, there was an error */
  VALUE exception = rb_funcall(cNokogiriXmlSyntaxError, rb_intern("aggregate"), 1, rb_errors);
  if (RB_TEST(exception)) {
    rb_exc_raise(exception);
  } else {
    /* no exception could be built from the collected errors: report the raw status */
    rb_raise(rb_eRuntimeError, "Error pulling: %d", status);
  }
}

#state

Get the state of the reader

[ GitHub ]

  
# File 'ext/nokogiri/xml_reader.c', line 489

/*
 * Get the state of the reader: the value of xmlTextReaderReadState()
 * converted to a Ruby Integer.
 */
static VALUE
state(VALUE self)
{
  xmlTextReaderPtr c_reader;
  int c_state;

  TypedData_Get_Struct(self, xmlTextReader, &xml_text_reader_type, c_reader);

  c_state = xmlTextReaderReadState(c_reader);

  return INT2NUM(c_state);
}

#xml_version

Get the XML version of the document being read

[ GitHub ]

  
# File 'ext/nokogiri/xml_reader.c', line 330

/*
 * Get the XML version of the document being read, or nil when
 * xmlTextReaderConstXmlVersion() returns NULL.
 */
static VALUE
xml_version(VALUE self)
{
  xmlTextReaderPtr c_reader;
  const char *c_version;

  TypedData_Get_Struct(self, xmlTextReader, &xml_text_reader_type, c_reader);

  c_version = (const char *)xmlTextReaderConstXmlVersion(c_reader);

  return (c_version != NULL) ? NOKOGIRI_STR_NEW2(c_version) : Qnil;
}