123456789_123456789_123456789_123456789_123456789_

Class: LibXML::XML::HTMLParser::Context

Relationships & Source Files
Super Chains via Extension / Inclusion / Inheritance
Class Chain:
Instance Chain:
Inherits: LibXML::XML::Parser::Context
Defined in: ext/libxml/ruby_xml_html_parser_context.c,
ext/libxml/ruby_xml_html_parser_context.c

Overview

The Context class provides in-depth control over how a document is parsed.

Class Method Summary

::LibXML::XML::Parser::Context - Inherited

.document

Creates a new parser context based on the specified document.

.file

Creates a new parser context based on the specified file or uri.

.io

Creates a new parser context based on the specified io object.

.string

Creates a new parser context based on the specified string.

Instance Attribute Summary

::LibXML::XML::Parser::Context - Inherited

#base_uri

Obtain the base url for this parser context.

#base_uri=

Sets the base url for this parser context.

#disable_cdata=

Control whether CDATA nodes will be created in this context.

#disable_cdata?

Determine whether CDATA nodes will be created in this context.

#disable_sax?

Determine whether SAX-based processing is disabled in this context.

#docbook?

Determine whether this is a docbook context.

#encoding

Obtain the character encoding identifier used in this context.

#encoding=

Sets the character encoding for this context.

#html?

Determine whether this is an html context.

#keep_blanks?

Determine whether parsers in this context retain whitespace.

#options

Returns the parser options for this context.

#options=

Provides control over the execution of a parser.

#recovery=

Control whether recovery mode is enabled in this context.

#recovery?

Determine whether recovery mode is enabled in this context.

#replace_entities=

Control whether external entity replacement is enabled in this context.

#replace_entities?

Determine whether external entity replacement is enabled in this context.

#standalone?

Determine whether this is a standalone context.

#stats?

Determine whether this context maintains statistics.

#subset_external?

Determine whether this context is a subset of an external context.

#subset_internal?

Determine whether this context is a subset of an internal context.

#validate?

Determine whether validation is enabled in this context.

#well_formed?

Determine whether this context contains well-formed XML.

Instance Method Summary

::LibXML::XML::Parser::Context - Inherited

#close

Closes the underlying input streams.

#data_directory

Obtain the data directory associated with this context.

#depth

Obtain the depth of this context.

#errno

Obtain the last-error number in this context.

#io_max_num_streams

Obtain the limit on the number of IO streams opened in this context.

#io_num_streams

Obtain the actual number of IO streams in this context.

#name_depth

Obtain the name depth for this context.

#name_depth_max

Obtain the maximum name depth for this context.

#name_node

Obtain the name node for this context.

#name_tab

Obtain the name table for this context.

#node

Obtain the root node of this context.

#node_depth

Obtain the node depth for this context.

#node_depth_max

Obtain the maximum node depth for this context.

#num_chars

Obtain the number of characters in this context.

#space_depth

Obtain the space depth for this context.

#space_depth_max

Obtain the maximum space depth for this context.

#subset_external_system_id

Obtain this context’s external subset system identifier.

#subset_external_uri

Obtain this context’s external subset URI.

#subset_internal_name

Obtain this context’s subset name (valid only if either of subset_external? or subset_internal? is true).

#valid

Determine whether this context is valid.

#version

Obtain this context’s version identifier.

Class Method Details

XML::HTMLParser::Context.file(file) ⇒ Context

Creates a new parser context based on the specified file or uri.

Parameters:

file - A filename or uri
options - (optional) A bitwise OR of LibXML::XML::HTMLParser::Options values
[ GitHub ]

  
# File 'ext/libxml/ruby_xml_html_parser_context.c', line 152

/*
 * Creates an HTML parser context that reads from a file or URI.
 *
 * argc/argv - Ruby arguments: a required file name (or URI) string followed
 *             by an optional integer of parser option flags.
 * klass     - The receiving class (not used).
 *
 * Returns the new context wrapped by rxml_html_parser_context_wrap.  When
 * libxml cannot create the context, the last libxml error is handed to
 * rxml_raise.
 */
static VALUE rxml_html_parser_context_file(int argc, VALUE* argv, VALUE klass)
{
  VALUE file, options;
  rb_scan_args(argc, argv, "11", &file, &options);

  /* Convert the options before allocating anything: NUM2INT raises when the
     argument is not an integer, and raising after the context exists would
     leak it. */
  int flags = options == Qnil ? 0 : NUM2INT(options);

  htmlParserCtxtPtr ctxt = htmlCreateFileParserCtxt(StringValuePtr(file), NULL);
  if (!ctxt)
    rxml_raise(xmlGetLastError());

  /* This is annoying, but xmlInitParserCtxt (called indirectly above) and
     xmlCtxtUseOptionsInternal (called below) initialize slightly different
     context options, in particular XML_PARSE_NODICT which xmlInitParserCtxt
     sets to 0 and xmlCtxtUseOptionsInternal sets to 1.  So we have to call both. */
  htmlCtxtUseOptions(ctxt, flags);

  return rxml_html_parser_context_wrap(ctxt);
}

XML::HTMLParser::Context.io(io) ⇒ Context

Creates a new parser context based on the specified io object.

Parameters:

io - A ruby IO object
options - (optional) A bitwise OR of LibXML::XML::HTMLParser::Options values
[ GitHub ]

  
# File 'ext/libxml/ruby_xml_html_parser_context.c', line 180

/*
 * Creates an HTML parser context that pulls its data from a Ruby IO object.
 *
 * argc/argv - Ruby arguments: a required IO object followed by an optional
 *             integer of parser option flags.
 * klass     - The receiving class (not used).
 *
 * Returns the new context wrapped by rxml_html_parser_context_wrap, with the
 * IO object stored on it under IO_ATTR.  Raises TypeError when io is nil;
 * libxml failures are handed to rxml_raise with the last libxml error.
 */
static VALUE rxml_html_parser_context_io(int argc, VALUE* argv, VALUE klass)
{
  VALUE io, options;
  rb_scan_args(argc, argv, "11", &io, &options);

  VALUE result;
  int flags;
  htmlParserCtxtPtr ctxt;
  xmlParserInputBufferPtr input;
  xmlParserInputPtr stream;

  if (NIL_P(io))
    rb_raise(rb_eTypeError, "Must pass in an IO object");

  /* Convert the options before allocating anything: NUM2INT raises when the
     argument is not an integer, and raising after the buffer and context
     exist would leak both. */
  flags = NIL_P(options) ? 0 : NUM2INT(options);

  input = xmlParserInputBufferCreateIO((xmlInputReadCallback) rxml_read_callback, NULL,
                                     (void*)io, XML_CHAR_ENCODING_NONE);
  /* Fail right away instead of carrying a NULL buffer through the rest of
     the setup. */
  if (!input)
    rxml_raise(xmlGetLastError());

  ctxt = htmlNewParserCtxt();
  if (!ctxt)
  {
    xmlFreeParserInputBuffer(input);
    rxml_raise(xmlGetLastError());
  }

  /* This is annoying, but xmlInitParserCtxt (called indirectly above) and
     xmlCtxtUseOptionsInternal (called below) initialize slightly different
     context options, in particular XML_PARSE_NODICT which xmlInitParserCtxt
     sets to 0 and xmlCtxtUseOptionsInternal sets to 1.  So we have to call both. */
  htmlCtxtUseOptions(ctxt, flags);

  stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);

  if (!stream)
  {
    xmlFreeParserInputBuffer(input);
    xmlFreeParserCtxt(ctxt);
    rxml_raise(xmlGetLastError());
  }
  inputPush(ctxt, stream);
  result = rxml_html_parser_context_wrap(ctxt);

  /* Attach io object to parser so it won't get freed.*/
  rb_ivar_set(result, IO_ATTR, io);

  return result;
}

XML::HTMLParser::Context.string(string) ⇒ Context

Creates a new parser context based on the specified string.

Parameters:

string - A string that contains the data to parse
options - (optional) A bitwise OR of LibXML::XML::HTMLParser::Options values
[ GitHub ]

  
# File 'ext/libxml/ruby_xml_html_parser_context.c', line 236

/*
 * Creates an HTML parser context that reads from an in-memory Ruby string.
 *
 * argc/argv - Ruby arguments: a required, non-empty String followed by an
 *             optional integer of parser option flags.
 * klass     - The receiving class (not used).
 *
 * Returns the new context wrapped by rxml_html_parser_context_wrap.  Raises
 * TypeError (via Check_Type) when the first argument is not a String and
 * ArgumentError when it is empty; when libxml cannot create the context the
 * last libxml error is handed to rxml_raise.
 */
static VALUE rxml_html_parser_context_string(int argc, VALUE* argv, VALUE klass)
{
  VALUE string, options;
  rb_scan_args(argc, argv, "11", &string, &options);

  Check_Type(string, T_STRING);

  if (RSTRING_LEN(string) == 0)
    rb_raise(rb_eArgError, "Must specify a string with one or more characters");

  /* Convert the options before allocating anything: NUM2INT raises when the
     argument is not an integer, and raising after the context exists would
     leak it. */
  int flags = options == Qnil ? 0 : NUM2INT(options);

  htmlParserCtxtPtr ctxt = xmlCreateMemoryParserCtxt(StringValuePtr(string),
                                   (int)RSTRING_LEN(string));
  if (!ctxt)
    rxml_raise(xmlGetLastError());

  /* This is annoying, but xmlInitParserCtxt (called indirectly above) and
     xmlCtxtUseOptionsInternal (called below) initialize slightly different
     context options, in particular XML_PARSE_NODICT which xmlInitParserCtxt
     sets to 0 and xmlCtxtUseOptionsInternal sets to 1.  So we have to call both. */
  htmlCtxtUseOptions(ctxt, flags);

  // Setup sax handler
  // TODO - there must be a better way? The sax handler is initialized for XML, but we want
  // to use HTML
  memset(ctxt->sax, 0, sizeof(xmlSAXHandler));
  xmlSAX2InitHtmlDefaultSAXHandler(ctxt->sax);

  return rxml_html_parser_context_wrap(ctxt);
}

Instance Attribute Details

#disable_cdata=(true|false) (writeonly)

Control whether the CDATA nodes will be created in this context.

[ GitHub ]

  
# File 'ext/libxml/ruby_xml_html_parser_context.c', line 293

/*
 * Turns creation of CDATA nodes on or off for this context by installing or
 * clearing the cdataBlock callback of the context's SAX handler.
 *
 * self  - The wrapped HTML parser context.
 * value - Ruby truthy value to disable CDATA nodes, false or nil to enable
 *         them.
 *
 * Returns value.  Raises RuntimeError when the context has no SAX handler.
 */
static VALUE rxml_html_parser_context_disable_cdata_set(VALUE self, VALUE value)
{
  htmlParserCtxtPtr ctxt;
  Data_Get_Struct(self, htmlParserCtxt, ctxt);

  if (ctxt->sax == NULL)
    rb_raise(rb_eRuntimeError, "Sax handler is not yet set");

  /* LibXML controls this internally with the default SAX handler.
     RTEST is required here: a raw VALUE is non-zero for nil, so testing it
     directly would treat nil as true. */
  if (RTEST(value))
    ctxt->sax->cdataBlock = NULL;
  else
    ctxt->sax->cdataBlock = xmlSAX2CDataBlock;

  return value;
}

#options=(XML::Parser::Options::NOENT | XML::Parser::Options::NOCDATA) (writeonly)

Provides control over the execution of a parser. Valid values are the constants defined on ::LibXML::XML::Parser::Options. Multiple options can be combined by using Bitwise OR (|).

[ GitHub ]

  
# File 'ext/libxml/ruby_xml_html_parser_context.c', line 319

/*
 * Applies a set of parser option flags to this HTML parser context.
 *
 * self    - The wrapped HTML parser context.
 * options - A Fixnum holding the option flags OR'ed together.
 *
 * Returns self.
 */
static VALUE rxml_html_parser_context_options_set(VALUE self, VALUE options)
{
  htmlParserCtxtPtr parser_ctxt;
  const int requested = NUM2INT(options);

  Check_Type(options, T_FIXNUM);
  Data_Get_Struct(self, htmlParserCtxt, parser_ctxt);

  htmlCtxtUseOptions(parser_ctxt, requested);

#if LIBXML_VERSION >= 20707
  /* Big hack: htmlCtxtUseOptions doesn't support HTML_PARSE_NOIMPLIED, so
     the flag is set on the context by hand when the caller asked for it.
     There must be a better way. */
  if ((requested & HTML_PARSE_NOIMPLIED) != 0)
    parser_ctxt->options |= HTML_PARSE_NOIMPLIED;
#endif

  return self;
}

Instance Method Details

#close ⇒ nil

Closes the underlying input streams. This is useful when parsing a large number of files and you want to close the files without relying on Ruby’s garbage collector to run.

[ GitHub ]

  
# File 'ext/libxml/ruby_xml_html_parser_context.c', line 274

/*
 * Pops every input stream off this context and frees each one.
 *
 * self - The wrapped HTML parser context.
 *
 * Returns nil.
 */
static VALUE rxml_html_parser_context_close(VALUE self)
{
  htmlParserCtxtPtr ctxt;
  Data_Get_Struct(self, htmlParserCtxt, ctxt);

  /* Keep popping until the context has no inputs left. */
  for (;;)
  {
    xmlParserInputPtr popped = inputPop(ctxt);
    if (popped == NULL)
      break;
    xmlFreeInputStream(popped);
  }

  return Qnil;
}