Class: LibXML::XML::HTMLParser::Context
Relationships & Source Files | |
Super Chains via Extension / Inclusion / Inheritance | |
Class Chain:
|
|
Instance Chain:
|
|
Inherits: |
LibXML::XML::Parser::Context
|
Defined in: | ext/libxml/ruby_xml_html_parser_context.c, ext/libxml/ruby_xml_html_parser_context.c |
Overview
The Context
class provides in-depth control over how a document is parsed.
Class Method Summary
-
XML::HTMLParser::Context.file(file) ⇒ Context
Creates a new parser context based on the specified file or uri.
-
XML::HTMLParser::Context.io(io) ⇒ Context
Creates a new parser context based on the specified io object.
-
XML::HTMLParser::Context.string(string) ⇒ Context
Creates a new parser context based on the specified string.
::LibXML::XML::Parser::Context
- Inherited
Instance Attribute Summary
-
#disable_cdata=(true|false)
writeonly
Control whether the CDATA nodes will be created in this context.
-
#options=(XML::Parser::Options::NOENT | XML::Parser::Options::NOCDATA)
writeonly
Provides control over the execution of a parser.
::LibXML::XML::Parser::Context
- Inherited
#base_uri | Obtain the base url for this parser context. |
#base_uri= | Sets the base url for this parser context. |
#disable_cdata= | Control whether CDATA nodes will be created in this context. |
#disable_cdata? | Determine whether CDATA nodes will be created in this context. |
#disable_sax? | Determine whether SAX-based processing is disabled in this context. |
#docbook? | Determine whether this is a docbook context. |
#encoding | Obtain the character encoding identifier used in this context. |
#encoding= | Sets the character encoding for this context. |
#html? | Determine whether this is an html context. |
#keep_blanks? | Determine whether parsers in this context retain whitespace. |
#options | Returns the parser options for this context. |
#options= | Provides control over the execution of a parser. |
#recovery= | Control whether recovery mode is enabled in this context. |
#recovery? | Determine whether recovery mode is enabled in this context. |
#replace_entities= | Control whether external entity replacement is enabled in this context. |
#replace_entities? | Determine whether external entity replacement is enabled in this context. |
#standalone? | Determine whether this is a standalone context. |
#stats? | Determine whether this context maintains statistics. |
#subset_external? | Determine whether this context is a subset of an external context. |
#subset_internal? | Determine whether this context is a subset of an internal context. |
#validate? | Determine whether validation is enabled in this context. |
#well_formed? | Determine whether this context contains well-formed XML. |
Instance Method Summary
-
#close ⇒ nil
Closes the underlying input streams.
::LibXML::XML::Parser::Context
- Inherited
#close | Closes the underlying input streams. |
#data_directory | Obtain the data directory associated with this context. |
#depth | Obtain the depth of this context. |
#errno | Obtain the last-error number in this context. |
#io_max_num_streams | Obtain the limit on the number of IO streams opened in this context. |
#io_num_streams | Obtain the actual number of IO streams in this context. |
#name_depth | Obtain the name depth for this context. |
#name_depth_max | Obtain the maximum name depth for this context. |
#name_node | Obtain the name node for this context. |
#name_tab | Obtain the name table for this context. |
#node | Obtain the root node of this context. |
#node_depth | Obtain the node depth for this context. |
#node_depth_max | Obtain the maximum node depth for this context. |
#num_chars | Obtain the number of characters in this context. |
#space_depth | Obtain the space depth for this context. |
#space_depth_max | Obtain the maximum space depth for this context. |
#subset_external_system_id | Obtain this context’s external subset system identifier. |
#subset_external_uri | Obtain this context’s external subset URI. |
#subset_internal_name | Obtain this context’s subset name (valid only if either of subset_external? or subset_internal? is true). |
#valid | Determine whether this context is valid. |
#version | Obtain this context’s version identifier. |
Class Method Details
XML::HTMLParser::Context.file(file) ⇒ Context
Creates a new parser context based on the specified file or uri.
Parameters:
file - A filename or uri
options - An OR'ed-together list of LibXML::XML::HTMLParser::Options values (optional)
# File 'ext/libxml/ruby_xml_html_parser_context.c', line 152
/*
 * call-seq:
 *    XML::HTMLParser::Context.file(file, options = nil) -> XML::HTMLParser::Context
 *
 * Creates a new HTML parser context for the given file name or uri.
 *
 * Parameters:
 *
 *  file    - A filename or uri (must be string-like).
 *  options - Optional; an or'ed together list of
 *            LibXML::XML::HTMLParser::Options values. nil means 0.
 *
 * Raises a TypeError/RangeError if an argument cannot be converted, and
 * reports the last libxml error through rxml_raise if the context cannot
 * be created.
 */
static VALUE rxml_html_parser_context_file(int argc, VALUE* argv, VALUE klass)
{
  VALUE file, options;
  int html_options;
  htmlParserCtxtPtr ctxt;

  rb_scan_args(argc, argv, "11", &file, &options);

  /* Validate and convert both arguments BEFORE the context is allocated.
     Either conversion can raise a Ruby exception, and raising after
     htmlCreateFileParserCtxt succeeded would leak the context. */
  StringValue(file);
  html_options = NIL_P(options) ? 0 : NUM2INT(options);

  ctxt = htmlCreateFileParserCtxt(StringValuePtr(file), NULL);
  if (!ctxt)
    rxml_raise(xmlGetLastError());

  /* This is annoying, but xmlInitParserCtxt (called indirectly above) and
     xmlCtxtUseOptionsInternal (called below) initialize slightly different
     context options, in particular XML_PARSE_NODICT which xmlInitParserCtxt
     sets to 0 and xmlCtxtUseOptionsInternal sets to 1. So we have to call both. */
  htmlCtxtUseOptions(ctxt, html_options);

  return rxml_html_parser_context_wrap(ctxt);
}
XML::HTMLParser::Context.io(io) ⇒ Context
Creates a new parser context based on the specified io object.
Parameters:
io - A ruby IO object
options - An OR'ed-together list of LibXML::XML::HTMLParser::Options values (optional)
# File 'ext/libxml/ruby_xml_html_parser_context.c', line 180
/*
 * call-seq:
 *    XML::HTMLParser::Context.io(io, options = nil) -> XML::HTMLParser::Context
 *
 * Creates a new HTML parser context that reads from the given io object.
 *
 * Parameters:
 *
 *  io      - A ruby IO object; nil raises a TypeError.
 *  options - Optional; an or'ed together list of
 *            LibXML::XML::HTMLParser::Options values. nil means 0.
 *
 * On any libxml allocation failure the resources created so far are
 * released and the last libxml error is reported through rxml_raise.
 */
static VALUE rxml_html_parser_context_io(int argc, VALUE* argv, VALUE klass)
{
  VALUE io, options;
  VALUE result;
  int html_options;
  htmlParserCtxtPtr ctxt;
  xmlParserInputBufferPtr input;
  xmlParserInputPtr stream;

  rb_scan_args(argc, argv, "11", &io, &options);

  if (NIL_P(io))
    rb_raise(rb_eTypeError, "Must pass in an IO object");

  /* Convert the options before any libxml resource exists: NUM2INT raises
     on a non-numeric value, and raising later would leak both the input
     buffer and the parser context. */
  html_options = NIL_P(options) ? 0 : NUM2INT(options);

  input = xmlParserInputBufferCreateIO((xmlInputReadCallback) rxml_read_callback, NULL,
                                       (void*)io, XML_CHAR_ENCODING_NONE);
  if (!input)
    rxml_raise(xmlGetLastError());

  ctxt = htmlNewParserCtxt();
  if (!ctxt)
  {
    xmlFreeParserInputBuffer(input);
    rxml_raise(xmlGetLastError());
  }

  /* This is annoying, but xmlInitParserCtxt (called indirectly above) and
     xmlCtxtUseOptionsInternal (called below) initialize slightly different
     context options, in particular XML_PARSE_NODICT which xmlInitParserCtxt
     sets to 0 and xmlCtxtUseOptionsInternal sets to 1. So we have to call both. */
  htmlCtxtUseOptions(ctxt, html_options);

  stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
  if (!stream)
  {
    xmlFreeParserInputBuffer(input);
    xmlFreeParserCtxt(ctxt);
    rxml_raise(xmlGetLastError());
  }
  inputPush(ctxt, stream);
  result = rxml_html_parser_context_wrap(ctxt);

  /* Attach io object to parser so it won't get freed.*/
  rb_ivar_set(result, IO_ATTR, io);

  return result;
}
XML::HTMLParser::Context.string(string) ⇒ Context
Creates a new parser context based on the specified string.
Parameters:
string - A string that contains the data to parse
options - An OR'ed-together list of LibXML::XML::HTMLParser::Options values (optional)
# File 'ext/libxml/ruby_xml_html_parser_context.c', line 236
/*
 * call-seq:
 *    XML::HTMLParser::Context.string(string, options = nil) -> XML::HTMLParser::Context
 *
 * Creates a new HTML parser context that parses the given string.
 *
 * Parameters:
 *
 *  string  - A non-empty String containing the data to parse. A non-String
 *            raises a TypeError, an empty String raises an ArgumentError.
 *  options - Optional; an or'ed together list of
 *            LibXML::XML::HTMLParser::Options values. nil means 0.
 */
static VALUE rxml_html_parser_context_string(int argc, VALUE* argv, VALUE klass)
{
  VALUE string, options;
  int html_options;
  htmlParserCtxtPtr ctxt;

  rb_scan_args(argc, argv, "11", &string, &options);

  Check_Type(string, T_STRING);

  if (RSTRING_LEN(string) == 0)
    rb_raise(rb_eArgError, "Must specify a string with one or more characters");

  /* Convert the options before the context is allocated: NUM2INT raises on
     a non-numeric value, and raising afterwards would leak the context. */
  html_options = NIL_P(options) ? 0 : NUM2INT(options);

  ctxt = xmlCreateMemoryParserCtxt(StringValuePtr(string), (int)RSTRING_LEN(string));
  if (!ctxt)
    rxml_raise(xmlGetLastError());

  /* Setup sax handler.
     TODO - there must be a better way? The sax handler is initialized for
     XML, but we want to use HTML.
     This reset has to happen BEFORE the options are applied:
     htmlCtxtUseOptions may adjust callbacks on ctxt->sax (e.g. for the
     NOERROR / NOWARNING / NOBLANKS flags), and wiping the handler afterwards
     would silently discard those adjustments. */
  memset(ctxt->sax, 0, sizeof(xmlSAXHandler));
  xmlSAX2InitHtmlDefaultSAXHandler(ctxt->sax);

  /* This is annoying, but xmlInitParserCtxt (called indirectly above) and
     xmlCtxtUseOptionsInternal (called below) initialize slightly different
     context options, in particular XML_PARSE_NODICT which xmlInitParserCtxt
     sets to 0 and xmlCtxtUseOptionsInternal sets to 1. So we have to call both. */
  htmlCtxtUseOptions(ctxt, html_options);

  return rxml_html_parser_context_wrap(ctxt);
}
Instance Attribute Details
#disable_cdata=(true|false) (writeonly)
Control whether the CDATA nodes will be created in this context.
# File 'ext/libxml/ruby_xml_html_parser_context.c', line 293
/*
 * call-seq:
 *    context.disable_cdata = true|false
 *
 * Control whether CDATA nodes will be created in this context.
 * A truthy value disables them; false or nil re-enables them.
 *
 * Raises a RuntimeError if the context has no SAX handler yet.
 * Returns the value that was passed in.
 */
static VALUE rxml_html_parser_context_disable_cdata_set(VALUE self, VALUE value)
{
  htmlParserCtxtPtr ctxt;
  Data_Get_Struct(self, htmlParserCtxt, ctxt);

  if (ctxt->sax == NULL)
    rb_raise(rb_eRuntimeError, "Sax handler is not yet set");

  /* LibXML controls this internally with the default SAX handler.
     RTEST is required here: a raw `if (value)` only treats Qfalse as false,
     because Qnil is a non-zero VALUE, so assigning nil would wrongly
     disable CDATA nodes. */
  if (RTEST(value))
    ctxt->sax->cdataBlock = NULL;
  else
    ctxt->sax->cdataBlock = xmlSAX2CDataBlock;

  return value;
}
#options=(XML::Parser::Options::NOENT | XML::Parser::Options::NOCDATA) (writeonly)
Provides control over the execution of a parser. Valid values are the constants defined on ::LibXML::XML::Parser::Options. Multiple options can be combined by using Bitwise OR (|).
# File 'ext/libxml/ruby_xml_html_parser_context.c', line 319
/*
 * call-seq:
 *    context.options = flags
 *
 * Applies the given parser option flags to this context and returns self.
 * Several flags can be combined with bitwise OR (|).
 *
 * The integer conversion runs before the Fixnum type check, so a value
 * that NUM2INT rejects raises from the conversion first.
 */
static VALUE rxml_html_parser_context_options_set(VALUE self, VALUE options)
{
  htmlParserCtxtPtr parser;
  int flags = NUM2INT(options);

  Check_Type(options, T_FIXNUM);
  Data_Get_Struct(self, htmlParserCtxt, parser);

  htmlCtxtUseOptions(parser, flags);

#if LIBXML_VERSION >= 20707
  /* Big hack here, but htmlCtxtUseOptions doesn't support
     HTML_PARSE_NOIMPLIED, so the flag is set on the context by hand.
     There must be a better way??? */
  if ((flags & HTML_PARSE_NOIMPLIED) != 0)
    parser->options |= HTML_PARSE_NOIMPLIED;
#endif

  return self;
}
Instance Method Details
#close ⇒ nil
Closes the underlying input streams. This is useful when parsing a large number of files and you want to close the files without relying on Ruby’s garbage collector to run.
# File 'ext/libxml/ruby_xml_html_parser_context.c', line 274
/*
 * call-seq:
 *    context.close -> nil
 *
 * Closes the underlying input streams: every input still on the context's
 * input stack is popped and freed. Always returns nil.
 */
static VALUE rxml_html_parser_context_close(VALUE self)
{
  htmlParserCtxtPtr ctxt;
  xmlParserInputPtr current;

  Data_Get_Struct(self, htmlParserCtxt, ctxt);

  /* Drain the input stack; inputPop yields NULL once it is empty. */
  for (current = inputPop(ctxt); current != NULL; current = inputPop(ctxt))
  {
    xmlFreeInputStream(current);
  }

  return Qnil;
}