Class: LibXML::XML::HTMLParser::Context
| Relationships & Source Files | |
| Super Chains via Extension / Inclusion / Inheritance | |
|
Class Chain:
|
|
|
Instance Chain:
|
|
| Inherits: |
LibXML::XML::Parser::Context
|
| Defined in: | ext/libxml/ruby_xml_html_parser_context.c, ext/libxml/ruby_xml_html_parser_context.c |
Overview
The Context class provides in-depth control over how a document is parsed.
Class Method Summary
-
XML::HTMLParser::Context.file(file) ⇒ Context
Creates a new parser context based on the specified file or uri.
-
XML::HTMLParser::Context.io(io) ⇒ Context
Creates a new parser context based on the specified io object.
-
XML::HTMLParser::Context.string(string) ⇒ Context
Creates a new parser context based on the specified string.
::LibXML::XML::Parser::Context - Inherited
Instance Attribute Summary
-
#disable_cdata=(true|false)
writeonly
Control whether the CDATA nodes will be created in this context.
-
#options=(XML::Parser::Options::NOENT | XML::Parser::Options::NOCDATA)
writeonly
Provides control over the execution of a parser.
::LibXML::XML::Parser::Context - Inherited
| #base_uri | Obtain the base url for this parser context. |
| #base_uri= | Sets the base url for this parser context. |
| #disable_cdata= | Control whether CDATA nodes will be created in this context. |
| #disable_cdata? | Determine whether CDATA nodes will be created in this context. |
| #disable_sax? | Determine whether SAX-based processing is disabled in this context. |
| #docbook? | Determine whether this is a docbook context. |
| #encoding | Obtain the character encoding identifier used in this context. |
| #encoding= | Sets the character encoding for this context. |
| #html? | Determine whether this is an html context. |
| #keep_blanks? | Determine whether parsers in this context retain whitespace. |
| #options | Returns the parser options for this context. |
| #options= | Provides control over the execution of a parser. |
| #recovery= | Control whether recovery mode is enabled in this context. |
| #recovery? | Determine whether recovery mode is enabled in this context. |
| #replace_entities= | Control whether external entity replacement is enabled in this context. |
| #replace_entities? | Determine whether external entity replacement is enabled in this context. |
| #standalone? | Determine whether this is a standalone context. |
| #stats? | Determine whether this context maintains statistics. |
| #subset_external? | Determine whether this context is a subset of an external context. |
| #subset_internal? | Determine whether this context is a subset of an internal context. |
| #validate? | Determine whether validation is enabled in this context. |
| #well_formed? | Determine whether this context contains well-formed XML. |
Instance Method Summary
-
#close ⇒ nil
Closes the underlying input streams.
::LibXML::XML::Parser::Context - Inherited
| #close | Closes the underlying input streams. |
| #data_directory | Obtain the data directory associated with this context. |
| #depth | Obtain the depth of this context. |
| #errno | Obtain the last-error number in this context. |
| #io_max_num_streams | Obtain the limit on the number of IO streams opened in this context. |
| #io_num_streams | Obtain the actual number of IO streams in this context. |
| #name_depth | Obtain the name depth for this context. |
| #name_depth_max | Obtain the maximum name depth for this context. |
| #name_node | Obtain the name node for this context. |
| #name_tab | Obtain the name table for this context. |
| #node | Obtain the root node of this context. |
| #node_depth | Obtain the node depth for this context. |
| #node_depth_max | Obtain the maximum node depth for this context. |
| #num_chars | Obtain the number of characters in this context. |
| #space_depth | Obtain the space depth for this context. |
| #space_depth_max | Obtain the maximum space depth for this context. |
| #subset_external_system_id | Obtain this context’s external subset system identifier. |
| #subset_external_uri | Obtain this context’s external subset URI. |
| #subset_internal_name | Obtain this context’s subset name (valid only if either of subset_external? or subset_internal? is true). |
| #valid | Determine whether this context is valid. |
| #version | Obtain this context’s version identifier. |
Class Method Details
XML::HTMLParser::Context.file(file) ⇒ Context
Creates a new parser context based on the specified file or uri.
Parameters:
file - A filename or uri
options - An optional OR'ed-together list of LibXML::XML::HTMLParser::Options values
# File 'ext/libxml/ruby_xml_html_parser_context.c', line 152
/*
 * Creates an HTML parser context for the given file name or URI.
 *
 * Arguments (via argc/argv): a required file name/URI and an optional
 * integer holding OR'ed-together parser options. A nil or omitted options
 * argument is treated as 0.
 *
 * When libxml2 cannot create the context, the last libxml2 error is handed
 * to rxml_raise().
 */
static VALUE rxml_html_parser_context_file(int argc, VALUE* argv, VALUE klass)
{
  VALUE file, options;
  rb_scan_args(argc, argv, "11", &file, &options);

  /* Run every conversion that can raise a Ruby exception BEFORE the context
     is allocated; raising afterwards would leak the context. The file
     argument is coerced first so it keeps precedence over options errors. */
  StringValue(file);
  int xml_options = NIL_P(options) ? 0 : NUM2INT(options);

  htmlParserCtxtPtr ctxt = htmlCreateFileParserCtxt(StringValuePtr(file), NULL);
  if (!ctxt)
    rxml_raise(xmlGetLastError());

  /* This is annoying, but xmlInitParserCtxt (called indirectly above) and
     xmlCtxtUseOptionsInternal (called below) initialize slightly different
     context options, in particular XML_PARSE_NODICT which xmlInitParserCtxt
     sets to 0 and xmlCtxtUseOptionsInternal sets to 1. So we have to call both. */
  htmlCtxtUseOptions(ctxt, xml_options);

  return rxml_html_parser_context_wrap(ctxt);
}
XML::HTMLParser::Context.io(io) ⇒ Context
Creates a new parser context based on the specified io object.
Parameters:
io - A ruby IO object
options - An optional OR'ed-together list of LibXML::XML::HTMLParser::Options values
# File 'ext/libxml/ruby_xml_html_parser_context.c', line 180
/*
 * Creates an HTML parser context that reads its input from a Ruby IO object
 * through rxml_read_callback.
 *
 * Arguments (via argc/argv): a required IO object and an optional integer
 * holding OR'ed-together parser options. A nil or omitted options argument
 * is treated as 0.
 *
 * Raises TypeError when io is nil. When the context or the input stream
 * cannot be created, the already-allocated libxml2 objects are released and
 * the last libxml2 error is handed to rxml_raise().
 */
static VALUE rxml_html_parser_context_io(int argc, VALUE* argv, VALUE klass)
{
  VALUE io, options;
  rb_scan_args(argc, argv, "11", &io, &options);

  VALUE result;
  htmlParserCtxtPtr ctxt;
  xmlParserInputBufferPtr input;
  xmlParserInputPtr stream;
  int xml_options;

  if (NIL_P(io))
    rb_raise(rb_eTypeError, "Must pass in an IO object");

  /* Convert the options BEFORE allocating anything: NUM2INT can raise a Ruby
     exception, which would otherwise leak the input buffer and the context. */
  xml_options = NIL_P(options) ? 0 : NUM2INT(options);

  input = xmlParserInputBufferCreateIO((xmlInputReadCallback) rxml_read_callback, NULL,
                                       (void*)io, XML_CHAR_ENCODING_NONE);

  ctxt = htmlNewParserCtxt();
  if (!ctxt)
  {
    xmlFreeParserInputBuffer(input);
    rxml_raise(xmlGetLastError());
  }

  /* This is annoying, but xmlInitParserCtxt (called indirectly above) and
     xmlCtxtUseOptionsInternal (called below) initialize slightly different
     context options, in particular XML_PARSE_NODICT which xmlInitParserCtxt
     sets to 0 and xmlCtxtUseOptionsInternal sets to 1. So we have to call both. */
  htmlCtxtUseOptions(ctxt, xml_options);

  stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
  if (!stream)
  {
    xmlFreeParserInputBuffer(input);
    xmlFreeParserCtxt(ctxt);
    rxml_raise(xmlGetLastError());
  }
  inputPush(ctxt, stream);
  result = rxml_html_parser_context_wrap(ctxt);

  /* Attach io object to parser so it won't get freed.*/
  rb_ivar_set(result, IO_ATTR, io);

  return result;
}
XML::HTMLParser::Context.string(string) ⇒ Context
Creates a new parser context based on the specified string.
Parameters:
string - A string that contains the data to parse
options - An optional OR'ed-together list of LibXML::XML::HTMLParser::Options values
# File 'ext/libxml/ruby_xml_html_parser_context.c', line 236
/*
 * Creates an HTML parser context that parses the contents of a Ruby string.
 *
 * Arguments (via argc/argv): a required, non-empty String and an optional
 * integer holding OR'ed-together parser options. A nil or omitted options
 * argument is treated as 0.
 *
 * Raises TypeError when the first argument is not a String and ArgumentError
 * when it is empty. When libxml2 cannot create the context, the last libxml2
 * error is handed to rxml_raise().
 */
static VALUE rxml_html_parser_context_string(int argc, VALUE* argv, VALUE klass)
{
  VALUE string, options;
  rb_scan_args(argc, argv, "11", &string, &options);

  Check_Type(string, T_STRING);

  if (RSTRING_LEN(string) == 0)
    rb_raise(rb_eArgError, "Must specify a string with one or more characters");

  /* Convert the options BEFORE the context exists: NUM2INT can raise a Ruby
     exception, which would otherwise leak the freshly created context. */
  int xml_options = NIL_P(options) ? 0 : NUM2INT(options);

  htmlParserCtxtPtr ctxt = xmlCreateMemoryParserCtxt(StringValuePtr(string),
                                                     (int)RSTRING_LEN(string));
  if (!ctxt)
    rxml_raise(xmlGetLastError());

  /* This is annoying, but xmlInitParserCtxt (called indirectly above) and
     xmlCtxtUseOptionsInternal (called below) initialize slightly different
     context options, in particular XML_PARSE_NODICT which xmlInitParserCtxt
     sets to 0 and xmlCtxtUseOptionsInternal sets to 1. So we have to call both. */
  htmlCtxtUseOptions(ctxt, xml_options);

  // Setup sax handler
  // TODO - there must be a better way? The sax handler is initialized for XML, but we want
  // to use HTML
  memset(ctxt->sax, 0, sizeof(xmlSAXHandler));
  xmlSAX2InitHtmlDefaultSAXHandler(ctxt->sax);

  return rxml_html_parser_context_wrap(ctxt);
}
Instance Attribute Details
#disable_cdata=(true|false) (writeonly)
Control whether the CDATA nodes will be created in this context.
# File 'ext/libxml/ruby_xml_html_parser_context.c', line 293
/*
 * Enables or disables creation of CDATA nodes for this context by clearing
 * or restoring the cdataBlock callback on the context's SAX handler.
 *
 * value - any Ruby object; it is interpreted with Ruby truthiness, so both
 *         false and nil leave CDATA handling enabled.
 *
 * Raises RuntimeError when the context has no SAX handler.
 * Returns the value that was passed in.
 */
static VALUE rxml_html_parser_context_disable_cdata_set(VALUE self, VALUE value)
{
  htmlParserCtxtPtr ctxt;
  Data_Get_Struct(self, htmlParserCtxt, ctxt);

  if (ctxt->sax == NULL)
    rb_raise(rb_eRuntimeError, "Sax handler is not yet set");

  /* LibXML controls this internally with the default SAX handler.
     RTEST is required here: a bare `if (value)` only treats Qfalse as false,
     so nil would wrongly count as true and disable CDATA nodes. */
  if (RTEST(value))
    ctxt->sax->cdataBlock = NULL;
  else
    ctxt->sax->cdataBlock = xmlSAX2CDataBlock;

  return value;
}
#options=(XML::Parser::Options::NOENT | XML::Parser::Options::NOCDATA) (writeonly)
Provides control over the execution of a parser. Valid values are the constants defined on ::LibXML::XML::Parser::Options. Multiple options can be combined by using Bitwise OR (|).
# File 'ext/libxml/ruby_xml_html_parser_context.c', line 319
/*
 * Applies a set of parser options to this context.
 *
 * options - an integer of OR'ed-together option flags; it is converted with
 *           NUM2INT and then required to be a Fixnum.
 *
 * Returns self.
 */
static VALUE rxml_html_parser_context_options_set(VALUE self, VALUE options)
{
  htmlParserCtxtPtr parser;
  /* The integer conversion deliberately stays ahead of the Fixnum check. */
  const int requested = NUM2INT(options);

  Check_Type(options, T_FIXNUM);
  Data_Get_Struct(self, htmlParserCtxt, parser);

  htmlCtxtUseOptions(parser, requested);

#if LIBXML_VERSION >= 20707
  /* Big hack here, but htmlCtxtUseOptions doesn't support HTML_PARSE_NOIMPLIED.
     So carry the flag over ourselves when it was requested; OR-ing in zero
     leaves the options untouched otherwise. There must be a better way??? */
  parser->options |= (requested & HTML_PARSE_NOIMPLIED);
#endif

  return self;
}
Instance Method Details
#close ⇒ nil
Closes the underlying input streams. This is useful when parsing a large number of files and you want to close the files without relying on Ruby’s garbage collector to run.
# File 'ext/libxml/ruby_xml_html_parser_context.c', line 274
/*
 * Pops every input stream off this parser context and frees each one in
 * turn, stopping once inputPop() reports that none are left.
 *
 * Always returns nil.
 */
static VALUE rxml_html_parser_context_close(VALUE self)
{
  htmlParserCtxtPtr parser;
  xmlParserInputPtr popped;

  Data_Get_Struct(self, htmlParserCtxt, parser);

  for (popped = inputPop(parser); popped != NULL; popped = inputPop(parser))
  {
    xmlFreeInputStream(popped);
  }

  return Qnil;
}