Class: CSV::Parser
Relationships & Source Files | |
Namespace Children | |
Classes:
| |
Exceptions:
| |
Inherits: | Object |
Defined in: | lib/csv/parser.rb |
Constant Summary
-
# File 'lib/csv/parser.rb', line 662
# True only when the CSV_PARSER_SCANNER_TEST environment variable is
# exactly "yes".
SCANNER_TEST = (ENV["CSV_PARSER_SCANNER_TEST"] == "yes")
Class Method Summary
- .new(input, options) ⇒ Parser constructor
Instance Attribute Summary
- #header_row? ⇒ Boolean readonly
- #liberal_parsing? ⇒ Boolean readonly
- #return_headers? ⇒ Boolean readonly
- #skip_blanks? ⇒ Boolean readonly
- #unconverted_fields? ⇒ Boolean readonly
- #use_headers? ⇒ Boolean readonly
- #may_quoted? ⇒ Boolean readonly private
Instance Method Summary
- #column_separator
- #field_size_limit
- #headers
- #line
- #lineno
- #parse(&block)
- #quote_character
- #row_separator
- #skip_lines
-
#add_unconverted_fields(row, fields)
private
This method injects an instance variable
unconverted_fields
into row
and an accessor method for row
called unconverted_fields().
- #adjust_headers(headers) private
-
#build_scanner
private
See additional method definition at line 682.
- #detect_row_separator(sample, cr, lf) private
- #emit_row(row) {|row| ... } private
- #ignore_broken_line private
- #last_line private
- #parse_column_end private
- #parse_column_value private
- #parse_headers(row) private
- #parse_no_quote(&block) private
- #parse_quotable_loose(&block) private
- #parse_quotable_robust(&block) private
- #parse_quoted_column_value private
- #parse_row_end private
- #parse_unquoted_column_value private
- #prepare private
- #prepare_backslash private
- #prepare_header private
- #prepare_line private
- #prepare_parser private
- #prepare_quote_character private
- #prepare_quoted private
- #prepare_separators private
- #prepare_skip_lines private
- #prepare_strip private
- #prepare_unquoted private
- #prepare_variable private
- #resolve_row_separator(separator) private
- #skip_line?(line) ⇒ Boolean private
- #skip_needless_lines private
- #start_row private
- #strip_value(value) private
Constructor Details
.new(input, options) ⇒ Parser
# File 'lib/csv/parser.rb', line 225
# Builds a parser.
#
# input   - the object to read from; stored in @input.
# options - the option container; stored in @options and read by the
#           prepare_* methods through @options[...].
#
# Starts with an empty @samples list and then runs #prepare.
def initialize(input, options)
  @input = input
  @options = options
  @samples = []
  prepare
end
Instance Attribute Details
#header_row? ⇒ Boolean
(readonly)
[ GitHub ]
# File 'lib/csv/parser.rb', line 261
# Truthy while headers are in use (@use_headers) but @headers has not
# been set yet.
def header_row?
  @use_headers && @headers.nil?
end
#liberal_parsing? ⇒ Boolean
(readonly)
[ GitHub ]
# File 'lib/csv/parser.rb', line 273
# Returns the @liberal_parsing flag.
def liberal_parsing?
  @liberal_parsing
end
#may_quoted? ⇒ Boolean
(readonly, private)
[ GitHub ]
# File 'lib/csv/parser.rb', line 650
# Reports whether the beginning of the input contains the quote character.
#
# The sample inspected is the whole string of a StringIO input, or
# otherwise the first entry of @samples. Only the first 128 characters
# of that sample are searched.
#
# Returns true or false. It is false when there is no quote character
# or, for non-StringIO input, when no sample has been collected.
def may_quoted?
  return false if @quote_character.nil?

  if @input.is_a?(StringIO)
    sample = @input.string
  else
    return false if @samples.empty?
    sample = @samples.first
  end
  # include? yields a real Boolean, where index would return an Integer or nil.
  sample[0, 128].include?(@quote_character)
end
#return_headers? ⇒ Boolean
(readonly)
[ GitHub ]
# File 'lib/csv/parser.rb', line 265
# Returns the @return_headers setting.
def return_headers?
  @return_headers
end
#skip_blanks? ⇒ Boolean
(readonly)
[ GitHub ]
# File 'lib/csv/parser.rb', line 269
# Returns the @skip_blanks setting.
def skip_blanks?
  @skip_blanks
end
#unconverted_fields? ⇒ Boolean
(readonly)
[ GitHub ]
# File 'lib/csv/parser.rb', line 253
# Returns the @unconverted_fields setting.
def unconverted_fields?
  @unconverted_fields
end
#use_headers? ⇒ Boolean
(readonly)
[ GitHub ]
# File 'lib/csv/parser.rb', line 317
# Returns the @use_headers flag.
def use_headers?
  @use_headers
end
Instance Method Details
#add_unconverted_fields(row, fields) (private)
This method injects an instance variable unconverted_fields
into row
and an accessor method for row
called unconverted_fields(). The variable is set to the contents of fields.
# File 'lib/csv/parser.rb', line 1084
# Gives +row+ a reader named unconverted_fields, defined on the row's
# singleton class only, and stores +fields+ in the matching instance
# variable.
#
# Returns the same +row+ object.
def add_unconverted_fields(row, fields)
  row.singleton_class.class_eval do
    attr_reader :unconverted_fields
  end
  row.instance_variable_set(:@unconverted_fields, fields)
  row
end
#adjust_headers(headers) (private)
[ GitHub ]#build_scanner (private)
See additional method definition at line 682.
# File 'lib/csv/parser.rb', line 697
# Builds an InputsScanner over every collected sample followed by the
# input itself. Samples, and the string of a StringIO input, are each
# wrapped in UnoptimizedStringIO; any other input is passed as is.
#
# The chunk size is taken from CSV_PARSER_SCANNER_TEST_CHUNK_SIZE
# (parsed as a base-10 Integer) and defaults to "1".
#
# NOTE(review): the page says another definition of this method exists
# at line 682 of lib/csv/parser.rb; it is not visible here.
def build_scanner
  sources = @samples.map { |sample| UnoptimizedStringIO.new(sample) }
  sources <<
    if @input.is_a?(StringIO)
      UnoptimizedStringIO.new(@input.string)
    else
      @input
    end
  raw_chunk_size = ENV["CSV_PARSER_SCANNER_TEST_CHUNK_SIZE"] || "1"
  InputsScanner.new(sources,
                    @encoding,
                    chunk_size: Integer(raw_chunk_size, 10))
end
#column_separator
[ GitHub ]# File 'lib/csv/parser.rb', line 233
# Returns @column_separator.
def column_separator
  @column_separator
end
#detect_row_separator(sample, cr, lf) (private)
[ GitHub ]# File 'lib/csv/parser.rb', line 570
# Guesses the row separator used in +sample+.
#
# sample - the String to inspect.
# cr     - the carriage-return string to look for.
# lf     - the line-feed string to look for.
#
# +cr+ is only searched in the part of +sample+ before the first +lf+,
# so a found +cr+ always precedes a found +lf+.
#
# Returns cr + lf when the first +cr+ is immediately followed by the
# first +lf+, +cr+ when a +cr+ comes first otherwise, +lf+ when only a
# +lf+ is relevant, and :auto when neither is present.
def detect_row_separator(sample, cr, lf)
  lf_index = sample.index(lf)
  searched = lf_index ? sample[0, lf_index] : sample
  cr_index = searched.index(cr)

  return(lf_index ? lf : :auto) unless cr_index
  return cr unless lf_index

  cr_index + 1 == lf_index ? cr + lf : cr
end
#emit_row(row) {|row| ... } (private)
# File 'lib/csv/parser.rb', line 1055
# Counts one more line, turns the raw +row+ into its final form and
# yields it.
#
# * Without headers: the row is passed through @fields_converter.
# * With headers not yet known: the row becomes the headers (via
#   adjust_headers); it is yielded as a header Row only when
#   @return_headers is set, otherwise nothing is yielded.
# * With headers known: a Row of the converted fields is yielded.
#
# When @unconverted_fields is set and the result has no
# unconverted_fields reader yet, the raw row is attached through
# add_unconverted_fields.
def emit_row(row, &block)
  @lineno += 1
  raw_fields = row

  emitted =
    if !@use_headers
      # convert fields, if needed...
      @fields_converter.convert(raw_fields, nil, @lineno)
    elsif @headers.nil?
      @headers = adjust_headers(raw_fields)
      return unless @return_headers
      Row.new(@headers, raw_fields, true)
    else
      Row.new(@headers,
              @fields_converter.convert(raw_fields, @headers, @lineno))
    end

  # inject unconverted fields and accessor, if requested...
  wants_raw = @unconverted_fields && !emitted.respond_to?(:unconverted_fields)
  add_unconverted_fields(emitted, raw_fields) if wants_raw

  yield(emitted)
end
#field_size_limit
[ GitHub ]# File 'lib/csv/parser.rb', line 245
# Returns @field_size_limit.
def field_size_limit
  @field_size_limit
end
#headers
[ GitHub ]# File 'lib/csv/parser.rb', line 257
# Returns @headers (nil until headers are known).
def headers
  @headers
end
#ignore_broken_line (private)
[ GitHub ]# File 'lib/csv/parser.rb', line 1040
# Consumes the rest of the current line from the scanner: first
# everything matching @not_line_end, then what matches @cr_or_lf.
# Finally counts the line in @lineno.
def ignore_broken_line
  [@not_line_end, @cr_or_lf].each do |pattern|
    @scanner.scan_all(pattern)
  end
  @lineno += 1
end
#last_line (private)
[ GitHub ]# File 'lib/csv/parser.rb', line 600
# Returns the most recent line. Without a scanner this is just
# @last_line; with one, @last_line is filled from @scanner.keep_end
# when it is not already set.
def last_line
  return @last_line unless @scanner

  @last_line ||= @scanner.keep_end
end
#line
[ GitHub ]# File 'lib/csv/parser.rb', line 281
# Public reader for the most recent line; delegates to #last_line.
def line
  last_line
end
#lineno
[ GitHub ]# File 'lib/csv/parser.rb', line 277
# Returns @lineno, the line counter maintained while parsing.
def lineno
  @lineno
end
#parse(&block)
[ GitHub ]# File 'lib/csv/parser.rb', line 285
# Parses the input and yields each row to the block.
#
# Without a block, returns an Enumerator for this method.
#
# When @return_headers is set and headers were supplied up front
# (@headers and @raw_headers are both set), a header Row is yielded
# first.
#
# The scanner is built on first use. The strategy is then chosen:
# parse_no_quote when there is no quote character, parse_quotable_robust
# when @need_robust_parsing is set, and parse_quotable_loose otherwise.
#
# Raises MalformedCSVError when InvalidEncoding is raised while parsing.
def parse(&block)
  return to_enum(__method__) unless block_given?

  if @return_headers and @headers and @raw_headers
    headers = Row.new(@headers, @raw_headers, true)
    if @unconverted_fields
      headers = add_unconverted_fields(headers, [])
    end
    yield headers
  end

  begin
    @scanner ||= build_scanner
    if quote_character.nil?
      parse_no_quote(&block)
    elsif @need_robust_parsing
      parse_quotable_robust(&block)
    else
      parse_quotable_loose(&block)
    end
  rescue InvalidEncoding
    if @scanner
      # Skip the rest of the offending line; this also bumps @lineno.
      ignore_broken_line
      lineno = @lineno
    else
      lineno = @lineno + 1
    end
    message = "Invalid byte sequence in #{@encoding}"
    raise MalformedCSVError.new(message, lineno)
  end
end
#parse_column_end (private)
[ GitHub ]# File 'lib/csv/parser.rb', line 992
# Tries to consume a column separator at the scanner position.
#
# First tries the whole separator (@column_end). If that fails and
# per-character patterns exist (@column_ends), each one is tried in
# turn. The scanner position is kept on success and rolled back on
# failure.
#
# Returns true when a separator was consumed, false otherwise.
def parse_column_end
  return true if @scanner.scan(@column_end)
  return false unless @column_ends

  @scanner.keep_start
  consumed = @column_ends.all? { |part| @scanner.scan(part) }
  if consumed
    @scanner.keep_drop
  else
    @scanner.keep_back
  end
  consumed
end
#parse_column_value (private)
[ GitHub ]# File 'lib/csv/parser.rb', line 894
# Reads one column value, choosing between the quoted and unquoted
# readers.
#
# Outside liberal parsing, both readers are tried; @may_quoted decides
# which one goes first.
#
# In liberal parsing a quoted value may be followed by unquoted text.
# In that case the quotes are put back around the quoted part and the
# unquoted part is appended. With @double_quote_outside_quote, doubled
# quote characters in the unquoted part are collapsed first, and an
# empty quoted part yields a single quote character plus that text
# (the %Q{""...} case).
def parse_column_value
  unless @liberal_parsing
    if @may_quoted
      return parse_quoted_column_value || parse_unquoted_column_value
    end
    return parse_unquoted_column_value || parse_quoted_column_value
  end

  quoted = parse_quoted_column_value
  return parse_unquoted_column_value unless quoted

  trailing = parse_unquoted_column_value
  return quoted unless trailing

  if @double_quote_outside_quote
    trailing = trailing.gsub(@quote_character * 2, @quote_character)
    # %Q{""...} case
    return @quote_character + trailing if quoted.empty?
  end
  @quote_character + quoted + @quote_character + trailing
end
#parse_headers(row) (private)
[ GitHub ]# File 'lib/csv/parser.rb', line 633
# Parses a header line given as a String by handing it to
# CSV.parse_line with this parser's column separator, row separator
# and quote character.
def parse_headers(row)
  line_options = {
    col_sep: @column_separator,
    row_sep: @row_separator,
    quote_char: @quote_character,
  }
  CSV.parse_line(row, **line_options)
end
#parse_no_quote(&block) (private)
[ GitHub ]# File 'lib/csv/parser.rb', line 753
# Line-based parser used when there is no quote character.
#
# Each scanner line is skipped when @skip_lines is set and skip_line?
# accepts it. Otherwise the row separator suffix is removed. An empty
# line is skipped under @skip_blanks, or else emitted as an empty row.
# A non-empty line is passed through strip_value, split on
# @split_column_separator (keeping trailing empties) and has its empty
# fields replaced by nil.
#
# @last_line is set to the untouched line before the row is emitted.
def parse_no_quote(&block)
  @scanner.each_line(@row_separator) do |raw_line|
    next if @skip_lines && skip_line?(raw_line)

    content = raw_line.delete_suffix(@row_separator)
    if content.empty?
      next if @skip_blanks
      fields = []
    else
      fields = strip_value(content).split(@split_column_separator, -1)
      fields.map! { |field| field.empty? ? nil : field }
    end
    @last_line = raw_line
    emit_row(fields, &block)
  end
end
#parse_quotable_loose(&block) (private)
[ GitHub ]# File 'lib/csv/parser.rb', line 777
# Fast line-based parser for input that may contain simple quoting.
#
# Each line is split on @split_column_separator. A field is accepted
# only if it has no quote character, or exactly two of them: one at
# the start and one at the end, which are then removed. Empty fields
# become nil.
#
# Two things hand the rest of the work to parse_quotable_robust: a
# line that still contains @cr or @lf after the row separator is
# removed, and any other quoting pattern. In both cases the scanner is
# rolled back, @need_robust_parsing is set, and the robust parser's
# result is returned.
#
# The scanner's keep_drop/keep_start pair is reissued after every
# handled or skipped line.
def parse_quotable_loose(&block)
  @scanner.keep_start
  @scanner.each_line(@row_separator) do |raw_line|
    if @skip_lines && skip_line?(raw_line)
      @scanner.keep_drop
      @scanner.keep_start
      next
    end
    content = raw_line.delete_suffix(@row_separator)

    if content.empty?
      if @skip_blanks
        @scanner.keep_drop
        @scanner.keep_start
        next
      end
      fields = []
    elsif content.include?(@cr) || content.include?(@lf)
      @scanner.keep_back
      @need_robust_parsing = true
      return parse_quotable_robust(&block)
    else
      fields = content.split(@split_column_separator, -1)
      fields.each_with_index do |field, index|
        if field.empty?
          fields[index] = nil
          next
        end

        quote_count = field.count(@quote_character)
        next if quote_count.zero? # no quote

        simply_quoted = quote_count == 2 &&
                        field.start_with?(@quote_character) &&
                        field.end_with?(@quote_character)
        unless simply_quoted
          @scanner.keep_back
          @need_robust_parsing = true
          return parse_quotable_robust(&block)
        end
        fields[index] = field[1..-2]
      end
    end
    @scanner.keep_drop
    @scanner.keep_start
    @last_line = raw_line
    emit_row(fields, &block)
  end
  @scanner.keep_drop
end
#parse_quotable_robust(&block) (private)
[ GitHub ]# File 'lib/csv/parser.rb', line 832
# Scanner-driven parser that reads one column value at a time.
#
# For every value it resets the quoted/unquoted markers, optionally
# consumes strip characters around the value, and enforces
# @field_size_limit. It then looks at what follows:
#
# * a column end: the value is appended to the current row;
# * a row end: the row is emitted. An entirely empty row is emitted as
#   [] unless @skip_blanks is set. A new row is then started;
# * end of input: any pending row is emitted and the loop ends;
# * anything else: the broken line is skipped and a MalformedCSVError
#   describing the problem is raised.
#
# Raises MalformedCSVError for an oversized field or malformed input.
def parse_quotable_robust(&block)
  row = []
  skip_needless_lines
  start_row
  while true
    @quoted_column_value = false
    @unquoted_column_value = false
    @scanner.scan_all(@strip_value) if @strip_value
    value = parse_column_value
    if value
      @scanner.scan_all(@strip_value) if @strip_value
      if @field_size_limit and value.size >= @field_size_limit
        ignore_broken_line
        raise MalformedCSVError.new("Field size exceeded", @lineno)
      end
    end
    if parse_column_end
      row << value
    elsif parse_row_end
      if row.empty? and value.nil?
        emit_row([], &block) unless @skip_blanks
      else
        row << value
        emit_row(row, &block)
        row = []
      end
      skip_needless_lines
      start_row
    elsif @scanner.eos?
      break if row.empty? and value.nil?
      row << value
      emit_row(row, &block)
      break
    else
      # Neither a separator nor the end of input follows: report why.
      if @quoted_column_value
        ignore_broken_line
        message = "Any value after quoted field isn't allowed"
        raise MalformedCSVError.new(message, @lineno)
      elsif @unquoted_column_value and
            (new_line = @scanner.scan(@cr_or_lf))
        ignore_broken_line
        message = "Unquoted fields do not allow new line " +
                  "<#{new_line.inspect}>"
        raise MalformedCSVError.new(message, @lineno)
      elsif @scanner.rest.start_with?(@quote_character)
        ignore_broken_line
        message = "Illegal quoting"
        raise MalformedCSVError.new(message, @lineno)
      elsif (new_line = @scanner.scan(@cr_or_lf))
        ignore_broken_line
        message = "New line must be <#{@row_separator.inspect}> " +
                  "not <#{new_line.inspect}>"
        raise MalformedCSVError.new(message, @lineno)
      else
        ignore_broken_line
        raise MalformedCSVError.new("TODO: Meaningful message",
                                    @lineno)
      end
    end
  end
end
#parse_quoted_column_value (private)
[ GitHub ]# File 'lib/csv/parser.rb', line 948
# Reads a quoted column value at the scanner position.
#
# Returns nil when no quote character is found there. Otherwise marks
# @quoted_column_value and interprets the run of quote characters:
#
# * an even run is a complete value made only of escaped quotes, so
#   (n - 2) / 2 quote characters are returned;
# * an odd run opens a quoted value; the extra pairs are literal
#   quotes at its start.
#
# Inside an open value, text matching @quoted_value is appended. With
# @backslash_quote, a backslash followed by a quote appends the quote
# character and a lone backslash appends the backslash. Later quote
# runs either close the value (odd length) or add literal quotes (even
# length).
#
# Raises MalformedCSVError when the value is never closed.
def parse_quoted_column_value
  quotes = @scanner.scan_all(@quotes)
  return nil unless quotes

  @quoted_column_value = true
  n_quotes = quotes.size
  if (n_quotes % 2).zero?
    quotes[0, (n_quotes - 2) / 2]
  else
    value = quotes[0, (n_quotes - 1) / 2]
    while true
      quoted_value = @scanner.scan_all(@quoted_value)
      value << quoted_value if quoted_value
      if @backslash_quote
        if @scanner.scan(@escaped_backslash)
          if @scanner.scan(@escaped_quote)
            value << @quote_character
          else
            value << @backslash_character
          end
          next
        end
      end

      quotes = @scanner.scan_all(@quotes)
      unless quotes
        ignore_broken_line
        message = "Unclosed quoted field"
        raise MalformedCSVError.new(message, @lineno)
      end
      n_quotes = quotes.size
      if n_quotes == 1
        break
      elsif (n_quotes % 2) == 1
        value << quotes[0, (n_quotes - 1) / 2]
        break
      else
        value << quotes[0, n_quotes / 2]
      end
    end
    value
  end
end
#parse_row_end (private)
[ GitHub ]# File 'lib/csv/parser.rb', line 1006
# Tries to consume a row separator at the scanner position.
#
# First tries the whole separator (@row_end). If that fails and
# per-character patterns exist (@row_ends), each one is tried in turn.
# The scanner position is kept on success and rolled back on failure.
#
# Returns true when a separator was consumed, false otherwise.
def parse_row_end
  return true if @scanner.scan(@row_end)
  return false unless @row_ends

  @scanner.keep_start
  consumed = @row_ends.all? { |part| @scanner.scan(part) }
  if consumed
    @scanner.keep_drop
  else
    @scanner.keep_back
  end
  consumed
end
#parse_unquoted_column_value (private)
[ GitHub ]# File 'lib/csv/parser.rb', line 923
# Reads an unquoted column value at the scanner position.
#
# Returns nil when nothing matches @unquoted_value. Otherwise marks
# @unquoted_column_value.
#
# When @first_column_separators is set, the value is extended past
# characters that match it as long as they do not form a complete
# column end. The check for a complete column end always rolls the
# scanner back, so it never consumes input.
#
# With @backslash_quote, every @backslash_quote_character in the value
# is replaced by the quote character.
def parse_unquoted_column_value
  value = @scanner.scan_all(@unquoted_value)
  return nil unless value

  @unquoted_column_value = true
  if @first_column_separators
    while true
      # Peek only: look for a full column end, then rewind.
      @scanner.keep_start
      at_column_end = @column_ends.all? { |part| @scanner.scan(part) }
      @scanner.keep_back
      break if at_column_end

      separator_run = @scanner.scan_all(@first_column_separators)
      break if separator_run.nil?
      value << separator_run

      continuation = @scanner.scan_all(@unquoted_value)
      break if continuation.nil?
      value << continuation
    end
  end
  if @backslash_quote
    value.gsub!(@backslash_quote_character, @quote_character)
  end
  value
end
#prepare (private)
[ GitHub ]# File 'lib/csv/parser.rb', line 322
# Runs every prepare_* step once, in a fixed order.
def prepare
  # Plain option values and the quote/backslash characters.
  prepare_variable
  prepare_quote_character
  prepare_backslash

  # Line skipping, stripping and separators.
  prepare_skip_lines
  prepare_strip
  prepare_separators

  # Patterns for quoted and unquoted values.
  prepare_quoted
  prepare_unquoted

  # Per-run state, headers, and the final parser hint.
  prepare_line
  prepare_header
  prepare_parser
end
#prepare_backslash (private)
[ GitHub ]# File 'lib/csv/parser.rb', line 379
# Sets up backslash handling; does nothing unless @backslash_quote is
# set.
#
# Stores the backslash in the parser's encoding, its Regexp-escaped
# form and a Regexp for it. @backslash_quote_character is the
# backslash followed by the escaped quote character, or nil when there
# is no quote character.
def prepare_backslash
  return unless @backslash_quote

  @backslash_character = "\\".encode(@encoding)
  @escaped_backslash_character = Regexp.escape(@backslash_character)
  @escaped_backslash = Regexp.new(@escaped_backslash_character)

  @backslash_quote_character =
    @quote_character.nil? ? nil : @backslash_character + @escaped_quote_character
end
#prepare_header (private)
[ GitHub ]# File 'lib/csv/parser.rb', line 608
# Reads the :return_headers and :headers options.
#
# :headers may be
# * an Array: used as the raw headers;
# * a String: parsed into raw headers with parse_headers;
# * nil or false: headers are not used;
# * anything else: headers are used but are not known yet.
#
# @headers is the result of adjust_headers on the raw headers, or nil
# when there are none.
def prepare_header
  @return_headers = @options[:return_headers]

  spec = @options[:headers]
  @raw_headers, @use_headers =
    case spec
    when Array      then [spec, true]
    when String     then [parse_headers(spec), true]
    when nil, false then [nil, false]
    else                 [nil, true]
    end

  @headers = @raw_headers ? adjust_headers(@raw_headers) : nil
end
#prepare_line (private)
[ GitHub ]# File 'lib/csv/parser.rb', line 594
# Resets per-run state: the line counter goes back to 0, and both the
# remembered last line and the scanner are cleared.
def prepare_line
  @lineno = 0
  @last_line = @scanner = nil
end
#prepare_parser (private)
[ GitHub ]# File 'lib/csv/parser.rb', line 646
# Caches the result of may_quoted? in @may_quoted.
def prepare_parser
  @may_quoted = may_quoted?
end
#prepare_quote_character (private)
[ GitHub ]# File 'lib/csv/parser.rb', line 362
# Reads the :quote_character option.
#
# With nil, the escaped quote character and its Regexp are cleared.
# Otherwise the value is converted to a String in the parser's
# encoding and must be exactly one character long. The doubled quote,
# the Regexp-escaped quote and a Regexp for it are then stored.
#
# Raises ArgumentError when the quote character is not one character.
def prepare_quote_character
  @quote_character = @options[:quote_character]
  if @quote_character.nil?
    @escaped_quote_character = nil
    @escaped_quote = nil
  else
    @quote_character = @quote_character.to_s.encode(@encoding)
    if @quote_character.length != 1
      message = ":quote_char has to be nil or a single character String"
      raise ArgumentError, message
    end
    @double_quote_character = @quote_character * 2
    @escaped_quote_character = Regexp.escape(@quote_character)
    @escaped_quote = Regexp.new(@escaped_quote_character)
  end
end
#prepare_quoted (private)
[ GitHub ]# File 'lib/csv/parser.rb', line 489
# Builds the patterns used for quoted values and for splitting lines.
#
# With a quote character:
# * @quotes matches one or more quote characters;
# * @quoted_value matches one or more characters that are neither the
#   quote character nor, with @backslash_quote, the backslash.
#
# @split_column_separator is
# * a Regexp of the column separator surrounded by optional strip
#   text, when @escaped_strip is set;
# * a Regexp of the escaped separator, when the separator is a single
#   space;
# * the plain separator String otherwise.
def prepare_quoted
  if @quote_character
    @quotes = Regexp.new(@escaped_quote_character + "+".encode(@encoding))

    excluded = @escaped_quote_character.dup
    excluded << @escaped_backslash_character if @backslash_quote
    @quoted_value = Regexp.new("[^".encode(@encoding) +
                               excluded +
                               "]+".encode(@encoding))
  end

  @split_column_separator =
    if @escaped_strip
      Regexp.new(@escaped_strip +
                 "*".encode(@encoding) +
                 @escaped_column_separator +
                 @escaped_strip +
                 "*".encode(@encoding))
    elsif @column_separator == " ".encode(@encoding)
      Regexp.new(@escaped_column_separator)
    else
      @column_separator
    end
end
#prepare_separators (private)
[ GitHub ]# File 'lib/csv/parser.rb', line 449
# Reads the column and row separators and derives the patterns built
# from them.
#
# The column separator is the :column_separator option as a String in
# the parser's encoding. The row separator is resolved through
# resolve_row_separator.
#
# For a multi-character column separator, @column_end is a Regexp for
# the whole separator, @column_ends holds one Regexp per character and
# @first_column_separators matches runs of its first character. For a
# single character, @column_end is either the plain String or a
# Regexp, depending on @@string_scanner_scan_accept_string, and the
# other two are nil.
#
# @row_end and @row_ends are built the same way for the row separator.
# The CR and LF helpers (@cr, @lf, @cr_or_lf, @not_line_end) are set
# as well.
def prepare_separators
  @column_separator = @options[:column_separator].to_s.encode(@encoding)
  @row_separator =
    resolve_row_separator(@options[:row_separator]).encode(@encoding)

  per_character = lambda do |separator|
    separator.each_char.map { |char| Regexp.new(Regexp.escape(char)) }
  end

  @escaped_column_separator = Regexp.escape(@column_separator)
  @escaped_first_column_separator = Regexp.escape(@column_separator[0])
  if @column_separator.size > 1
    @column_end = Regexp.new(@escaped_column_separator)
    @column_ends = per_character.call(@column_separator)
    @first_column_separators = Regexp.new(@escaped_first_column_separator +
                                          "+".encode(@encoding))
  else
    @column_end =
      if @@string_scanner_scan_accept_string
        @column_separator
      else
        Regexp.new(@escaped_column_separator)
      end
    @column_ends = nil
    @first_column_separators = nil
  end

  @row_end = Regexp.new(Regexp.escape(@row_separator))
  @row_ends = @row_separator.size > 1 ? per_character.call(@row_separator) : nil

  @cr = "\r".encode(@encoding)
  @lf = "\n".encode(@encoding)
  @cr_or_lf = Regexp.new("[\r\n]".encode(@encoding))
  @not_line_end = Regexp.new("[^\r\n]+".encode(@encoding))
end
#prepare_skip_lines (private)
[ GitHub ]# File 'lib/csv/parser.rb', line 394
# Reads the :skip_lines option into @skip_lines.
#
# * A String is converted to the parser's encoding.
# * A Regexp or nil is stored as is.
# * Any other object is stored as is, provided it responds to #match.
#
# Raises ArgumentError for an object that does not respond to #match.
def prepare_skip_lines
  skip_lines = @options[:skip_lines]
  case skip_lines
  when String
    @skip_lines = skip_lines.encode(@encoding)
  when Regexp, nil
    @skip_lines = skip_lines
  else
    unless skip_lines.respond_to?(:match)
      message =
        ":skip_lines has to respond to \#match: #{skip_lines.inspect}"
      raise ArgumentError, message
    end
    @skip_lines = skip_lines
  end
end
#prepare_strip (private)
[ GitHub ]# File 'lib/csv/parser.rb', line 411
# Reads the :strip option.
#
# * A String must be exactly one character. It is converted to the
#   parser's encoding and Regexp-escaped into @escaped_strip.
# * Any other truthy value strips " \t\r\n\f\v"; @escaped_strip then
#   holds those characters.
# * A falsy value leaves stripping off.
#
# @strip_value (a Regexp for runs of strip characters) is only built
# when there is a quote character. Enabling stripping also sets
# @need_robust_parsing.
#
# Raises ArgumentError for an empty or multi-character String.
def prepare_strip
  @strip = @options[:strip]
  @escaped_strip = nil
  @strip_value = nil
  return unless @strip

  if @strip.is_a?(String)
    if @strip.length.zero?
      raise ArgumentError, ":strip must not be an empty String"
    end
    unless @strip.length == 1
      raise ArgumentError, ":strip doesn't support 2 or more characters yet"
    end

    @strip = @strip.encode(@encoding)
    @escaped_strip = Regexp.escape(@strip)
    if @quote_character
      @strip_value = Regexp.new(@escaped_strip + "+".encode(@encoding))
    end
  else
    strip_values = " \t\r\n\f\v"
    @escaped_strip = strip_values.encode(@encoding)
    if @quote_character
      @strip_value = Regexp.new("[#{strip_values}]+".encode(@encoding))
    end
  end
  @need_robust_parsing = true
end
#prepare_unquoted (private)
[ GitHub ]# File 'lib/csv/parser.rb', line 516
# Builds @unquoted_value, a Regexp for a run of characters allowed in
# an unquoted value. Does nothing without a quote character.
#
# Excluded characters are CR, LF and the first character of the column
# separator. The quote character is also excluded unless liberal
# parsing is on, and so is the strip text when stripping is on.
def prepare_unquoted
  return if @quote_character.nil?

  excluded = "\r\n".encode(@encoding)
  excluded << @escaped_first_column_separator
  excluded << @escaped_quote_character unless @liberal_parsing
  excluded << @escaped_strip if @escaped_strip

  pattern = "[^".encode(@encoding) + excluded + "]+".encode(@encoding)
  @unquoted_value = Regexp.new(pattern)
end
#prepare_variable (private)
[ GitHub ]# File 'lib/csv/parser.rb', line 336
# Copies plain option values into instance variables.
#
# :liberal_parsing may be a Hash whose :double_quote_outside_quote and
# :backslash_quote entries are used. Any other truthy value turns both
# of those off. Any truthy value also turns on @need_robust_parsing.
# A falsy value turns liberal parsing and @backslash_quote off.
def prepare_variable
  @encoding = @options[:encoding]

  liberal = @options[:liberal_parsing]
  @liberal_parsing = liberal ? true : false
  @need_robust_parsing = @liberal_parsing
  if liberal.is_a?(Hash)
    @double_quote_outside_quote = liberal[:double_quote_outside_quote]
    @backslash_quote = liberal[:backslash_quote]
  elsif liberal
    @double_quote_outside_quote = false
    @backslash_quote = false
  else
    @backslash_quote = false
  end

  @unconverted_fields = @options[:unconverted_fields]
  @field_size_limit = @options[:field_size_limit]
  @skip_blanks = @options[:skip_blanks]
  @fields_converter = @options[:fields_converter]
  @header_fields_converter = @options[:header_fields_converter]
end
#quote_character
[ GitHub ]# File 'lib/csv/parser.rb', line 241
# Returns @quote_character (nil when quoting is disabled).
def quote_character
  @quote_character
end
#resolve_row_separator(separator) (private)
[ GitHub ]# File 'lib/csv/parser.rb', line 532
# Turns the :row_separator option into a concrete String in the
# parser's encoding.
#
# Anything other than :auto is converted with to_s and encoded. For
# :auto the separator is guessed with detect_row_separator:
#
# * from the whole string of a StringIO input;
# * or, for an input that responds to #gets, from chunks read one at a
#   time (32 KiB for a File, 1 KiB otherwise) until a separator is
#   found or the input runs out. Every chunk read is kept in @samples.
#
# If nothing could be detected, $INPUT_RECORD_SEPARATOR is used.
def resolve_row_separator(separator)
  return separator.to_s.encode(@encoding) unless separator == :auto

  cr = "\r".encode(@encoding)
  lf = "\n".encode(@encoding)
  if @input.is_a?(StringIO)
    separator = detect_row_separator(@input.string, cr, lf)
  elsif @input.respond_to?(:gets)
    chunk_size = @input.is_a?(File) ? 32 * 1024 : 1024
    begin
      while separator == :auto
        #
        # if we run out of data, it's probably a single line
        # (the default is applied after this loop)
        #
        sample = @input.gets(nil, chunk_size)
        break unless sample

        # extend sample if we're unsure of the line ending
        sample << (@input.gets(nil, 1) || "") if sample.end_with?(cr)

        @samples << sample
        separator = detect_row_separator(sample, cr, lf)
      end
    rescue IOError
      # do nothing: the default is applied below
    end
  end
  separator = $INPUT_RECORD_SEPARATOR if separator == :auto

  separator.to_s.encode(@encoding)
end
#row_separator
[ GitHub ]# File 'lib/csv/parser.rb', line 237
# Returns @row_separator.
def row_separator
  @row_separator
end
#skip_line?(line) ⇒ Boolean
(private)
#skip_lines
[ GitHub ]# File 'lib/csv/parser.rb', line 249
# Returns @skip_lines (nil when no lines are skipped).
def skip_lines
  @skip_lines
end
#skip_needless_lines (private)
[ GitHub ]# File 'lib/csv/parser.rb', line 725
# Consumes leading lines that skip_line? accepts; does nothing unless
# @skip_lines is set.
#
# Each candidate line is read up to the line end and gets the row
# separator appended when a row end follows. A skipped line is counted
# in @lineno and dropped from the scanner's keep buffer. The first
# line that is not skipped is rolled back and ends the loop.
def skip_needless_lines
  return unless @skip_lines

  while true
    @scanner.keep_start
    candidate = @scanner.scan_all(@not_line_end) || "".encode(@encoding)
    candidate << @row_separator if parse_row_end
    unless skip_line?(candidate)
      @scanner.keep_back
      return
    end
    @lineno += 1
    @scanner.keep_drop
  end
end
#start_row (private)
[ GitHub ]# File 'lib/csv/parser.rb', line 1046
# Begins a new row in the scanner's keep buffer.
#
# When a last line is remembered it is cleared. Otherwise the
# scanner's current keep is dropped. A fresh keep is then started
# either way.
def start_row
  remembered = @last_line
  @last_line = nil if remembered
  @scanner.keep_drop unless remembered
  @scanner.keep_start
end
#strip_value(value) (private)
[ GitHub ]# File 'lib/csv/parser.rb', line 1019
# Removes strip characters from both ends of +value+.
#
# Returns +value+ untouched when stripping is off, and nil for nil.
# With a String @strip, leading and trailing occurrences are removed
# one character at a time and a new String is returned. With any other
# truthy @strip, +value+ is stripped of whitespace in place with
# String#strip! and returned.
def strip_value(value)
  return value unless @strip
  return nil if value.nil?

  unless @strip.is_a?(String)
    value.strip!
    return value
  end

  value = value[1..-1] while value.start_with?(@strip)
  value = value[0...-1] while value.end_with?(@strip)
  value
end