Skip to content

Commit

Permalink
colon separator support added
Browse files Browse the repository at this point in the history
  • Loading branch information
Guido Scicolone authored and jkeen committed Oct 27, 2022
1 parent 0515fe6 commit 35f1757
Show file tree
Hide file tree
Showing 22 changed files with 683 additions and 55 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@
.DS_Store
.byebug_history
test-file.txt
.idea/
9 changes: 9 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,9 @@ You can use this in a ruby program by installing the `comma_splice` gem, o

```ruby
CommaSplice::FileCorrector.new(file_path).bad_lines.size

# You can specify another separator
CommaSplice::FileCorrector.new(file_path, separator: ';').bad_lines.size
```
```
comma_splice bad_line_count /path/to/file.csv
Expand All @@ -95,6 +98,9 @@ You can use this in a ruby program by installing the `comma_splice` gem, o
##### Display the fixed contents
```ruby
CommaSplice::FileCorrector.new(file_path).corrected

# You can specify another separator
CommaSplice::FileCorrector.new(file_path, separator: ';').corrected
```
```bash
comma_splice correct /path/to/file.csv
Expand All @@ -103,6 +109,9 @@ You can use this in a ruby program by installing the `comma_splice` gem, o
##### Process a file and save the fixed version
```ruby
CommaSplice::FileCorrector.new(file_path).save(save_path)

# You can specify another separator
CommaSplice::FileCorrector.new(file_path, separator: ';').save(save_path)
```
```bash
comma_splice fix /path/to/file.csv /path/to/save
Expand Down
13 changes: 9 additions & 4 deletions bin/comma_splice
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ class CommaSpliceCLI < Thor
class_option :start_line, type: :numeric, default: nil
class_option :end_line, type: :numeric, default: nil
class_option :debug, type: :boolean, default: false
class_option :separator, type: :string, default: ','

desc 'version', 'print the current comma_splice version'
def version
Expand All @@ -21,7 +22,8 @@ class CommaSpliceCLI < Thor
file_corrector = CommaSplice::FileCorrector.new(
file_path,
start_line: options[:start_line],
end_line: options[:end_line]
end_line: options[:end_line],
separator: options[:separator]
)

puts file_corrector.corrected
Expand All @@ -34,7 +36,8 @@ class CommaSpliceCLI < Thor
file_corrector = CommaSplice::FileCorrector.new(
file_path,
start_line: options[:start_line],
end_line: options[:end_line]
end_line: options[:end_line],
separator: options[:separator]
)

file_corrector.save(fix_path)
Expand All @@ -47,7 +50,8 @@ class CommaSpliceCLI < Thor
file_corrector = CommaSplice::FileCorrector.new(
file_path,
start_line: options[:start_line],
end_line: options[:end_line]
end_line: options[:end_line],
separator: options[:separator]
)

puts file_corrector.bad_lines
Expand All @@ -60,7 +64,8 @@ class CommaSpliceCLI < Thor
file_corrector = CommaSplice::FileCorrector.new(
file_path,
start_line: options[:start_line],
end_line: options[:end_line]
end_line: options[:end_line],
separator: options[:separator]
)

puts file_corrector.bad_lines.size
Expand Down
19 changes: 11 additions & 8 deletions lib/comma_splice/file_corrector.rb
Original file line number Diff line number Diff line change
@@ -1,21 +1,21 @@
module CommaSplice
class FileCorrector
attr_reader :file_contents, :csv_content, :start_line, :end_line, :start_column, :end_column
attr_reader :file_contents, :csv_content, :start_line, :end_line, :start_column, :end_column, :separator

def initialize(file_path, start_line: nil, end_line:nil, start_column: nil, end_column: nil)
def initialize(file_path, start_line: nil, end_line: nil, start_column: nil, end_column: nil, separator: ',')
@file_path = file_path
@file_contents = File.read(file_path, encoding: 'utf-8')

@content_finder = ContentFinder.new(@file_contents, start_line, end_line)
@content_finder = ContentFinder.new(@file_contents, start_line, end_line, separator)
@csv_content = @content_finder.content
@start_line = @content_finder.start_line
@end_line = @content_finder.end_line

@separator = @content_finder.separator
if start_column && end_column
@start_column = start_column
@end_column = end_column
else
finder = VariableColumnFinder.new(@csv_content[0], @csv_content[1..-1])
finder = VariableColumnFinder.new(@csv_content[0], @csv_content[1..-1], @separator)
@start_column = finder.start_column
@end_column = finder.end_column
end
Expand All @@ -24,7 +24,7 @@ def initialize(file_path, start_line: nil, end_line:nil, start_column: nil, end_
end

def header
@header ||= Line.new(csv_content.first)
@header ||= Line.new(csv_content.first, separator)
end

def bad_lines
Expand Down Expand Up @@ -60,20 +60,23 @@ def save(path)
end
end

def to_json
def to_json(*_args)
@content_finder.parsed.try(:to_json)
end

private

def line_correctors
@line_correctors ||= csv_content.collect do |line|
LineCorrector.new(header, Line.new(line), @start_column, @end_column)
LineCorrector.new(header, Line.new(line, separator), @start_column, @end_column, separator)
end
end

def corrected_lines
line_correctors.collect do |line|
# if line.original.include?('http://www.affaritaliani.it/economia/tim-fulvio-conti-resta-smentita-ipotesi-di-passaggio-a-open-fiber-597756.html')
# puts line
# end
if line.needs_correcting?
line.corrected
else
Expand Down
5 changes: 5 additions & 0 deletions lib/comma_splice/helpers/comma_calculator.rb
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ class CommaCalculator
def initialize(headers, values)
raise StandardError, "Determining all the possibilities to fit #{values.size} values into the #{headers.size} headers #{headers.inspect} is computationally expensive. Please specify the columns where commas might be." if headers.size > 10 && values.size > 10

@separator = separator
@headers = headers
@values = values
@longest_header = @headers.max_by(&:length)
Expand Down Expand Up @@ -66,6 +67,10 @@ def print_all_options
ranked_options.each_with_index do |option, index|
print_option(option, index)
end
private

def quoted_values(values)
"\"#{values.join(@separator).gsub(/(?<!")(?:"{2})*\K\"/, '""')}\"" # escape a double quote if it hasn't been escaped already
end

protected
Expand Down
12 changes: 6 additions & 6 deletions lib/comma_splice/helpers/content_finder.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@ module CommaSplice
# Given a file this will find the CSV content. Some files have some non-csv junk at the top

class ContentFinder
attr_reader :start_line, :end_line, :content
attr_reader :start_line, :end_line, :content, :separator

def initialize(file_contents, start_line = nil, end_line = nil)
def initialize(file_contents, start_line = nil, end_line = nil, separator = ',')
@file_contents = file_contents

@separator = separator
if start_line && end_line
# the csvs this was built for have non-csv headers
@start_line = start_line
Expand All @@ -21,11 +21,11 @@ def initialize(file_contents, start_line = nil, end_line = nil)

def find_content
@start_line = @file_contents.lines.find_index do |line|
Line.new(line).values.size > 2
Line.new(line, separator).values.size > 2
end

relative_end_line = @file_contents.lines[@start_line..-1].find_index do |line|
Line.new(line).values.size < 2
Line.new(line, separator).values.size < 2
end

@end_line = if relative_end_line
Expand All @@ -40,7 +40,7 @@ def find_content
def parsed
quote_chars = %w[" | ~ ^ & *]
begin
CSV.parse(@content.join("\n"), quote_char: quote_chars.shift, headers: :first_row, liberal_parsing: true)
CSV.parse(@content.join('\n'), col_sep: separator, quote_char: quote_chars.shift, headers: :first_row, liberal_parsing: true)
rescue CSV::MalformedCSVError
quote_chars.empty? ? raise : retry
end
Expand Down
7 changes: 4 additions & 3 deletions lib/comma_splice/helpers/line.rb
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
module CommaSplice
class Line
attr_reader :values, :line
attr_reader :values, :line, :separator

def initialize(line)
def initialize(line, separator = ',')
@line = line
@separator = separator
@values = parse_csv_content(line).first
end

Expand All @@ -12,7 +13,7 @@ def initialize(line)
def parse_csv_content(content, headers = false)
quote_chars = %w[" | ~ ^ & *]
begin
CSV.parse(content.mb_chars.tidy_bytes.to_s, quote_char: quote_chars.shift, headers: headers, liberal_parsing: true)
CSV.parse(content.mb_chars.tidy_bytes.to_s, col_sep: separator, quote_char: quote_chars.shift, headers: headers, liberal_parsing: true)
rescue CSV::MalformedCSVError
quote_chars.empty? ? raise : retry
end
Expand Down
14 changes: 7 additions & 7 deletions lib/comma_splice/helpers/variable_column_finder.rb
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,12 @@ module CommaSplice
# 17385094,,,01-27-2019 @ 13:47:00,KIng Tubby Meets The Upsetter,King And The Upsetter At Spanish Town,KIng Tubby Meets The Upsetter,Celluloid,post,live,y,

class VariableColumnFinder
attr_reader :start_column, :end_column
attr_reader :start_column, :end_column, :separator

def initialize(header_line, value_lines)
def initialize(header_line, value_lines, separator = ',')
@values = value_lines
@header = header_line

@separator = separator
find_variable_column_boundaries
end

Expand All @@ -44,9 +44,9 @@ def find_variable_column_boundaries

def left_to_right_index
left_to_right_index = []
@header.split(',').size.times do |time|
@header.split(separator).size.times do |time|
left_to_right_index.push(@values.map do |value_line|
value_line.split(',')[time].to_s.size
value_line.split(separator)[time].to_s.size
end.uniq.size == 1)
end

Expand All @@ -55,9 +55,9 @@ def left_to_right_index

def right_to_left_index
right_to_left_index = []
@header.split(',').size.times do |time|
@header.split(separator).size.times do |time|
right_to_left_index.unshift(@values.map do |value_line|
value_line.split(',')[-time].to_s.size
value_line.split(separator)[-time].to_s.size
end.uniq.size == 1)
end

Expand Down
13 changes: 7 additions & 6 deletions lib/comma_splice/line_corrector.rb
Original file line number Diff line number Diff line change
@@ -1,17 +1,18 @@
module CommaSplice
class LineCorrector
attr_reader :headers, :values, :header_line, :value_line, :right_bounds, :left_bounds
attr_reader :headers, :values, :header_line, :value_line, :right_bounds, :left_bounds, :separator

def initialize(header_line, value_line, left_bounds = 0, right_bounds = -1)
header_line = Line.new(header_line) unless header_line.is_a?(Line)
value_line = Line.new(value_line) unless value_line.is_a?(Line)
def initialize(header_line, value_line, left_bounds = 0, right_bounds = -1, separator = ',')
header_line = Line.new(header_line, separator) unless header_line.is_a?(Line)
value_line = Line.new(value_line, separator) unless value_line.is_a?(Line)

@header_line = header_line
@value_line = value_line
@headers = header_line.values
@values = value_line.values
@left_bounds = left_bounds
@right_bounds = right_bounds
@separator = separator

raise 'right bounds must be negative' unless right_bounds.negative?
raise 'left bounds must be not be negative' if left_bounds.negative?
Expand Down Expand Up @@ -59,11 +60,11 @@ def print_all_options
protected

def corrector
CommaCalculator.new(selected_headers, selected_values)
CommaCalculator.new(selected_headers, selected_values, separator)
end

def generate_csv_line(values)
CSV.generate_line(values)
CSV.generate_line(values, col_sep: @separator)
end

def selected_headers
Expand Down
Loading

0 comments on commit 35f1757

Please sign in to comment.