-
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcomparison_benchmark.rb
executable file
·190 lines (163 loc) · 5.48 KB
/
comparison_benchmark.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
#!/usr/bin/env ruby
# frozen_string_literal: true
require "benchmark/ips"
require "csv"
require "osv"
require "fastcsv"
require "stringio"
require "zlib"
require "fileutils"
# Turn on YJIT so every parser is measured with the JIT active. The runtime
# switch is only present on Rubies that ship RubyVM::YJIT.enable; on any other
# interpreter fall back to running without the JIT instead of crashing at load.
RubyVM::YJIT.enable if defined?(RubyVM::YJIT) && RubyVM::YJIT.respond_to?(:enable)
# Write +contents+ to +path+ as a gzip stream.
#
# The compressed fixture carries exactly the same bytes as the plain CSV, so
# the gzipped benchmarks parse the same data as the uncompressed ones.
def write_gzip_fixture(path, contents)
  Zlib::GzipWriter.open(path) { |gz| gz.write(contents) }
end

# Generate a larger test file for more meaningful benchmarks.
#
# Builds benchmark/test.csv and benchmark/test.csv.gz, or reuses
# benchmark/test.csv when it already exists. The data deliberately contains
# quoted fields with embedded commas, double quotes and newlines.
#
# @param rows [Integer] number of data rows to generate. Ignored when
#   benchmark/test.csv already exists, because the cached file is reused as-is.
# @return [Array(StringIO, Integer)] a StringIO over the CSV text and the sum
#   of the "age" column, which the benchmarks use as their expected checksum.
def generate_test_data(rows = 1_000_000)
  csv_path = "benchmark/test.csv"
  gzip_path = "#{csv_path}.gz"

  if File.exist?(csv_path)
    # Read the cached file once and derive the checksum from that same string.
    contents = File.read(csv_path)
    age_total = 0
    CSV.parse(contents, headers: true) { |row| age_total += row["age"].to_i }
    # The gzip fixture may be missing even though the CSV is cached; rebuild it
    # so the gzipped benchmarks do not fail on a missing file.
    write_gzip_fixture(gzip_path, contents) unless File.exist?(gzip_path)
    return StringIO.new(contents), age_total
  end

  # A fresh checkout has no benchmark/ directory yet.
  FileUtils.mkdir_p(File.dirname(csv_path))

  age = 0
  headers = %w[
    id
    name
    age
    email
    city
    country
    salary
    department
    hire_date
    manager_id
    performance_score
    project_count
    active
    notes
    last_login
    description
    skills
    address
  ]

  CSV.open(csv_path, "w", write_headers: true, headers: headers) do |csv|
    rows.times do |i|
      row_age = rand(18..80)
      age += row_age
      csv << [
        i,
        "Person#{i}",
        row_age,
        "person#{i}@example.com",
        "City#{i}",
        "Country#{i}",
        rand(30_000..200_000),
        %w[Engineering Sales Marketing HR Finance].sample,
        "2020-#{rand(1..12)}-#{rand(1..28)}",
        rand(1..1000),
        rand(1..5).to_f,
        rand(1..10),
        [true, false].sample,
        "",
        "",
        # Large quoted text with commas and quotes
        "A very long description of person #{i}'s background, including multiple, comma-separated clauses. The person has \"special\" skills and experience in various fields.",
        # Array-like quoted text with commas
        "Ruby,Python,JavaScript,\"DevOps\",\"Cloud Architecture\"",
        # Address with embedded newlines and quotes
        "123 Main St.\nApt \"B\"\nSuite 100"
      ]
    end
  end

  file_string = File.read(csv_path)
  # Compress the bytes just written instead of re-parsing and re-serialising
  # every row through CSV.
  write_gzip_fixture(gzip_path, file_string)
  [StringIO.new(file_string), age]
end
# Fixture files used by the benchmarks; only referenced by the (currently
# disabled) cleanup step at the bottom of this script.
TEST_FILES = %w[benchmark/test.csv benchmark/test.csv.gz].freeze

begin
  # Build (or reuse) the fixtures. `age` is the expected sum of the age column
  # and serves as a checksum proving each parser really read every row.
  test_data, age = generate_test_data

  # Count newlines in the already-loaded text: the same figure `wc -l` would
  # print, without spawning a subprocess or depending on a Unix tool.
  line_count = test_data.string.count("\n")
  puts "Benchmarking with #{line_count} lines of data\n\n"

  # Shared sanity check run at the end of every report.
  verify = ->(count) { raise "Age mismatch: #{age} != #{count}" if age != count }

  Benchmark.ips do |x|
    x.config(time: 30, warmup: 5)

    # --- In-memory input -----------------------------------------------------
    # In array mode the header row is summed too; "age".to_i is 0, so the
    # checksum is unaffected.
    x.report("CSV - StringIO") do
      count = 0
      io = StringIO.new(test_data.string)
      CSV.new(io).each { |row| count += row[2].to_i }
      io.close
      verify.call(count)
    end

    x.report("FastCSV - StringIO") do
      count = 0
      io = StringIO.new(test_data.string)
      FastCSV.raw_parse(io) { |row| count += row[2].to_i }
      verify.call(count)
    end

    x.report("OSV - StringIO") do
      count = 0
      io = StringIO.new(test_data.string)
      OSV.for_each(io, result_type: :array) { |row| count += row[2].to_i }
      verify.call(count)
    end

    # --- File input, rows looked up by header name ---------------------------
    x.report("CSV - Hash output") do
      count = 0
      File.open("benchmark/test.csv") { |f| CSV.new(f, headers: true).each { |row| count += row["age"].to_i } }
      verify.call(count)
    end

    x.report("OSV - Hash output") do
      count = 0
      File.open("benchmark/test.csv") { |f| OSV.for_each(f) { |row| count += row["age"].to_i } }
      verify.call(count)
    end

    # --- File input, rows as positional arrays -------------------------------
    x.report("CSV - Array output") do
      count = 0
      File.open("benchmark/test.csv") { |f| CSV.new(f).each { |row| count += row[2].to_i } }
      verify.call(count)
    end

    x.report("OSV - Array output") do
      count = 0
      File.open("benchmark/test.csv") { |f| OSV.for_each(f, result_type: :array) { |row| count += row[2].to_i } }
      verify.call(count)
    end

    x.report("FastCSV - Array output") do
      count = 0
      File.open("benchmark/test.csv") { |f| FastCSV.raw_parse(f) { |row| count += row[2].to_i } }
      verify.call(count)
    end

    # Passes the path itself to OSV instead of an IO opened in Ruby.
    x.report("OSV - Direct Open Array output") do
      count = 0
      OSV.for_each("benchmark/test.csv", result_type: :array) { |row| count += row[2].to_i }
      verify.call(count)
    end

    # --- Gzipped input -------------------------------------------------------
    x.report("OSV - Gzipped") do
      count = 0
      Zlib::GzipReader.open("benchmark/test.csv.gz") do |gz|
        OSV.for_each(gz, result_type: :array) { |row| count += row[2].to_i }
      end
      verify.call(count)
    end

    # Passes the .gz path straight to OSV rather than a Zlib::GzipReader.
    x.report("OSV - Gzipped Direct") do
      count = 0
      OSV.for_each("benchmark/test.csv.gz", result_type: :array) { |row| count += row[2].to_i }
      verify.call(count)
    end

    x.report("FastCSV - Gzipped") do
      count = 0
      Zlib::GzipReader.open("benchmark/test.csv.gz") { |gz| FastCSV.raw_parse(gz) { |row| count += row[2].to_i } }
      verify.call(count)
    end

    # NOTE(review): unlike the other gzipped reports, this one parses with
    # headers: true and looks the column up by name, so it measures
    # header-keyed rows rather than plain arrays.
    x.report("CSV - Gzipped") do
      count = 0
      Zlib::GzipReader.open("benchmark/test.csv.gz") do |gz|
        CSV.new(gz, headers: true).each { |row| count += row["age"].to_i }
      end
      verify.call(count)
    end

    x.compare!
  end
ensure
  # Cleanup is currently disabled, so the fixtures persist between runs and
  # generate_test_data reuses the cached CSV. Uncomment the line below to
  # remove the test files even if the script fails or is interrupted.
  # FileUtils.rm_f(TEST_FILES)
end