-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhtml2csv
executable file
·115 lines (83 loc) · 2.3 KB
/
html2csv
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#!/usr/bin/perl
# Output HTML tables in CSV
# Copyright (C) 2014 Bruno BEAUFILS <[email protected]>
#
# This software comes with ABSOLUTELY NO WARRANTY.
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation in its version 2.
# See the README or COPYING file for details.
use strict;
use warnings;
use Getopt::Long;
use Pod::Usage;
use File::Temp qw/tempfile/;
use HTML::TableExtract;
use open qw/:std :utf8/; # Ensure UTF-8 support
# Documentation
=pod
=encoding UTF-8
=head1 NAME
html2csv - Export HTML tables into CSV
=head1 SYNOPSIS
=over
=item html2csv [OPTIONS...] [FILE...]
=item html2csv -h
=back
=head1 OPTIONS
=over
=item B<-s> I<STRING>, B<--separator> I<STRING>
Use I<STRING> instead of comma as field separator.
=item B<-n>, B<--no-protection>
Do not quote data in each field.
=item B<-q> I<CHAR>, B<--quote> I<CHAR>
Use I<CHAR> instead of double-quote for data quotation.
=item B<-h>, B<--help>
Print short help message.
=item B<--man>
Print full documentation.
=back
=head1 DESCRIPTION
Print data found in HTML tables read from standard input (or from the
specified files) as CSV (comma-separated values). By default, each field is
double-quoted and fields are separated by a comma.
=cut
# Output format settings, initialised to their defaults; the command line
# options below may override them.
my ($separator, $quote, $protect) = (",", '"', 1);

# Option specifications (Getopt::Long syntax) paired with the variable they
# fill in or the handler they trigger.
my @option_spec = (
    'separator|s=s'   => \$separator,
    'quote|q=s'       => \$quote,
    'no-protection|n' => sub { $protect = 0; },
    'man'             => sub { pod2usage(-verbose=>2, -noperldoc=>1); },
    'help|h'          => sub { pod2usage(-verbose=>1, -noperldoc=>1); },
);

# When the command line cannot be parsed, hand over to pod2usage with an
# error message.
GetOptions(@option_spec) or pod2usage("Syntax error!\n");
# HTML::TableExtract object construction
my $te = HTML::TableExtract->new();

# Read all of the input (standard input, or every file named on the command
# line) and hand it to the parser as ONE string. In list context with $/
# undefined, <> yields one element per input file and nothing at all for
# empty input, so the pieces are joined to always pass exactly one chunk.
# The undefined $/ is scoped to the read so it does not leak into the rest
# of the script.
my $html = do { local $/; join '', <> };
$te->parse($html);
# Print every extracted table, one output line per table row.
#
# Reads $te (the parsed extractor) and the $protect, $quote and $separator
# settings. A cell that is undefined or the empty string is printed as an
# empty field. Cells are tested with defined/length rather than plain
# truthiness, so a cell containing "0" is kept instead of being dropped.
foreach my $ts ($te->tables) {
    foreach my $row ($ts->rows) {
        my $line = '';
        foreach my $cell (@{$row}) {
            if (defined $cell && length $cell) {
                # Work on a copy so the extractor's own data stays untouched.
                my $field = $cell;
                if ($protect) {
                    # Double any embedded quote, then wrap the field. \Q..\E
                    # makes the quote match literally even when it is a regex
                    # metacharacter; an empty quote has nothing to escape.
                    $field =~ s/\Q$quote\E/$quote$quote/g if length $quote;
                    $field = "$quote$field$quote";
                }
                $line .= $field;
            }
            # NOTE(review): the separator is emitted after every cell,
            # including the last one. This trailing separator is kept so the
            # output format stays unchanged for existing consumers.
            $line .= $separator;
        }
        print "$line\n";
    }
}