-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtable-harvester.js
249 lines (215 loc) · 8.43 KB
/
table-harvester.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
#!/usr/bin/env node
// --------------------------------------------------------------- //
// table-harvester.js
// Description: A script for extracting table data from HTML files
// and converting it into CSV format.
// --------------------------------------------------------------- //
// --------------------------------------------------------------- //
// Module Imports
// --------------------------------------------------------------- //
const fs = require('fs');
const path = require('path');
const cheerio = require('cheerio');
const yargs = require('yargs');
const { Parser } = require('json2csv');
const jschardet = require('jschardet');
const iconv = require('iconv-lite');
// --------------------------------------------------------------- //
// Command-line Argument Configuration
// --------------------------------------------------------------- //
// Command-line interface definition, built with yargs.
// --input and --output are mandatory; every other option falls back to the
// default declared below when it is omitted.

// The usage banner is assembled one line per array entry so the ASCII art is
// readable in source. The backslash in the art must be written as `\\`:
// a lone `\ ` inside a string literal is a no-op escape that evaluates to a
// plain space, which silently removed the backslash from the printed banner.
const usageBanner = [
  '# table-harvester.js',
  '# Description: A script for extracting table data from HTML files',
  '# and converting it into CSV format.',
  '# ____',
  '#\\ i | o|',
  '#|>#######',
  '#/(_______)',
  '#',
  'Usage: $0 -i [input] -o [output] -a [attributes] -l [log]',
].join('\n');

const argv = yargs
  .usage(usageBanner)
  // Source of the HTML: a single file or a directory of files.
  .option('input', {
    alias: 'i',
    describe: 'Input HTML file or directory',
    type: 'string',
    demandOption: true,
  })
  // Directory that receives the generated CSV files.
  .option('output', {
    alias: 'o',
    describe: 'Output directory',
    type: 'string',
    demandOption: true,
  })
  // Attribute names read from each cell and from each extracted element.
  .option('attributes', {
    alias: 'a',
    describe: 'Attributes to extract',
    type: 'array',
    default: ['alt', 'value', 'href', 'title'],
  })
  // Tag names searched for inside each cell.
  .option('elements', {
    alias: 'e',
    describe: 'HTML elements to extract from cells',
    type: 'array',
    default: ['a', 'input', 'span'],
  })
  // Selectors used to find a heading in front of a table.
  .option('headerSelectors', {
    alias: 'hs',
    describe: 'Selectors to identify header elements preceding tables',
    type: 'array',
    default: ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', '.header'],
  })
  // Only the heading text before this separator is used as the table name.
  .option('tableNameSeparator', {
    alias: 'ts',
    describe: 'Separator for extracting table name from header text',
    type: 'string',
    default: ':',
  })
  .option('log', {
    alias: 'l',
    describe: 'Log file path',
    type: 'string',
  })
  .help()
  .argv;
// --------------------------------------------------------------- //
// Utility Functions
// --------------------------------------------------------------- //
/**
 * Checks if a given path is a directory.
 *
 * Uses `fs.statSync`, which follows symbolic links, so a symlink that points
 * at a directory is reported as a directory. (`fs.lstatSync` describes the
 * link itself and would report such a path as "not a directory".)
 *
 * @param {string} source - Path to check.
 * @returns {boolean} True if the path resolves to a directory, false otherwise.
 * @throws {Error} If the path cannot be stat'ed (for example, it does not exist).
 */
const isDirectory = source => fs.statSync(source).isDirectory();
/**
 * Reads an HTML file and decodes its bytes into a JavaScript string.
 *
 * The character encoding is guessed from the raw bytes with jschardet. When
 * no encoding is reported (e.g. an empty or ambiguous file), or the reported
 * encoding is one iconv-lite cannot decode, UTF-8 is used instead so that a
 * single odd file does not abort the whole run with an
 * "Encoding not recognized" error.
 *
 * @param {string} filePath - The path to the HTML file.
 * @returns {string} The decoded content of the file.
 * @throws {Error} If the file cannot be read.
 */
function readHtmlFile(filePath) {
  const fileBuffer = fs.readFileSync(filePath);
  const { encoding: detectedEncoding } = jschardet.detect(fileBuffer);
  // Only trust the guess when there is one and iconv-lite knows how to decode it.
  const encoding =
    detectedEncoding && iconv.encodingExists(detectedEncoding)
      ? detectedEncoding
      : 'utf-8';
  return iconv.decode(fileBuffer, encoding);
}
/**
 * Resolves the input argument to the list of files to process.
 *
 * A directory yields the paths of its direct entries whose names end in
 * `.html` (no recursion into subdirectories); any other path is returned
 * unchanged as the only element of the list.
 *
 * @param {string} source - Directory path or file path.
 * @returns {Array<string>} Array of file paths.
 */
const getHtmlFiles = source => {
  // Anything that is not a directory is taken to be the single file to read.
  if (!isDirectory(source)) {
    return [source];
  }

  const htmlPaths = [];
  for (const entryName of fs.readdirSync(source)) {
    if (entryName.endsWith('.html')) {
      htmlPaths.push(path.join(source, entryName));
    }
  }
  return htmlPaths;
};
/**
 * Turns arbitrary text into a snake_case column name.
 *
 * The text is lower-cased and then reduced to its runs of `a-z` / `0-9`
 * characters, which are joined with single underscores. Every other
 * character acts as a separator, so the result never starts or ends with
 * an underscore and never contains two underscores in a row.
 *
 * @param {string} name - The string to be normalized.
 * @returns {string} The normalized column name in snake_case (may be empty).
 */
function normalizeColumnName(name) {
  const lowered = name.toLowerCase();
  // Splitting on runs of separators leaves empty strings at the edges when
  // the text starts or ends with a separator; those are discarded.
  const words = lowered.split(/[^a-z0-9]+/).filter(word => word !== '');
  return words.join('_');
}
// --------------------------------------------------------------- //
// Main Script Execution
// --------------------------------------------------------------- //
/**
 * Main function to execute the script.
 * Loads every input HTML file, walks each <table> in it and writes one CSV
 * file per table that yields at least one data row.
 *
 * For each table:
 *  - script, style and noscript elements inside it are removed first;
 *  - a preceding sibling matching --headerSelectors (if any) supplies an
 *    optional table name, taken from its text before --tableNameSeparator;
 *  - rows are skipped until the first row containing <th> cells, which
 *    defines the column names; the rows after it are treated as data rows;
 *  - every <td> contributes its text, any requested attributes that are
 *    present, and the text/attributes of any requested descendant elements.
 *
 * Progress messages go to the console and, when --log is given, are also
 * appended to that file. The output directory is created if it is missing.
 *
 * @returns {Promise<void>} Resolves once every file has been processed;
 *   rejects if reading, parsing or writing throws.
 */
async function main() {
  const htmlFiles = getHtmlFiles(argv.input);

  // The selector list is the same for every table, so build it once.
  const headerSelector = argv.headerSelectors.join(', ');

  // writeFileSync does not create missing directories; without this a fresh
  // output path fails with ENOENT on the first table.
  fs.mkdirSync(argv.output, { recursive: true });

  // Prints a progress message and mirrors it to the --log file when one was
  // requested on the command line.
  const report = message => {
    console.log(message);
    if (argv.log) {
      fs.appendFileSync(argv.log, `${message}\n`);
    }
  };

  htmlFiles.forEach(file => {
    const htmlContent = readHtmlFile(file);
    const $ = cheerio.load(htmlContent);

    $('table').each((index, table) => {
      // Remove unwanted tags inside the table
      $(table).find('script, style, noscript').remove();

      // Find preceding header element or an element with a header class
      let tableName = '';
      const prevHeader = $(table).prevAll(headerSelector).first();
      if (prevHeader.length) {
        const headerText = prevHeader.text().split(argv.tableNameSeparator)[0];
        tableName = normalizeColumnName(headerText);
      }

      const headers = [];
      const usedHeaderNames = new Set();
      const tableData = [];
      // Every column name seen in any row, in first-seen order; rows may
      // differ in which attribute/element columns they populate.
      const allHeaders = new Set();
      let headerExtracted = false;

      $('tr', table).each((rowIndex, row) => {
        // Skip rows until headers are found
        if (!headerExtracted) {
          const headerCells = $('th', row);
          if (headerCells.length > 0) {
            headerCells.each((headerIndex, header) => {
              const baseName = normalizeColumnName($(header).text().trim());
              // Two headers that normalize to the same name would make the
              // later column overwrite the earlier one in every row, so
              // repeated names get a numeric suffix. Empty names are left
              // alone: they fall back to `Column<index>` below.
              let uniqueName = baseName;
              if (baseName !== '') {
                let suffix = 2;
                while (usedHeaderNames.has(uniqueName)) {
                  uniqueName = `${baseName}_${suffix}`;
                  suffix += 1;
                }
                usedHeaderNames.add(uniqueName);
              }
              headers.push(uniqueName);
            });
            headerExtracted = true;
          }
          return; // The header row itself carries no data.
        }

        // Process data rows
        const rowData = {};
        $('td', row).each((colIndex, cell) => {
          const $cell = $(cell);
          const baseColumnName = headers[colIndex] || `Column${colIndex}`;

          // Extract text content from the cell
          const contentColumnName = `${baseColumnName}_content`;
          rowData[contentColumnName] = $cell.text().trim();
          allHeaders.add(contentColumnName);

          // Extract specified attributes from the cell
          argv.attributes.forEach(attribute => {
            const attrValue = $cell.attr(attribute);
            if (attrValue) {
              const attrColumnName = `${baseColumnName}_${attribute}`;
              rowData[attrColumnName] = attrValue;
              allHeaders.add(attrColumnName);
            }
          });

          // Extract specified elements and their text/attributes
          argv.elements.forEach(element => {
            $cell.find(element).each((elIndex, el) => {
              const $el = $(el);
              const elColumnName = `${baseColumnName}_${element}${elIndex}_content`;
              rowData[elColumnName] = $el.text().trim();
              allHeaders.add(elColumnName);

              argv.attributes.forEach(attribute => {
                const elAttrValue = $el.attr(attribute);
                if (elAttrValue) {
                  const elAttrColumnName = `${baseColumnName}_${element}${elIndex}_${attribute}`;
                  rowData[elAttrColumnName] = elAttrValue;
                  allHeaders.add(elAttrColumnName);
                }
              });
            });
          });
        });

        // Rows without any <td> produce no record.
        if (Object.keys(rowData).length > 0) {
          tableData.push(rowData);
        }
      });

      if (tableData.length === 0) {
        report(`No data found in Table ${index} from ${file}`);
        return;
      }

      // Convert Set to Array for headers
      const parser = new Parser({ fields: Array.from(allHeaders) });
      const csv = parser.parse(tableData);

      // Define output file path
      const baseOutputFileName = path.basename(file, '.html');
      const outputFileName = tableName
        ? `${baseOutputFileName}.table_${index}.${tableName}.csv`
        : `${baseOutputFileName}.table_${index}.csv`;
      const outputFilePath = path.join(argv.output, outputFileName);

      // Write CSV to file
      fs.writeFileSync(outputFilePath, csv);
      report(`Table ${index} from ${file} saved to ${outputFilePath}`);
    });
  });
}
// Script entry point: run main() and, if it rejects, print the error and
// terminate the process with exit status 1.
const exitWithFailure = failure => {
  console.error(failure);
  process.exit(1);
};
main().then(undefined, exitWithFailure);