-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindex.js
117 lines (101 loc) · 3.08 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import { mkdir, readdir, readFile, writeFile } from "node:fs/promises";
import path from "path";
import { fileURLToPath } from "url";
import { UnstructuredClient } from "unstructured-client";
import { createLogger, format, transports } from "winston";
import "dotenv/config";
// ES modules have no built-in __filename / __dirname, so derive both from
// this module's URL.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// Runtime configuration, read from environment variables.
// NOTE(review): the "dotenv/config" import at the top of the file presumably
// loads these from a .env file - confirm against the project setup.
const {
  UNSTRUCTURED_API_URL: url,
  UNSTRUCTURED_API_KEY: key,
} = process.env;

// Partitioning strategy sent with every request; "fast" when unset or empty.
const strategy = process.env.STRATEGY || "fast";

// Only the exact string "true" enables page splitting.
const splitPages = process.env.SPLIT_PAGES === "true";

// Input and output directories default to folders next to this script.
const docsPath = process.env.DOCS_PATH || path.resolve(__dirname, "sample_data");
const outputPath = process.env.OUTPUT_PATH || path.resolve(__dirname, "output");
// Shared winston logger: JSON-formatted entries written to two log files.
// The first file transport is restricted to the "error" level; the second has
// no level of its own and therefore follows the logger's "info" level.
const errorLogFile = new transports.File({
  filename: "error.log",
  level: "error",
});
const combinedLogFile = new transports.File({ filename: "combined.log" });

const logger = createLogger({
  level: "info",
  format: format.json(),
  transports: [errorLogFile, combinedLogFile],
});
/**
 * Lists the entries of a directory, each prefixed with the directory path.
 *
 * Sub-directories are left out: callers hand every returned path to a file
 * reader, and a directory there can only fail. Any other kind of entry is
 * returned unchanged.
 *
 * @param {string} docsPath - Directory to list.
 * @returns {Promise<string[]>} Paths of the form `${docsPath}/${entryName}`.
 * @throws Rejects with the readdir error when the directory cannot be read.
 */
const getFilesWithPath = async (docsPath) => {
  const entries = await readdir(docsPath, { withFileTypes: true });
  return entries
    .filter((entry) => !entry.isDirectory())
    // Keep the "/" separator: other code in this file splits paths on "/".
    .map((entry) => `${docsPath}/${entry.name}`);
};
/**
 * Reads a whole file into memory.
 *
 * @param {string} filePath - Path of the file to read.
 * @returns {Promise<Buffer>} The raw file contents.
 * @throws {Error} `Failed to read <filePath>`; the underlying file-system
 *   error is attached as `cause` so the real reason (ENOENT, EACCES, ...)
 *   is not lost.
 */
const getFileData = async (filePath) => {
  try {
    return await readFile(filePath);
  } catch (e) {
    throw new Error(`Failed to read ${filePath}`, { cause: e });
  }
};
/**
 * Builds an UnstructuredClient for the given server and API key.
 *
 * NOTE: the identifier is spelled "createCliet"; the spelling is kept because
 * processFile calls the function by this name.
 *
 * @param {string} url - Passed to the client as `serverURL`.
 * @param {string} key - Passed to the client as `security.apiKeyAuth`.
 * @returns {UnstructuredClient} A newly constructed client.
 */
const createCliet = (url, key) => {
  const clientOptions = {
    serverURL: url,
    security: { apiKeyAuth: key },
  };
  return new UnstructuredClient(clientOptions);
};
/**
 * Sends one document to the partition endpoint of the given client.
 *
 * The module-level `strategy` and `splitPages` settings are included in every
 * request.
 *
 * @param {*} data - File contents, sent as `files.content`.
 * @param {string} file - Sent as `files.fileName` (callers in this file pass
 *   the full path).
 * @param {object} client - Object exposing `general.partition(request)`.
 * @returns {Promise<*>} Whatever `client.general.partition` resolves to.
 */
const partition = async (data, file, client) => {
  const partitionParameters = {
    files: { content: data, fileName: file },
    strategy,
    splitPdfPage: splitPages,
  };
  const response = await client.general.partition({ partitionParameters });
  return response;
};
/**
 * Writes the partitioned elements of one document to
 * `<outputPath>/<input file name>.json`.
 *
 * The output directory is created when it does not exist yet, so a fresh
 * checkout without that directory no longer makes every write fail.
 *
 * @param {string} outputPath - Directory that receives the JSON file.
 * @param {string} file - Path of the source document; only its base name is
 *   used to name the output file.
 * @param {{ elements: * }} res - Partition response; `res.elements` is
 *   serialized with JSON.stringify.
 * @returns {Promise<boolean>} Always `true` once the file has been written.
 * @throws Rejects when the directory cannot be created or the file cannot be
 *   written.
 */
const writeResults = async (outputPath, file, res) => {
  const outputName = `${path.basename(file)}.json`;
  await mkdir(outputPath, { recursive: true });
  // The promise-based writeFile reports failures by rejecting, so no callback
  // is passed; the rejection propagates to the caller.
  await writeFile(
    path.join(outputPath, outputName),
    JSON.stringify(res.elements)
  );
  return true;
};
/**
 * Runs the full pipeline for a single document: read it, partition it, and
 * write the result to the output directory.
 *
 * Failures are logged through `logger.error` and not re-thrown, so the
 * returned promise always resolves.
 *
 * @param {string} file - Path of the document to process.
 * @returns {Promise<void>}
 */
const processFile = async (file) => {
  try {
    // The same label starts the winston profile timer and later stops it.
    const timerLabel = `Partitioning ${file}`;
    logger.profile(timerLabel);

    const client = createCliet(url, key);
    const contents = await getFileData(file);
    const partitioned = await partition(contents, file, client);
    const written = await writeResults(outputPath, file, partitioned);
    if (!written) {
      return;
    }

    logger.profile(timerLabel);
    logger.info(`Partitioning ${file} complete`);
  } catch (e) {
    logger.error(`Failed to process ${file}`, e);
  }
};
/**
 * Processes every entry found under `docsPath` concurrently, then logs the
 * overall runtime and the number of input and output files.
 *
 * Any error raised along the way is logged through `logger.error` instead of
 * being re-thrown.
 *
 * @returns {Promise<void>}
 */
async function main() {
  try {
    // The same label starts the overall profile timer and later stops it.
    const runLabel = `Overall Runtime with strategy ${strategy} and split pages ${splitPages}`;
    logger.profile(runLabel);

    const files = await getFilesWithPath(docsPath);

    // Start every file before waiting on any of them.
    const pending = [];
    for (const file of files) {
      pending.push(processFile(file));
    }
    await Promise.all(pending);

    const outputFiles = await getFilesWithPath(outputPath);
    logger.profile(runLabel);
    logger.info(
      `input files: ${files.length} output files: ${outputFiles.length}`
    );
  } catch (e) {
    logger.error("Failed to process files", e);
  }
}
// Script entry point: start the run. The returned promise is not awaited
// here; main() wraps its whole body in try/catch and logs any failure.
main();