-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathutils.js
60 lines (50 loc) · 1.61 KB
/
utils.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
const { encode, decode } = require('gpt-3-encoder');
// Fallback per-chunk token limit; splitTextIntoChunks uses it when the caller
// passes no (or a falsy) numTokens.
var MAX_TOKENS = 2000
/**
 * Count how many tokens `encode` produces for the given text.
 *
 * @param {string} text - Text to measure.
 * @returns {number} Length of the token array returned by `encode(text)`.
 */
function countTokens(text) {
  return encode(text).length;
}
/**
 * Split text into chunks whose (approximate) token count stays within a limit.
 *
 * The text is divided into paragraphs on blank lines. Paragraphs are packed
 * into the current chunk, separated by a blank line, for as long as they fit.
 * A paragraph that would overflow the limit is divided into sentences, which
 * are packed the same way; a new chunk is started whenever the next sentence
 * would overflow.
 *
 * Notes:
 * - Sentence-ending punctuation (`.`, `!`, `?`) is kept with its sentence.
 * - Chunk sizes are the sum of per-piece token counts, so they approximate the
 *   token count of the joined chunk text.
 * - A single sentence that alone exceeds the limit is emitted as its own
 *   oversized chunk; it is never cut mid-sentence.
 *
 * @param {string} text - Text to split.
 * @param {number} [numTokens] - Maximum tokens per chunk; falls back to
 *   MAX_TOKENS when omitted or falsy.
 * @returns {string[]} Non-empty chunks in original order.
 */
function splitTextIntoChunks(text, numTokens) {
  const limit = numTokens || MAX_TOKENS;
  const chunks = [];
  let currentChunk = '';
  let tokens = 0;

  const paragraphs = text.split(/\n\n+/);
  for (const paragraph of paragraphs) {
    // Leading/trailing blank lines produce empty pieces; they carry no content.
    if (!paragraph) {
      continue;
    }

    const paragraphTokens = countTokens(paragraph);
    if (tokens + paragraphTokens <= limit) {
      // The paragraph fits: restore the blank-line separator removed by split().
      currentChunk += currentChunk ? `\n\n${paragraph}` : paragraph;
      tokens += paragraphTokens;
      continue;
    }

    // The paragraph would overflow: pack it sentence by sentence. Each match is
    // a run of text plus its terminating punctuation, or the unterminated tail,
    // so concatenating the matches reproduces the paragraph exactly.
    const sentences = paragraph.match(/[^.!?]*[.!?]+|[^.!?]+$/g) || [];
    let isParagraphStart = true;
    for (const sentence of sentences) {
      const sentenceTokens = countTokens(sentence);
      if (tokens + sentenceTokens > limit) {
        // Never emit an empty chunk (e.g. when the very first sentence is
        // already larger than the limit).
        if (currentChunk) {
          chunks.push(currentChunk);
        }
        // A chunk should not begin with the whitespace that followed the
        // previous sentence.
        currentChunk = sentence.replace(/^\s+/, '');
        tokens = sentenceTokens;
      } else {
        // Only the first sentence of a paragraph needs the paragraph separator;
        // later sentences carry their own leading whitespace.
        const separator = isParagraphStart && currentChunk ? '\n\n' : '';
        currentChunk += `${separator}${sentence}`;
        tokens += sentenceTokens;
      }
      isParagraphStart = false;
    }
  }

  // Flush whatever is left in the final, partially filled chunk.
  if (currentChunk) {
    chunks.push(currentChunk);
  }
  return chunks;
}
// Public API: the token counter and the token-aware text chunker.
module.exports = { countTokens, splitTextIntoChunks };