index.js
import { ApifyDatasetLoader } from "langchain/document_loaders/web/apify_dataset";
import { OpenAIEmbeddings } from "@langchain/openai";
import { Document } from "@langchain/core/documents";
import { Actor, log } from "apify";
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
import { Pinecone } from "@pinecone-database/pinecone";
import { PineconeStore } from "@langchain/pinecone";

// Attribute API activity to Apify
const SOURCE_TAG = "apify";

// Resolve a dot-separated key path (e.g. "a.b.c") against a nested object.
// Returns null when any key along the path is missing.
function getNestedValue(dataDict, keysStr) {
    const keys = keysStr.split(".");
    let result = dataDict;
    for (const key of keys) {
        // Guard against null and non-object values before the `in` check,
        // which would otherwise throw a TypeError.
        if (result !== null && typeof result === "object" && key in result) {
            result = result[key];
        } else {
            // If any of the keys is not found, return null
            return null;
        }
    }
    return result;
}
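
// Illustrative examples (hypothetical data):
//   getNestedValue({ a: { b: "hello" } }, "a.b") // => "hello"
//   getNestedValue({ a: { b: "hello" } }, "a.c") // => null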

await Actor.init();

const actorInput = await Actor.getInput();
const {
    index_name,
    metadata_fields,
    metadata_values,
    pinecone_token,
    openai_token,
    fields = [],
    chunk_size,
    chunk_overlap,
} = actorInput;

// The dataset ID comes either from a webhook payload or directly from the actor input.
const datasetId = actorInput?.payload?.resource?.defaultDatasetId || actorInput.dataset_id;
if (!datasetId) {
    const msg = "The input parameter dataset_id is required";
    await Actor.fail(msg);
    throw new Error(msg);
}
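
// Illustrative actor input; every value below is a hypothetical placeholder,
// only the field names are taken from the destructuring above:
// {
//     "dataset_id": "abc123XYZ",
//     "index_name": "my-index",
//     "fields": ["text"],
//     "metadata_fields": { "url": "url" },
//     "metadata_values": { "source": "my-crawler" },
//     "chunk_size": 1000,
//     "chunk_overlap": 100,
//     "pinecone_token": "...",
//     "openai_token": "..."
// }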

for (const field of fields) {
    // Map every dataset item to a LangChain Document: the selected field becomes
    // the page content, and metadata combines static values with mapped fields.
    const loader = new ApifyDatasetLoader(datasetId, {
        datasetMappingFunction: (datasetItem) =>
            new Document({
                // Fall back to an empty string so items missing the field
                // don't produce documents with null page content.
                pageContent: getNestedValue(datasetItem, field) ?? "",
                metadata: {
                    ...metadata_values,
                    ...Object.entries(metadata_fields || {}).reduce((acc, [key, value]) => {
                        acc[key] = getNestedValue(datasetItem, value);
                        return acc;
                    }, {}),
                },
            }),
    });
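
    // Illustrative mapping (hypothetical item): with field "text" and
    // metadata_fields { "url": "url" }, the dataset item
    //   { text: "Hello", url: "https://example.com" }
    // becomes
    //   new Document({ pageContent: "Hello", metadata: { url: "https://example.com" } })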
console.log("Loading documents from Apify for field", {field});
const documents = await loader.load();
console.log(`Split documents into chunks with chunkSize: ${chunk_size} and ${chunk_overlap}`);
const textSplitter = new RecursiveCharacterTextSplitter({chunkSize: chunk_size, chunkOverlap: chunk_overlap})
const docs = await textSplitter.splitDocuments(documents)
console.log(`Created ${docs.length} chunks.`);
console.log("Initializing Pinecone");
const pinecone = new Pinecone({ apiKey: pinecone_token, sourceTag: SOURCE_TAG });
const pineconeIndex = pinecone.index(index_name)

    try {
        await PineconeStore.fromDocuments(docs, new OpenAIEmbeddings({ openAIApiKey: openai_token }), {
            pineconeIndex,
            maxConcurrency: 5, // Maximum number of batch requests to allow at once. Each batch is 1000 vectors.
        });
        log.info("Documents inserted into Pinecone");
    } catch (e) {
        const errorMessage = `Index creation failed: ${e}`;
        log.error(errorMessage);
        await Actor.setStatusMessage(errorMessage);
        await Actor.fail();
    }
}

// Exit only after all fields have been processed.
await Actor.exit();
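
// Usage sketch (assumes the Apify CLI is installed; the input JSON shown above
// would go to ./storage/key_value_stores/default/INPUT.json for a local run):
//
//   apify run
//
// On the Apify platform, the actor can also be triggered by a run-succeeded
// webhook, which supplies payload.resource.defaultDatasetId automatically.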