Skip to content

Commit

Permalink
chore: move postgres storage classes to @llamaindex/postgres (#1597)
Browse files Browse the repository at this point in the history
  • Loading branch information
thucpn authored Jan 20, 2025
1 parent d6c270e commit 9456616
Show file tree
Hide file tree
Showing 54 changed files with 674 additions and 505 deletions.
8 changes: 8 additions & 0 deletions .changeset/polite-coats-return.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
---
"@llamaindex/e2e": patch
"@llamaindex/core": patch
"llamaindex": patch
"pg-vector-store": patch
---

refactor: @llamaindex/postgres
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ jobs:
done
- name: Pack provider packages
run: |
for dir in packages/providers/*; do
for dir in packages/providers/* packages/providers/storage/*; do
if [ -d "$dir" ] && [ -f "$dir/package.json" ]; then
echo "Packing $dir"
pnpm pack --pack-destination ${{ runner.temp }} -C $dir
Expand Down
3 changes: 1 addition & 2 deletions e2e/node/vector-store/pg-vector-store.e2e.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import { config } from "dotenv";
import { Document, VectorStoreQueryMode } from "llamaindex";
import { PGVectorStore } from "llamaindex/vector-store/PGVectorStore";
import { Document, PGVectorStore, VectorStoreQueryMode } from "llamaindex";
import assert from "node:assert";
import { test } from "node:test";
import pg from "pg";
Expand Down
21 changes: 19 additions & 2 deletions examples/vector-store/pg/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,14 +37,31 @@ Read and follow the instructions in the README.md file located one directory up

To import documents and save the embedding vectors to your database:

> `npx tsx pg-vector-store/load-docs.ts data`
> `npx tsx vector-store/pg/load-docs.ts data`
where `data` is the directory containing your input files. Using the `data` directory in the example above will read all of the files in that directory using the LlamaIndexTS default readers for each file type.

## RAG Querying

To query using the resulting vector store:

> `npx tsx pg-vector-store/query.ts`
> `npx tsx vector-store/pg/query.ts`
The script will prompt for a question, then process and present the answer using the PGVectorStore data and your OpenAI API key. It will continue to prompt until you enter `q`, `quit` or `exit` as the next query.

## Supabase

You can try the Supabase example by running:

> `npx tsx vector-store/pg/supabase.ts`
This will use the `POSTGRES_URL` environment variable to connect to your Supabase database.
Get one from the Supabase project settings page. See more details here: https://supabase.com/docs/guides/database/connecting-to-postgres#direct-connection

## Vercel

You can try the Vercel example by running:

> `npx tsx vector-store/pg/vercel.ts`
For more information on Vercel Postgres, see: https://vercel.com/docs/storage/vercel-postgres/sdk
34 changes: 34 additions & 0 deletions examples/vector-store/pg/supabase.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import dotenv from "dotenv";
import {
  PGVectorStore,
  SimpleDirectoryReader,
  storageContextFromDefaults,
  VectorStoreIndex,
} from "llamaindex";

// Load POSTGRES_URL (and any other settings) from a local .env file.
dotenv.config();

// Supply a direct Supabase connection string via the POSTGRES_URL env var.
// See: https://supabase.com/docs/guides/database/connecting-to-postgres#direct-connection

const sourceDir = "../data";
const connectionString = process.env.POSTGRES_URL;

// Read every file under sourceDir with the default LlamaIndex readers.
const reader = new SimpleDirectoryReader();
const documents = await reader.loadData({ directoryPath: sourceDir });

// Point a Postgres-backed vector store at the Supabase database and group
// this run's embeddings under a collection named after the source directory.
const vectorStore = new PGVectorStore({ clientConfig: { connectionString } });
vectorStore.setCollection(sourceDir);

const storageContext = await storageContextFromDefaults({ vectorStore });

// Embed the documents and persist the vectors through the storage context.
const index = await VectorStoreIndex.fromDocuments(documents, {
  storageContext,
});

// Run a single RAG query against the freshly built index.
const queryEngine = index.asQueryEngine();
const results = await queryEngine.query({
  query: "Information about the planet",
});

console.log(results);
15 changes: 15 additions & 0 deletions packages/core/src/global/settings.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import { getEnv } from "@llamaindex/env";
import type { Tokenizer } from "@llamaindex/env/tokenizers";
import type { BaseEmbedding } from "../embeddings";
import type { LLM } from "../llms";
import {
type CallbackManager,
Expand All @@ -12,6 +13,11 @@ import {
setChunkSize,
withChunkSize,
} from "./settings/chunk-size";
import {
getEmbeddedModel,
setEmbeddedModel,
withEmbeddedModel,
} from "./settings/embedModel";
import { getLLM, setLLM, withLLM } from "./settings/llm";
import {
getTokenizer,
Expand All @@ -29,6 +35,15 @@ export const Settings = {
withLLM<Result>(llm: LLM, fn: () => Result): Result {
return withLLM(llm, fn);
},
get embedModel() {
return getEmbeddedModel();
},
set embedModel(embedModel) {
setEmbeddedModel(embedModel);
},
withEmbedModel<Result>(embedModel: BaseEmbedding, fn: () => Result): Result {
return withEmbeddedModel(embedModel, fn);
},
get tokenizer() {
return getTokenizer();
},
Expand Down
Original file line number Diff line number Diff line change
@@ -1,15 +1,18 @@
import type { BaseEmbedding } from "@llamaindex/core/embeddings";
import { AsyncLocalStorage } from "@llamaindex/env";
import { OpenAIEmbedding } from "@llamaindex/openai";

const embeddedModelAsyncLocalStorage = new AsyncLocalStorage<BaseEmbedding>();
let globalEmbeddedModel: BaseEmbedding | null = null;

export function getEmbeddedModel(): BaseEmbedding {
if (globalEmbeddedModel === null) {
globalEmbeddedModel = new OpenAIEmbedding();
const currentEmbeddedModel =
embeddedModelAsyncLocalStorage.getStore() ?? globalEmbeddedModel;
if (!currentEmbeddedModel) {
throw new Error(
"Cannot find Embedding, please set `Settings.embedModel = ...` on the top of your code",
);
}
return embeddedModelAsyncLocalStorage.getStore() ?? globalEmbeddedModel;
return currentEmbeddedModel;
}

export function setEmbeddedModel(embeddedModel: BaseEmbedding) {
Expand Down
167 changes: 167 additions & 0 deletions packages/core/src/storage/doc-store/base-document-store.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
import { path } from "@llamaindex/env";
import {
DEFAULT_DOC_STORE_PERSIST_FILENAME,
DEFAULT_PERSIST_DIR,
} from "../../global";
import type { StoredValue } from "../../schema";
import { BaseNode, Document, ObjectType, TextNode } from "../../schema";

const TYPE_KEY = "__type__";
const DATA_KEY = "__data__";

/**
 * Converts in-memory node data to a persisted representation and back.
 * `T` is the on-disk shape (e.g. a JSON string, or a plain object).
 */
export interface Serializer<T> {
  toPersistence(data: Record<string, unknown>): T;

  fromPersistence(data: T): Record<string, unknown>;
}

/** Serializer that round-trips node data through a JSON string. */
export const jsonSerializer: Serializer<string> = {
  toPersistence: (data) => JSON.stringify(data),
  fromPersistence: (data) => JSON.parse(data),
};

/** Pass-through serializer: persists the plain object unchanged. */
export const noneSerializer: Serializer<Record<string, unknown>> = {
  toPersistence: (data) => data,
  fromPersistence: (data) => data,
};

type DocJson<Data> = {
[TYPE_KEY]: ObjectType;
[DATA_KEY]: Data;
};

/**
 * Type guard: true when `docJson` is a non-null object carrying both the
 * type tag (`__type__`) and payload (`__data__`) keys used by the
 * doc-store persistence envelope.
 */
export function isValidDocJson(
  docJson: StoredValue | null | undefined,
): docJson is DocJson<unknown> {
  if (typeof docJson !== "object" || docJson === null) {
    return false;
  }
  return docJson[TYPE_KEY] !== undefined && docJson[DATA_KEY] !== undefined;
}

/**
 * Serialize a node into the tagged `{ __data__, __type__ }` persistence
 * envelope, using the given serializer for the node payload.
 */
export function docToJson(
  doc: BaseNode,
  serializer: Serializer<unknown>,
): DocJson<unknown> {
  const persistedData = serializer.toPersistence(doc.toJSON());
  return {
    [DATA_KEY]: persistedData,
    [TYPE_KEY]: doc.type,
  };
}

/**
 * Rebuild a node from its persisted `{ __type__, __data__ }` envelope.
 * Only DOCUMENT and TEXT node types are supported; any other tag throws.
 */
export function jsonToDoc<Data>(
  docDict: DocJson<Data>,
  serializer: Serializer<Data>,
): BaseNode {
  const docType = docDict[TYPE_KEY];
  // fixme: zod type check this
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  const dataDict: any = serializer.fromPersistence(docDict[DATA_KEY]);

  switch (docType) {
    case ObjectType.DOCUMENT:
      return new Document({
        text: dataDict.text,
        id_: dataDict.id_,
        embedding: dataDict.embedding,
        hash: dataDict.hash,
        metadata: dataDict.metadata,
      });
    case ObjectType.TEXT:
      return new TextNode({
        text: dataDict.text,
        id_: dataDict.id_,
        hash: dataDict.hash,
        metadata: dataDict.metadata,
        relationships: dataDict.relationships,
      });
    default:
      throw new Error(`Unknown doc type: ${docType}`);
  }
}

// Default on-disk location for a persisted docstore
// (DEFAULT_PERSIST_DIR joined with the docstore filename).
const DEFAULT_PERSIST_PATH = path.join(
  DEFAULT_PERSIST_DIR,
  DEFAULT_DOC_STORE_PERSIST_FILENAME,
);

/**
 * Bookkeeping for a source ("ref") document: the ids of the nodes derived
 * from it, plus arbitrary extra metadata.
 */
export interface RefDocInfo {
  nodeIds: string[];
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  extraInfo: Record<string, any>;
}

/**
 * Abstract base class for document (node) stores.
 *
 * A docstore maps node ids to serialized nodes and tracks per-document
 * hashes plus ref-doc (source document) bookkeeping. Concrete subclasses
 * implement the storage backend; this class supplies shared node-retrieval
 * helpers built on top of `getDocument`.
 */
export abstract class BaseDocumentStore {
  // Serialization strategy used when persisting nodes; JSON by default.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  serializer: Serializer<any> = jsonSerializer;

  // Save/load
  /**
   * Persist the docstore to a file. No-op in the base class; backends
   * with file persistence override this.
   */
  persist(persistPath: string = DEFAULT_PERSIST_PATH): void {
    // Persist the docstore to a file.
  }

  // Main interface
  /** All stored nodes, keyed by node id. */
  abstract docs(): Promise<Record<string, BaseNode>>;

  /** Add nodes; `allowUpdate` controls whether existing ids are overwritten. */
  abstract addDocuments(docs: BaseNode[], allowUpdate: boolean): Promise<void>;

  /** Fetch one node; behavior on a missing id depends on `raiseError`. */
  abstract getDocument(
    docId: string,
    raiseError: boolean,
  ): Promise<BaseNode | undefined>;

  abstract deleteDocument(docId: string, raiseError: boolean): Promise<void>;

  abstract documentExists(docId: string): Promise<boolean>;

  // Hash
  abstract setDocumentHash(docId: string, docHash: string): Promise<void>;

  abstract getDocumentHash(docId: string): Promise<string | undefined>;

  abstract getAllDocumentHashes(): Promise<Record<string, string>>;

  // Ref Docs
  abstract getAllRefDocInfo(): Promise<Record<string, RefDocInfo> | undefined>;

  abstract getRefDocInfo(refDocId: string): Promise<RefDocInfo | undefined>;

  abstract deleteRefDoc(refDocId: string, raiseError: boolean): Promise<void>;

  // Nodes
  /** Fetch several nodes in parallel; see `getNode` for error behavior. */
  getNodes(nodeIds: string[], raiseError: boolean = true): Promise<BaseNode[]> {
    return Promise.all(
      nodeIds.map((nodeId) => this.getNode(nodeId, raiseError)),
    );
  }

  /**
   * Fetch one node, throwing if the stored document is not a `BaseNode`
   * (e.g. the id is missing and the backend returned undefined).
   */
  async getNode(nodeId: string, raiseError: boolean = true): Promise<BaseNode> {
    const doc = await this.getDocument(nodeId, raiseError);
    if (!(doc instanceof BaseNode)) {
      throw new Error(`Document ${nodeId} is not a Node.`);
    }
    return doc;
  }

  /**
   * Resolve an index -> node-id mapping into an index -> node mapping.
   * Lookups run in parallel (consistent with `getNodes`) instead of
   * awaiting one id at a time.
   */
  async getNodeDict(nodeIdDict: {
    [index: number]: string;
  }): Promise<Record<number, BaseNode>> {
    const result: Record<number, BaseNode> = {};
    await Promise.all(
      Object.entries(nodeIdDict).map(async ([index, nodeId]) => {
        result[Number(index)] = await this.getNode(nodeId);
      }),
    );
    return result;
  }
}
Loading

0 comments on commit 9456616

Please sign in to comment.