Skip to content

Commit

Permalink
add notebook
Browse files Browse the repository at this point in the history
Signed-off-by: cmuhao <[email protected]>
  • Loading branch information
HaoXuAI committed Apr 17, 2024
1 parent e8c3882 commit c3a0117
Show file tree
Hide file tree
Showing 10 changed files with 3,785 additions and 5 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,5 @@ terraform.tfstate.backup
.vscode/*
**/derby.log
**/metastore_db/*
.env
.env
.idea
1 change: 1 addition & 0 deletions module_4_rag/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
data/*
1 change: 1 addition & 0 deletions module_4_rag/.python-version
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
3.9
8 changes: 5 additions & 3 deletions module_4_rag/batch_score_documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
import torch
import torch.nn.functional as F

INPUT_FILENAME = "city_wikipedia_summaries.csv"
EXPORT_FILENAME = "city_wikipedia_summaries_with_embeddings.csv"
INPUT_FILENAME = "./data/city_wikipedia_summaries.csv"
EXPORT_FILENAME = "./data/city_wikipedia_summaries_with_embeddings.parquet"
TOKENIZER = 'sentence-transformers/all-MiniLM-L6-v2'
MODEL = 'sentence-transformers/all-MiniLM-L6-v2'

Expand Down Expand Up @@ -35,8 +35,10 @@ def score_data() -> None:
print('shape = ', df.shape)
df['Embeddings'] = list(embeddings.detach().cpu().numpy())
print("embeddings generated...")
df['event_timestamp'] = pd.to_datetime('today')
df["item_id"] = df.index
print(df.head())
df.to_csv(EXPORT_FILENAME, index=False)
df.to_parquet(EXPORT_FILENAME, index=False)
print("...data exported. job complete")
else:
print("scored data found...skipping generating embeddings.")
Expand Down
3 changes: 3 additions & 0 deletions module_4_rag/feature_repo/entities.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from feast import Entity

# Entity named "item_id". The scoring script (batch_score_documents.py) writes
# a column with the same name, copied from the DataFrame index, into the
# exported parquet file.
# NOTE(review): no join_keys are given, so presumably Feast joins on the
# entity name itself — confirm against the Feast version in use.
item = Entity(
    name="item_id",
)
20 changes: 20 additions & 0 deletions module_4_rag/feature_repo/feature_store.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Feast repository configuration for the local RAG demo.
project: feast_demo_local
provider: local

# SQL-backed registry stored in the local Postgres "feast" database.
registry:
  registry_type: sql
  path: postgresql://@localhost:5432/feast

# Postgres online store with pgvector enabled for embedding storage.
online_store:
  type: postgres
  pgvector_enabled: true
  # NOTE(review): presumably the dimension of the embeddings produced by the
  # scoring script's model — confirm before changing the model.
  vector_len: 384
  host: 127.0.0.1
  port: 5432
  database: feast
  # Credentials are intentionally blank here; supply local values as needed
  # and do not commit real secrets.
  user: ""
  password: ""


# Offline features are read from local files (the exported parquet).
offline_store:
  type: file
entity_key_serialization_version: 2

28 changes: 28 additions & 0 deletions module_4_rag/feature_repo/features.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
from datetime import timedelta

from feast import (
FeatureView,
Field, FileSource,
)
from feast.data_format import ParquetFormat
from feast.types import Float32, Array
from entities import item


# Parquet file holding the city summaries together with their embeddings.
# NOTE(review): the path is relative ("../data/..."); presumably it is resolved
# from the feature repo directory — confirm where Feast commands are run from.
parquet_file_path = "../data/city_wikipedia_summaries_with_embeddings.parquet"

# File-backed data source over that parquet file; the "event_timestamp" column
# supplies each row's event time.
source = FileSource(
    path=parquet_file_path,
    timestamp_field="event_timestamp",
    file_format=ParquetFormat(),
)

# Feature view "city_embeddings": one float32 array column ("Embeddings") per
# `item` entity, read from the parquet-backed `source`.
# NOTE(review): the scoring script stamps event_timestamp once, at the time the
# embeddings are generated; confirm a 2-hour TTL is long enough for the
# intended materialization / retrieval window.
city_embeddings_feature_view = FeatureView(
    name="city_embeddings",
    source=source,
    entities=[item],
    ttl=timedelta(hours=2),
    schema=[Field(name="Embeddings", dtype=Array(Float32))],
)
354 changes: 354 additions & 0 deletions module_4_rag/feature_repo/module_1.ipynb

Large diffs are not rendered by default.

3,370 changes: 3,370 additions & 0 deletions module_4_rag/poetry.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion module_4_rag/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ packages = [{include = "feast_rag"}]

[tool.poetry.dependencies]
python = "^3.9"
feast = "^0.35.0"
feast = "^0.37.0"
torch = "^2.2.0"
flasgger = "^0.9.7.1"
wikipedia = "^1.4.0"
Expand Down

0 comments on commit c3a0117

Please sign in to comment.