Skip to content

Commit

Permalink
Separated database initialization logic by role
Browse files Browse the repository at this point in the history
  • Loading branch information
arueth committed Jan 24, 2025
1 parent 70ef0eb commit b7dd5b8
Show file tree
Hide file tree
Showing 10 changed files with 400 additions and 160 deletions.
59 changes: 44 additions & 15 deletions use-cases/rag-pipeline/alloy-db-setup/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -78,28 +78,33 @@ MLP accounts MLP_DB_ADMIN_IAM and MLP_DB_USER_IAM need Storage object permission
```sh
export CATALOG_DB="product_catalog"
export CATALOG_TABLE_NAME="clothes"
export DB_READ_USERS="${MLP_DB_USER_IAM}"
export DB_WRITE_USERS="${MLP_DB_USER_IAM}"
export EMBEDDING_COLUMN_IMAGE="image_embeddings"
export EMBEDDING_COLUMN_MULTIMODAL="multimodal_embeddings"
export EMBEDDING_COLUMN_TEXT="text_embeddings"
export EMBEDDING_DIMENSION="\"768\""
export IMAGE_EMBEDDING_ENDPOINT="http://multimodal-embedding-model.ml-team:80/image_embeddings"
export EMBEDDING_ENDPOINT_IMAGE="http://multimodal-embedding-model.ml-team:80/image_embeddings"
export MASTER_CATALOG_FILE_NAME="master_product_catalog.csv"
export MULTIMODAL_EMBEDDING_ENDPOINT="http://multimodal-embedding-model.ml-team:80/multimodal_embeddings"
export EMBEDDING_ENDPOINT_MULTIMODAL="http://multimodal-embedding-model.ml-team:80/multimodal_embeddings"
export NUM_LEAVES_VALUE="\"300\""
export TEXT_EMBEDDING_ENDPOINT="http://multimodal-embedding-model.ml-team:80/text_embeddings"
export EMBEDDING_ENDPOINT_TEXT="http://multimodal-embedding-model.ml-team:80/text_embeddings"
```

```sh
git restore manifests/alloydb-setup-job.yaml
git restore manifests/job-initialize-database.yaml manifests/job-populate-table.yaml
sed \
-i -e "s|V_CATALOG_DB|${CATALOG_DB}|" \
-i -e "s|V_CATALOG_TABLE_NAME|${CATALOG_TABLE_NAME}|" \
-i -e "s|V_DB_ADMIN_KSA|${MLP_DB_ADMIN_KSA}|" \
-i -e "s|V_DB_READ_USERS|${DB_READ_USERS}|" \
-i -e "s|V_DB_USER_KSA|${MLP_DB_USER_KSA}|" \
-i -e "s|V_DB_WRITE_USERS|${DB_WRITE_USERS}|" \
-i -e "s|V_IMAGE|${MLP_DB_SETUP_IMAGE}|" \
-i -e "s|V_KSA|${MLP_DB_ADMIN_KSA}|" \
-i -e "s|V_EMBEDDING_DIMENSION|${EMBEDDING_DIMENSION}|" \
-i -e "s|V_EMBEDDING_ENDPOINT_IMAGE|${IMAGE_EMBEDDING_ENDPOINT}|" \
-i -e "s|V_EMBEDDING_ENDPOINT_MULTIMODAL|${MULTIMODAL_EMBEDDING_ENDPOINT}|" \
-i -e "s|V_EMBEDDING_ENDPOINT_TEXT|${TEXT_EMBEDDING_ENDPOINT}|" \
-i -e "s|V_EMBEDDING_ENDPOINT_IMAGE|${EMBEDDING_ENDPOINT_IMAGE}|" \
-i -e "s|V_EMBEDDING_ENDPOINT_MULTIMODAL|${EMBEDDING_ENDPOINT_MULTIMODAL}|" \
-i -e "s|V_EMBEDDING_ENDPOINT_TEXT|${EMBEDDING_ENDPOINT_TEXT}|" \
-i -e "s|V_EMBEDDING_COLUMN_TEXT|${EMBEDDING_COLUMN_TEXT}|" \
-i -e "s|V_EMBEDDING_COLUMN_IMAGE|${EMBEDDING_COLUMN_IMAGE}|" \
-i -e "s|V_EMBEDDING_COLUMN_MULTIMODAL|${EMBEDDING_COLUMN_MULTIMODAL}|" \
Expand All @@ -110,32 +115,56 @@ MLP accounts MLP_DB_ADMIN_IAM and MLP_DB_USER_IAM need Storage object permission
-i -e "s|V_NUM_LEAVES_VALUE|${NUM_LEAVES_VALUE}|" \
-i -e "s|V_PROCESSED_DATA_BUCKET|${MLP_DATA_BUCKET}|" \
-i -e "s|V_PROJECT_ID|${MLP_PROJECT_ID}|" \
manifests/alloydb-setup-job.yaml
manifests/job-initialize-database.yaml \
manifests/job-populate-table.yaml
```

- Create the job.
- Create the `initialize-database` job.

```
kubectl --namespace ${MLP_KUBERNETES_NAMESPACE} apply -f manifests/alloydb-setup-job.yaml
kubectl --namespace ${MLP_KUBERNETES_NAMESPACE} apply -f manifests/job-initialize-database.yaml
```

> The job runs for about two hours
- Check the status of the job.

```
kubectl --namespace ${MLP_KUBERNETES_NAMESPACE} get job/initialize-database
```

- Watch the job until it is complete.

```
# Poll the job every 5 seconds. The '^' pattern matches every line so all
# output is kept, while 'Complete' is highlighted in bright green.
# grep -E is used because egrep is obsolescent and warns on GNU grep >= 3.8.
watch --color --interval 5 --no-title \
"kubectl --namespace ${MLP_KUBERNETES_NAMESPACE} get job/initialize-database | GREP_COLORS='mt=01;92' grep -E --color=always -e '^' -e 'Complete'"
```

- Check logs for any errors.

```
kubectl --namespace ${MLP_KUBERNETES_NAMESPACE} logs job/initialize-database
```

- Create the `populate-table` job.

```
kubectl --namespace ${MLP_KUBERNETES_NAMESPACE} apply -f manifests/job-populate-table.yaml
```

- Check the status of the job.

```
kubectl --namespace ${MLP_KUBERNETES_NAMESPACE} get job/alloydb-setup
kubectl --namespace ${MLP_KUBERNETES_NAMESPACE} get job/populate-table
```

- Watch the job until it is complete.

```
# Poll the job every 5 seconds. The '^' pattern matches every line so all
# output is kept, while 'Complete' is highlighted in bright green.
# grep -E is used because egrep is obsolescent and warns on GNU grep >= 3.8.
watch --color --interval 5 --no-title \
"kubectl --namespace ${MLP_KUBERNETES_NAMESPACE} get job/alloydb-setup | GREP_COLORS='mt=01;92' egrep --color=always -e '^' -e 'Complete'"
"kubectl --namespace ${MLP_KUBERNETES_NAMESPACE} get job/populate-table | GREP_COLORS='mt=01;92' grep -E --color=always -e '^' -e 'Complete'"
```

- Check logs for any errors.

```
kubectl --namespace ${MLP_KUBERNETES_NAMESPACE} logs job/alloydb-setup
kubectl --namespace ${MLP_KUBERNETES_NAMESPACE} logs job/populate-table
```
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Kubernetes Job that runs the `db_setup.initialize_database` Python module
# once, under the database admin Kubernetes service account.
#
# Every `V_*` value is a placeholder token. The README's `sed` step rewrites
# these tokens in place before the manifest is applied with `kubectl apply`.
apiVersion: batch/v1
kind: Job
metadata:
  name: initialize-database
spec:
  template:
    spec:
      # Service account the pod runs as (placeholder for MLP_DB_ADMIN_KSA).
      serviceAccountName: V_DB_ADMIN_KSA
      containers:
        - command:
            - python
            - -m
            - db_setup.initialize_database
          env:
            - name: CATALOG_DB
              value: V_CATALOG_DB
            - name: CATALOG_TABLE_NAME
              value: V_CATALOG_TABLE_NAME
            - name: DB_READ_USERS
              value: V_DB_READ_USERS
            - name: DB_WRITE_USERS
              value: V_DB_WRITE_USERS
            - name: EMBEDDING_COLUMN_IMAGE
              value: V_EMBEDDING_COLUMN_IMAGE
            - name: EMBEDDING_COLUMN_MULTIMODAL
              value: V_EMBEDDING_COLUMN_MULTIMODAL
            - name: EMBEDDING_COLUMN_TEXT
              value: V_EMBEDDING_COLUMN_TEXT
            # The README exports this value wrapped in literal double quotes
            # ("\"768\""), so the substituted text stays a YAML string.
            - name: EMBEDDING_DIMENSION
              value: V_EMBEDDING_DIMENSION
            - name: EMBEDDING_ENDPOINT_IMAGE
              value: V_EMBEDDING_ENDPOINT_IMAGE
            - name: EMBEDDING_ENDPOINT_MULTIMODAL
              value: V_EMBEDDING_ENDPOINT_MULTIMODAL
            - name: EMBEDDING_ENDPOINT_TEXT
              value: V_EMBEDDING_ENDPOINT_TEXT
            - name: MASTER_CATALOG_FILE_NAME
              value: V_MASTER_CATALOG_FILE_NAME
            - name: MLP_DB_ADMIN_IAM
              value: V_MLP_DB_ADMIN_IAM
            - name: MLP_DB_INSTANCE_URI
              value: V_MLP_DB_INSTANCE_URI
            - name: MLP_KUBERNETES_NAMESPACE
              value: V_MLP_KUBERNETES_NAMESPACE
            # Quoted the same way as EMBEDDING_DIMENSION in the README ("\"300\"").
            - name: NUM_LEAVES_VALUE
              value: V_NUM_LEAVES_VALUE
            - name: PROCESSED_DATA_BUCKET
              value: V_PROCESSED_DATA_BUCKET
          image: V_IMAGE
          imagePullPolicy: Always
          name: db-setup
      # Pod-level setting: do not restart the container in place when it exits.
      restartPolicy: Never
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2024 Google LLC
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -15,44 +15,52 @@
apiVersion: batch/v1
kind: Job
metadata:
name: alloydb-setup
name: populate-table
spec:
template:
spec:
serviceAccountName: V_KSA
serviceAccountName: V_DB_USER_KSA
containers:
- name: alloydb-setup
image: V_IMAGE
imagePullPolicy: Always
- command:
- python
- -m
- db_setup.populate_table
env:
- name: PROCESSED_DATA_BUCKET
value: V_PROCESSED_DATA_BUCKET
- name: MASTER_CATALOG_FILE_NAME
value: V_MASTER_CATALOG_FILE_NAME
- name: CATALOG_DB
value: V_CATALOG_DB
- name: CATALOG_TABLE_NAME
value: V_CATALOG_TABLE_NAME
- name: MLP_DB_ADMIN_IAM
value: V_MLP_DB_ADMIN_IAM
- name: EMBEDDING_DIMENSION
value: V_EMBEDDING_DIMENSION
- name: EMBEDDING_COLUMN_TEXT
value: V_EMBEDDING_COLUMN_TEXT
- name: DB_READ_USERS
value: V_DB_READ_USERS
- name: DB_WRITE_USERS
value: V_DB_WRITE_USERS
- name: EMBEDDING_COLUMN_IMAGE
value: V_EMBEDDING_COLUMN_IMAGE
- name: EMBEDDING_COLUMN_MULTIMODAL
value: V_EMBEDDING_COLUMN_MULTIMODAL
- name: NUM_LEAVES_VALUE
value: V_NUM_LEAVES_VALUE
- name: MLP_DB_INSTANCE_URI
value: V_MLP_DB_INSTANCE_URI
- name: TEXT_EMBEDDING_ENDPOINT
value: V_EMBEDDING_ENDPOINT_TEXT
- name: IMAGE_EMBEDDING_ENDPOINT
- name: EMBEDDING_COLUMN_TEXT
value: V_EMBEDDING_COLUMN_TEXT
- name: EMBEDDING_DIMENSION
value: V_EMBEDDING_DIMENSION
- name: EMBEDDING_ENDPOINT_IMAGE
value: V_EMBEDDING_ENDPOINT_IMAGE
- name: MULTIMODAL_EMBEDDING_ENDPOINT
- name: EMBEDDING_ENDPOINT_MULTIMODAL
value: V_EMBEDDING_ENDPOINT_MULTIMODAL
- name: EMBEDDING_ENDPOINT_TEXT
value: V_EMBEDDING_ENDPOINT_TEXT
- name: MASTER_CATALOG_FILE_NAME
value: V_MASTER_CATALOG_FILE_NAME
- name: MLP_DB_ADMIN_IAM
value: V_MLP_DB_ADMIN_IAM
- name: MLP_DB_INSTANCE_URI
value: V_MLP_DB_INSTANCE_URI
- name: MLP_KUBERNETES_NAMESPACE
value: V_MLP_KUBERNETES_NAMESPACE
- name: NUM_LEAVES_VALUE
value: V_NUM_LEAVES_VALUE
- name: PROCESSED_DATA_BUCKET
value: V_PROCESSED_DATA_BUCKET
image: V_IMAGE
imagePullPolicy: Always
name: db-setup
restartPolicy: Never
7 changes: 7 additions & 0 deletions use-cases/rag-pipeline/alloy-db-setup/src/.gcloudignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
__pycache__/
.venv/
venv/

.gcloudignore
.python-version
cloudbuild.yaml
5 changes: 3 additions & 2 deletions use-cases/rag-pipeline/alloy-db-setup/src/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,13 @@ COPY requirements.txt /workspace/
RUN pip install --no-cache-dir -r /workspace/requirements.txt

COPY alloydb_connect.py \
create_catalog.py \
database.py \
db_setup.py \
get_emb.py \
logging.conf \
table.py \
/workspace/

# Disable Python's stdout/stderr buffering so log lines appear immediately.
# Uses the key=value form; the space-separated ENV form is legacy syntax.
ENV PYTHONUNBUFFERED=1

CMD ["python", "db_setup.py"]
# Exec form: runs python directly rather than under `/bin/sh -c`, so the
# interpreter receives stop signals itself.
CMD ["python", "-m", "db_setup"]
Loading

0 comments on commit b7dd5b8

Please sign in to comment.