Merge pull request #19 from loicalleyne/main
Add support for Iceberg storage in GCS
rakeshJn authored Sep 24, 2024
2 parents b2be32c + 945a213 commit e630d60
Showing 3 changed files with 46 additions and 14 deletions.
45 changes: 35 additions & 10 deletions lake_viewer.py
@@ -10,20 +10,26 @@
 from streamlit.components.v1 import html
 import json
 import dotenv
+import time
+import google.auth
+from google.auth.transport.requests import Request
 dotenv.load_dotenv(dotenv.find_dotenv(usecwd=True))  # Use the current working directory to locate the .env file


 class LakeView():

-    def __init__(self):
-        self.catalog = catalog.load_catalog("default",
-            **{
-                'uri': os.environ.get("PYICEBERG_CATALOG__DEFAULT__URI"),
-                'token': os.environ.get("PYICEBERG_CATALOG__DEFAULT__TOKEN"),
-                's3.endpoint': os.environ.get("AWS_ENDPOINT"),
-                'py-io-impl': 'pyiceberg.io.fsspec.FsspecFileIO',
-                'warehouse': os.environ.get("PYICEBERG_CATALOG__DEFAULT__WAREHOUSE"),
-            })
+    def __init__(self):
+        service_account_file = os.environ.get("GCP_KEYFILE")
+        if service_account_file:  # covers both an unset GCP_KEYFILE (None) and an empty string
+            scopes = ["https://www.googleapis.com/auth/cloud-platform"]
+            access_token = get_gcp_access_token(service_account_file, scopes)
+            self.catalog = catalog.load_catalog("default",
+                **{
+                    "gcs.oauth2.token-expires-at": time.mktime(access_token.expiry.timetuple()) * 1000,
+                    "gcs.oauth2.token": access_token.token,
+                })
+        else:
+            self.catalog = catalog.load_catalog("default")
         self.namespace_options = []

@st.dialog("Go to Table")
@@ -301,7 +307,26 @@ def get_partitions(self, ps, t):
             c3.append(str(f.transform))
         df = pd.DataFrame({"Field": c1, "Name": c2, "Transform": c3})
         st.dataframe(df, hide_index = True, use_container_width=True)
+
+
+def get_gcp_access_token(service_account_file, scopes):
+    """
+    Retrieves refreshed GCP credentials from a service account key file.
+
+    Args:
+        service_account_file: Path to the service account JSON key file.
+        scopes: List of OAuth scopes required for your application.
+
+    Returns:
+        The refreshed google.auth credentials object; its .token and .expiry
+        attributes hold the access token and its expiry time.
+    """
+    credentials, project_id = google.auth.load_credentials_from_file(
+        service_account_file, scopes=scopes)
+
+    request = Request()
+    credentials.refresh(request)  # Obtains a fresh access token
+    return credentials
+
 def local_css(file_name):
     with open(file_name) as f:
         st.markdown(f'<style>{f.read()}</style>', unsafe_allow_html=True)
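A note on the new expiry computation: credentials.expiry from google.auth is a naive UTC datetime, while time.mktime interprets a time tuple in the machine's local timezone, so the gcs.oauth2.token-expires-at value can drift by the local UTC offset. A minimal sketch of a timezone-safe conversion, should that matter in your deployment (the helper name is illustrative, not part of this patch):

import calendar
import datetime

def expires_at_millis_utc(expiry: datetime.datetime) -> int:
    # calendar.timegm treats the time tuple as UTC, unlike time.mktime,
    # which assumes local time.
    return calendar.timegm(expiry.timetuple()) * 1000

# Example: a token expiring at 2024-09-24 12:00:00 UTC
print(expires_at_millis_utc(datetime.datetime(2024, 9, 24, 12, 0)))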
13 changes: 9 additions & 4 deletions my.env
@@ -1,7 +1,12 @@
+GCP_KEYFILE=
 PYICEBERG_CATALOG__DEFAULT__URI=
 PYICEBERG_CATALOG__DEFAULT__TOKEN=
 PYICEBERG_CATALOG__DEFAULT__WAREHOUSE=
-AWS_ENDPOINT=
-AWS_ACCESS_KEY_ID=
-AWS_SECRET_ACCESS_KEY=
-#Provide any other Object Store and pyiceberg config your Lakehouse needs.
+PYICEBERG_CATALOG__GCS__PROJECT_ID=
+PYICEBERG_CATALOG__GCS__DEFAULT_BUCKET_LOCATION=
+PYICEBERG_CATALOG__S3__ENDPOINT=
+PYICEBERG_CATALOG__S3__ACCESS_KEY_ID=
+PYICEBERG_CATALOG__S3__SECRET_ACCESS_KEY=
+# If using Azure, change the line below to PYICEBERG_CATALOG__DEFAULT__PY_IO_IMPL=pyiceberg.io.fsspec.FsspecFileIO
+PYICEBERG_CATALOG__DEFAULT__PY_IO_IMPL=pyiceberg.io.pyarrow.PyArrowFileIO
+# Provide any other object store and pyiceberg config your lakehouse needs.
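For context: once my.env is filled in and loaded, pyiceberg can resolve the "default" catalog purely from these PYICEBERG_CATALOG__DEFAULT__* environment variables, which is why the non-GCS branch above calls load_catalog("default") with no extra properties. A minimal sketch, assuming the values point at a reachable catalog:

import dotenv
from pyiceberg import catalog

dotenv.load_dotenv("my.env")  # populate os.environ, as lake_viewer.py does

# pyiceberg picks up PYICEBERG_CATALOG__DEFAULT__URI, __TOKEN, etc. from the
# environment, so no explicit properties are needed here.
cat = catalog.load_catalog("default")
print(cat.list_namespaces())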
2 changes: 2 additions & 0 deletions requirements.txt
@@ -1,4 +1,6 @@
 pyiceberg[s3fs]>=0.7
+pyiceberg[gcsfs]>=0.7
+google-auth>=2.34.0
 pandas>=2.0.0
 streamlit>=1.37.0
 python-dotenv
