-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add basic structure and functionality for SQL parsing
- Update dependencies: add sqlparse
- Set new micro version: 0.0.3
- Add SQL parser
- Add table abstraction to hold necessary data
- Add exceptions: table not found in catalog
- Add some preliminary tests
- Loading branch information
Showing
10 changed files
with
291 additions
and
29 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
class TableNotInCatalogException(Exception):
    """Raised when the requested table is not registered in the data catalog."""
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
import sqlparse | ||
|
||
from duckberg.table import TableWithAlias | ||
from pyiceberg.expressions import * | ||
from pyiceberg.expressions import parser | ||
|
||
|
||
class DuckBergSQLParser:
    """
    Extracts the tables referenced in a SQL statement, together with the
    WHERE filter that applies to them, using sqlparse for tokenization.
    """

    def parse_first_query(self, sql: str) -> sqlparse.sql.Statement:
        """
        Parse ``sql`` and return only its first statement.

        Every single quote in ``sql`` is replaced with a double quote before
        the text is handed to ``sqlparse.parse``.
        """
        # NOTE(review): the blanket replacement also rewrites apostrophes that
        # sit inside string literals -- confirm this is what the downstream
        # expression parser expects.
        reformated_sql = sql.replace("'", '"')
        # NOTE(review): indexing [0] presumably raises IndexError when sqlparse
        # finds no statement (e.g. empty input) -- confirm callers never pass that.
        return sqlparse.parse(reformated_sql)[0]

    def unpack_identifiers(self, token: sqlparse.sql.IdentifierList) -> list[TableWithAlias]:
        """
        Build a ``TableWithAlias`` for each ``Identifier`` child of ``token``.

        Children of any other type are skipped.
        """
        return [
            TableWithAlias.from_identifier(child)
            for child in token.tokens
            if isinstance(child, sqlparse.sql.Identifier)
        ]

    def extract_tables(self, parsed_sql: sqlparse.sql.Statement) -> list[TableWithAlias]:
        """
        Collect the tables that follow the FROM keyword of ``parsed_sql``.

        Parenthesised groups found right after FROM are processed recursively.
        If a WHERE clause is present at this level, its parsed expression is
        attached to the tables found at this level (not to tables returned
        from the recursive calls); otherwise their comparisons are set to None.
        """
        tables = []
        # 0: FROM not seen yet, 1: FROM seen and a table token is expected,
        # >1: table token(s) already consumed.
        get_next = 0
        c_table: list[TableWithAlias] = []
        c_table_wc = None
        for token in parsed_sql.tokens:
            if get_next == 1 and token.ttype is not sqlparse.tokens.Whitespace:
                if isinstance(token, sqlparse.sql.Identifier):
                    c_table = [TableWithAlias.from_identifier(token)]
                    get_next += 1
                elif isinstance(token, sqlparse.sql.IdentifierList):
                    c_table = self.unpack_identifiers(token)
                    get_next += 1
                elif isinstance(token, sqlparse.sql.Parenthesis):
                    # NOTE(review): get_next is left at 1 here, so a later
                    # Identifier at this level is still taken as a table --
                    # confirm this is intended.
                    tables.extend(self.extract_tables(token))

            if token.ttype is sqlparse.tokens.Keyword and str(token.value).upper() == "FROM":
                get_next += 1

            if isinstance(token, sqlparse.sql.Where):
                c_table_wc = self.extract_where_conditions(token)

        # Attach the (possibly None) WHERE expression to this level's tables.
        for table in c_table:
            table.set_comparisons(c_table_wc)
        tables.extend(c_table)
        return tables

    def extract_where_conditions(self, where_statement: sqlparse.sql.Where):
        """
        Parse the condition part of a WHERE clause with the pyiceberg
        expression parser and return the result.
        """
        # Drop the first token (presumably the WHERE keyword itself) and
        # re-serialise the remaining tokens back to text.
        comparison = sqlparse.sql.TokenList(where_statement[1:])
        return parser.parse(str(comparison))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
from pyiceberg.catalog import Catalog | ||
from pyiceberg.io import FileIO | ||
from pyiceberg.table import Table | ||
from pyiceberg.table.metadata import TableMetadata | ||
from pyiceberg.typedef import Identifier | ||
from pyiceberg.expressions import BooleanExpression | ||
import sqlparse | ||
|
||
|
||
class DuckBergTable(Table):
    """
    Table subclass for storing precomputed data for faster processing of queries.
    """

    def __init__(
        self, identifier: Identifier, metadata: TableMetadata, metadata_location: str, io: FileIO, catalog: Catalog
    ) -> None:
        super().__init__(identifier, metadata, metadata_location, io, catalog)
        # Names of the partition columns; stays None until precomp_partitions() runs.
        self.partitions = None

    @classmethod
    def from_pyiceberg_table(cls, table: Table):
        """Create a DuckBergTable from the attributes of an existing ``Table``."""
        return cls(table.identifier, table.metadata, table.metadata_location, table.io, table.catalog)

    def precomp_partitions(self):
        """
        Compute the names of the partition columns and store them in ``self.partitions``.

        An unpartitioned table gets an empty list.
        """
        if self.spec().is_unpartitioned():
            self.partitions = []
            # Nothing more to compute; previously execution fell through and
            # rebuilt the list from the spec fields.
            return

        # Source column ids referenced by the partition spec.
        partition_cols_ids = {p["source-id"] for p in self.spec().model_dump()["fields"]}
        # Schema columns whose id is one of those source ids.
        self.partitions = [c["name"] for c in self.schema().model_dump()["fields"] if c["id"] in partition_cols_ids]

    def __repr__(self) -> str:
        # NOTE(review): `table` is not assigned anywhere in this class; it must
        # come from the parent Table (and be a str) for repr() to work -- confirm.
        return self.table
|
||
|
||
class TableWithAlias:
    """
    Holds a table name together with its alias and an optional filter expression.
    """

    def __init__(self, tname: str, talias: "str | None") -> None:
        """
        Store the table name and alias; ``comparisons`` starts out as None.

        ``talias`` may be None when the table has no alias.
        """
        self.table_name: str = tname
        self.table_alias: "str | None" = talias
        # Filter expression attached later through set_comparisons().
        self.comparisons: "BooleanExpression | None" = None

    @classmethod
    def from_identifier(cls, identf: "sqlparse.sql.Identifier"):
        """Build an instance from the real name and alias reported by ``identf``."""
        return cls(identf.get_real_name(), identf.get_alias())

    def set_comparisons(self, comparisons: "BooleanExpression | None"):
        """Store ``comparisons`` (may be None) and return ``self`` to allow chaining."""
        self.comparisons = comparisons
        return self

    def __str__(self) -> str:
        return f"{self.table_name} ({self.table_alias})"

    def __repr__(self) -> str:
        # Same text as str().
        return self.__str__()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
from duckberg.sqlparser import DuckBergSQLParser | ||
|
||
|
||
parser = DuckBergSQLParser()

# Each case: (SQL text, expected number of tables, expected str() of every table).
CASES = [
    # Single table.
    (
        """
SELECT * FROM this_is_awesome_table""",
        1,
        ["this_is_awesome_table (None)"],
    ),
    # Comma-separated list of tables.
    (
        """
SELECT * FROM this_is_awesome_table, second_awesome_table""",
        2,
        ["this_is_awesome_table (None)", "second_awesome_table (None)"],
    ),
    # Nested subqueries.
    (
        """
SELECT * FROM (SELECT * FROM (SELECT * FROM this_is_awesome_table))""",
        1,
        ["this_is_awesome_table (None)"],
    ),
    # Subquery next to a plain table.
    (
        """
SELECT * FROM (SELECT * FROM (SELECT * FROM this_is_awesome_table), second_awesome_table)""",
        2,
        ["this_is_awesome_table (None)", "second_awesome_table (None)"],
    ),
    # Aliased table inside nested subqueries.
    (
        """
SELECT * FROM (SELECT * FROM (SELECT * FROM this_is_awesome_table tiat, second_awesome_table))""",
        2,
        ["this_is_awesome_table (tiat)", "second_awesome_table (None)"],
    ),
]

for sql, expected_count, expected_tables in CASES:
    parsed = parser.parse_first_query(sql=sql)
    found = parser.extract_tables(parsed)
    assert len(found) == expected_count
    assert [str(table) for table in found] == expected_tables
Oops, something went wrong.