Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(FIR-34986): can use COPY to create table #131

Merged
merged 12 commits into from
Aug 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .changes/unreleased/Added-20240814-172459.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
kind: Added
body: Added a way to use COPY FROM command as an alternative to EXTERNAL TABLE.
time: 2024-08-14T17:24:59.573298+01:00
3 changes: 3 additions & 0 deletions .changes/unreleased/Fixed-20240814-172534.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
kind: Fixed
body: Fixed seed full refresh resolution.
time: 2024-08-14T17:25:34.986273+01:00
2 changes: 1 addition & 1 deletion .github/workflows/integration-tests-v2.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install ".[dev]" --no-cache-dir
python -m pip install -e ".[dev]" --no-cache-dir

- name: Setup database and engine
id: setup
Expand Down
5 changes: 3 additions & 2 deletions .github/workflows/jaffle-shop-v1.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,15 @@ jobs:
path: jaffle-shop

- name: Set up Python 3.8
uses: actions/setup-python@v2
uses: actions/setup-python@v5
with:
python-version: 3.8

- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install "dbt-firebolt/.[dev]"
cd dbt-firebolt
python -m pip install dbt-core -e .

- name: Setup database and engine
id: setup
Expand Down
8 changes: 4 additions & 4 deletions .github/workflows/jaffle-shop-v2.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,15 @@ jobs:
path: jaffle-shop

- name: Set up Python 3.8
uses: actions/setup-python@v2
uses: actions/setup-python@v5
with:
python-version: 3.8

- name: Install dependencies
run: |
python -m pip install --upgrade pip
python -m pip install "dbt-firebolt/.[dev]"
cd dbt-firebolt
python -m pip install dbt-core -e .

- name: Setup database and engine
id: setup
Expand All @@ -41,7 +42,6 @@ jobs:
account: ${{ vars.FIREBOLT_ACCOUNT }}
api-endpoint: "api.staging.firebolt.io"


- name: Run Jaffle Shop test workflow
env:
USER_NAME: ${{ secrets.FIREBOLT_CLIENT_ID_STG_NEW_IDN }}
Expand All @@ -56,5 +56,5 @@ jobs:
AWS_ACCESS_ROLE_ARN: ${{ secrets.AWS_ACCESS_ROLE_ARN }}
DBT_PROFILES_DIR: "../dbt-firebolt/.github/workflows/jaffle_shop"
working-directory: jaffle-shop
run:
run: |
../dbt-firebolt/.github/workflows/jaffle_shop/run_test_workflow.sh
3 changes: 3 additions & 0 deletions .github/workflows/jaffle_shop/run_test_workflow.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@ if [[ -n "$AWS_ACCESS_ROLE_ARN" ]]; then
# Can't test this on FB 1.0
cp ../dbt-firebolt/.github/workflows/jaffle_shop/sources_external_tables_iam.yml models/staging/sources_external_tables.yml
dbt run-operation stage_external_sources --vars "ext_full_refresh: true"
# Test COPY INTO
cp ../dbt-firebolt/.github/workflows/jaffle_shop/sources_external_tables_copy.yml models/staging/sources_external_tables.yml
dbt run-operation stage_external_sources --vars "ext_full_refresh: true"
fi
dbt seed
dbt seed --full-refresh
Expand Down
29 changes: 29 additions & 0 deletions .github/workflows/jaffle_shop/sources_external_tables_copy.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# dbt source definition used by the Jaffle Shop test workflow to load
# `raw_customers` with the `copy` strategy (COPY INTO) instead of an
# external table. Values are read by the adapter's COPY macro from
# `external.url`, `external.credentials` and `external.options`.
version: 2

sources:
  - name: s3
    tables:
      - name: raw_customers
        external:
          # Selects the COPY-based macro; any other value uses the
          # external-table macro.
          strategy: copy
          url: "{{ env_var('SECURE_BUCKET_PATH') }}"
          credentials:
            aws_key_id: "{{ env_var('AWS_ACCESS_KEY_ID') }}"
            aws_secret_key: "{{ env_var('AWS_ACCESS_SECRET_KEY') }}"
          options:
            # Quoted: a plain scalar starting with `*` would be parsed as
            # a YAML alias.
            object_pattern: '*raw_customers.csv'
            type: CSV
            auto_create: true
            allow_column_mismatch: false
            max_errors_per_file: 10
            csv_options:
              header: true
              delimiter: ','
              quote: DOUBLE_QUOTE
              # Single-quoted, so the backslash is a literal character.
              escape: '\'
              # Single-quoted: two literal backslashes followed by N.
              null_string: '\\N'
              empty_field_as_null: true
              skip_blank_lines: true
              date_format: 'YYYY-MM-DD'
              timestamp_format: 'YYYY-MM-DD HH24:MI:SS'

6 changes: 2 additions & 4 deletions dbt/include/firebolt/macros/adapters.sql
Original file line number Diff line number Diff line change
Expand Up @@ -171,12 +171,10 @@
SELECT
table_catalog AS "database",
table_name AS "name",
table_schema AS "schema",
'{{ relation.schema }}' AS "schema",
CASE
WHEN table_type = 'BASE TABLE' THEN 'table'
WHEN table_type = 'DIMENSION' THEN 'table'
WHEN table_type = 'FACT' THEN 'table'
WHEN table_type = 'VIEW' THEN 'view'
ELSE 'table'
END AS "type"
FROM
information_schema.tables
Expand Down
4 changes: 1 addition & 3 deletions dbt/include/firebolt/macros/catalog.sql
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,8 @@ the columns (for instance, `is_nullable` is missing) but more could be added lat
cols.column_name as column_name,
cols.data_type AS column_type,
CASE
WHEN table_type = 'BASE TABLE' THEN 'TABLE'
WHEN table_type = 'DIMENSION' THEN 'TABLE'
WHEN table_type = 'FACT' THEN 'TABLE'
WHEN table_type = 'VIEW' THEN 'VIEW'
ELSE 'TABLE'
END AS relation_type,
cols.ordinal_position as column_index
FROM
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,12 @@
{# Entry point for creating an external source. Dispatches on
   `source_node.external.strategy`: the exact value 'copy' delegates to
   firebolt__create_with_copy_from (renders COPY INTO); every other value
   delegates to firebolt__create_with_external_table. #}
{% macro firebolt__create_external_table(source_node) %}
{% if source_node.external.strategy != 'copy' %}
{{ firebolt__create_with_external_table(source_node) }}
{% else %}
{{ firebolt__create_with_copy_from(source_node) }}
{% endif %}
{% endmacro %}

{% macro firebolt__create_with_external_table(source_node) %}
{%- set external = source_node.external -%}
{%- if 'partitions' in external -%}
{%- set columns = adapter.make_field_partition_pairs(source_node.columns.values(),
Expand All @@ -7,7 +15,6 @@
{%- set columns = adapter.make_field_partition_pairs(source_node.columns.values(),
[]) -%}
{%- endif -%}
-- {%- set partitions = external.partitions -%}
{%- set credentials = external.credentials -%}
{# Leaving out "IF NOT EXISTS" because this should only be called by
if no DROP IF is necessary. #}
Expand Down Expand Up @@ -39,3 +46,92 @@
{%- if external.compression -%} COMPRESSION = {{external.compression}} {%- endif %}
TYPE = {{ external.type }}
{% endmacro %}

{% macro firebolt__create_with_copy_from(source_node) %}
    {# Renders a `COPY INTO <source relation> [(columns)] FROM '<url>'`
       statement for a source node. COPY FROM is only available in
       Firebolt 2.0.

       Reads from `source_node.external`:
         url          - location to copy from, rendered single-quoted.
         credentials  - optional; aws_key_id / aws_secret_key.
         options      - optional; object_pattern, type, auto_create,
                        allow_column_mismatch, error_file,
                        error_file_credentials, max_errors_per_file and a
                        nested csv_options mapping.

       CREDENTIALS is part of the WITH option list, so WITH is emitted
       whenever either options or credentials are configured, not only
       when options is set.

       Boolean options are guarded with `is defined and ... is not none`:
       a key that is simply omitted is undefined (not none) and would
       otherwise render an empty value such as `AUTO_CREATE = `.

       NOTE(review): the column mapping emits '$' ~ loop.index0, i.e. a
       zero-based position, for every column that has a
       source_column_name. Confirm this matches Firebolt's source column
       index convention.
       NOTE(review): a falsy max_errors_per_file (e.g. 0) is not rendered;
       presumably that equals the server default - confirm. #}
    {%- set external = source_node.external -%}
    {%- set credentials = external.credentials -%}
    {%- set options = external.options -%}
    {%- set csv_options = options.csv_options -%}
    {%- set error_file_credentials = options.error_file_credentials -%}

    {# There are no partitions, but this formats the columns correctly. #}
    {%- if 'partitions' in external -%}
        {%- set columns = adapter.make_field_partition_pairs(source_node.columns.values(),
                                                             external.partitions) -%}
    {%- else -%}
        {%- set columns = adapter.make_field_partition_pairs(source_node.columns.values(),
                                                             []) -%}
    {%- endif -%}
    COPY INTO {{source(source_node.source_name, source_node.name)}}
    {%- if columns and columns | length > 0 %}
    (
        {%- for column in columns -%}
            {{ column.name }}
            {%- if column.default is not none %} DEFAULT {{ column.default }}{% endif %}
            {%- if column.source_column_name is not none %} {{ '$' ~ loop.index0 }}{% endif %}
            {{- ',' if not loop.last }}
        {%- endfor -%}
    )
    {%- endif %}
    FROM '{{external.url}}'
    {%- if options or credentials %}
    WITH
    {%- endif %}
    {%- if options %}
        {%- if options.object_pattern %}
        PATTERN = '{{options.object_pattern}}'
        {%- endif %}
        {%- if options.type %}
        TYPE = {{ options.type }}
        {%- endif %}
        {%- if options.auto_create is defined and options.auto_create is not none %}
        AUTO_CREATE = {{ options.auto_create | upper }}
        {%- endif %}
        {%- if options.allow_column_mismatch is defined and options.allow_column_mismatch is not none %}
        ALLOW_COLUMN_MISMATCH = {{ options.allow_column_mismatch | upper }}
        {%- endif %}
        {%- if options.error_file %}
        ERROR_FILE = '{{ options.error_file }}'
        {%- endif %}
        {%- if error_file_credentials %}
        ERROR_FILE_CREDENTIALS = (AWS_KEY_ID = '{{ error_file_credentials.aws_key_id }}' AWS_SECRET_KEY = '{{ error_file_credentials.aws_secret_key }}')
        {%- endif %}
        {%- if options.max_errors_per_file %}
        MAX_ERRORS_PER_FILE = {{ options.max_errors_per_file }}
        {%- endif %}
        {%- if csv_options %}
            {%- if csv_options.header is defined and csv_options.header is not none %}
            HEADER = {{ csv_options.header | upper }}
            {%- endif %}
            {%- if csv_options.delimiter %}
            DELIMITER = '{{ csv_options.delimiter }}'
            {%- endif %}
            {%- if csv_options.newline %}
            NEWLINE = '{{ csv_options.newline }}'
            {%- endif %}
            {%- if csv_options.quote %}
            QUOTE = {{ csv_options.quote }}
            {%- endif %}
            {%- if csv_options.escape %}
            ESCAPE = '{{ csv_options.escape }}'
            {%- endif %}
            {%- if csv_options.null_string %}
            NULL_STRING = '{{ csv_options.null_string }}'
            {%- endif %}
            {%- if csv_options.empty_field_as_null is defined and csv_options.empty_field_as_null is not none %}
            EMPTY_FIELD_AS_NULL = {{ csv_options.empty_field_as_null | upper }}
            {%- endif %}
            {%- if csv_options.skip_blank_lines is defined and csv_options.skip_blank_lines is not none %}
            SKIP_BLANK_LINES = {{ csv_options.skip_blank_lines | upper }}
            {%- endif %}
            {%- if csv_options.date_format %}
            DATE_FORMAT = '{{ csv_options.date_format }}'
            {%- endif %}
            {%- if csv_options.timestamp_format %}
            TIMESTAMP_FORMAT = '{{ csv_options.timestamp_format }}'
            {%- endif %}
        {%- endif %}
    {%- endif %}
    {%- if credentials %}
    CREDENTIALS = (AWS_KEY_ID = '{{credentials.aws_key_id}}' AWS_SECRET_KEY = '{{credentials.aws_secret_key}}')
    {%- endif %}
{% endmacro %}
Loading