diff --git a/docs/source/conf.py b/docs/source/conf.py index a3996d49..3ad169dc 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -13,7 +13,7 @@ project = 'semantic-link-labs' copyright = '2024, Microsoft and community' author = 'Microsoft and community' -release = '0.8.3' +release = '0.8.11' # -- General configuration --------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration @@ -43,4 +43,4 @@ # List of packages we don't want to install in the environment autodoc_mock_imports = ['delta', 'synapse', 'jwt', 'semantic-link-sempy', 'pyspark', 'powerbiclient'] -napoleon_numpy_docstring = True \ No newline at end of file +napoleon_numpy_docstring = True diff --git a/environment.yml b/environment.yml index 01c321d6..fc8058a1 100644 --- a/environment.yml +++ b/environment.yml @@ -6,7 +6,7 @@ dependencies: - pytest-cov - pytest-mock - pip: - - semantic-link-sempy>=0.8.0 + - semantic-link-sempy>=0.8.3 - azure-identity==1.7.1 - azure-storage-blob>=12.9.0 - pandas-stubs diff --git a/notebooks/Model Optimization.ipynb b/notebooks/Model Optimization.ipynb index 45ab4d00..2908141c 100644 --- a/notebooks/Model Optimization.ipynb +++ b/notebooks/Model Optimization.ipynb @@ -1 +1,489 @@ -{"cells":[{"cell_type":"markdown","id":"5c27dfd1-4fe0-4a97-92e6-ddf78889aa93","metadata":{"nteract":{"transient":{"deleting":false}}},"source":["### Install the latest .whl package\n","\n","Check [here](https://pypi.org/project/semantic-link-labs/) to see the latest version."]},{"cell_type":"code","execution_count":null,"id":"d5cae9db-cef9-48a8-a351-9c5fcc99645c","metadata":{"jupyter":{"outputs_hidden":true,"source_hidden":false},"nteract":{"transient":{"deleting":false}}},"outputs":[],"source":["%pip install semantic-link-labs"]},{"cell_type":"markdown","id":"cd8de5a0","metadata":{},"source":["### Import the 
library"]},{"cell_type":"code","execution_count":null,"id":"5cc6eedf","metadata":{},"outputs":[],"source":["import sempy_labs as labs\n","from sempy_labs import lakehouse as lake\n","from sempy_labs import directlake\n","import sempy_labs.report as rep\n","\n","dataset_name = ''\n","workspace_name = None"]},{"cell_type":"markdown","id":"5a3fe6e8-b8aa-4447-812b-7931831e07fe","metadata":{"nteract":{"transient":{"deleting":false}}},"source":["### Vertipaq Analyzer"]},{"cell_type":"code","execution_count":null,"id":"cde43b47-4ecc-46ae-9125-9674819c7eab","metadata":{"jupyter":{"outputs_hidden":false,"source_hidden":false},"nteract":{"transient":{"deleting":false}}},"outputs":[],"source":["labs.vertipaq_analyzer(dataset = dataset_name, workspace = workspace_name)"]},{"cell_type":"markdown","id":"419a348f","metadata":{},"source":["Export the Vertipaq Analyzer results to a .zip file in your lakehouse"]},{"cell_type":"code","execution_count":null,"id":"8aa239b3","metadata":{},"outputs":[],"source":["labs.vertipaq_analyzer(dataset = dataset_name, workspace = workspace_name, export = 'zip')"]},{"cell_type":"markdown","id":"2dce0f4f","metadata":{},"source":["Export the Vertipaq Analyzer results to append to delta tables in your lakehouse."]},{"cell_type":"code","execution_count":null,"id":"aef93fc8","metadata":{},"outputs":[],"source":["labs.vertipaq_analyzer(dataset = dataset_name, workspace = workspace_name, export = 'table')"]},{"cell_type":"markdown","id":"1c62a802","metadata":{},"source":["Visualize the contents of an exported Vertipaq Analzyer .zip file."]},{"cell_type":"code","execution_count":null,"id":"9e349954","metadata":{},"outputs":[],"source":["labs.import_vertipaq_analyzer(folder_path = '', file_name = '')"]},{"cell_type":"markdown","id":"456ce0ff","metadata":{},"source":["### Best Practice Analzyer\n","\n","This runs the [standard rules](https://github.com/microsoft/Analysis-Services/tree/master/BestPracticeRules) for semantic models posted on Microsoft's 
GitHub."]},{"cell_type":"code","execution_count":null,"id":"0a3616b5-566e-414e-a225-fb850d6418dc","metadata":{"jupyter":{"outputs_hidden":false,"source_hidden":false},"nteract":{"transient":{"deleting":false}}},"outputs":[],"source":["labs.run_model_bpa(dataset = dataset_name, workspace = workspace_name)"]},{"cell_type":"markdown","id":"6fb32a58","metadata":{},"source":["This runs the Best Practice Analyzer and exports the results to the 'modelbparesults' delta table in your Fabric lakehouse."]},{"cell_type":"code","execution_count":null,"id":"677851c3","metadata":{},"outputs":[],"source":["labs.run_model_bpa(dataset = dataset_name, workspace = workspace_name, export = True)"]},{"cell_type":"markdown","id":"64968a31","metadata":{},"source":["This runs the Best Practice Analyzer with the rules translated into Italian."]},{"cell_type":"code","execution_count":null,"id":"3c7d89e2","metadata":{},"outputs":[],"source":["labs.run_model_bpa(dataset = dataset_name, workspace = workspace_name, language = 'it-IT')"]},{"cell_type":"markdown","id":"255c30bb","metadata":{},"source":["
\n","Note: For analyzing model BPA results at scale, see the Best Practice Analyzer Report notebook (link below).\n","
\n","\n","[Best Practice Analyzer Notebook](https://github.com/microsoft/semantic-link-labs/blob/main/notebooks/Best%20Practice%20Analyzer%20Report.ipynb)"]},{"cell_type":"markdown","id":"bab18a61","metadata":{},"source":["### Run BPA using your own best practice rules"]},{"cell_type":"code","execution_count":null,"id":"59b89387","metadata":{},"outputs":[],"source":["import sempy\n","sempy.fabric._client._utils._init_analysis_services()\n","import Microsoft.AnalysisServices.Tabular as TOM\n","import pandas as pd\n","\n","dataset_name = ''\n","workspace_name = ''\n","\n","rules = pd.DataFrame(\n"," [\n"," (\n"," \"Performance\",\n"," \"Table\",\n"," \"Warning\",\n"," \"Rule name...\",\n"," lambda obj, tom: tom.is_calculated_table(table_name=obj.Name),\n"," 'Rule description...',\n"," '',\n"," ),\n"," (\n"," \"Performance\",\n"," \"Column\",\n"," \"Warning\",\n"," \"Do not use floating point data types\",\n"," lambda obj, tom: obj.DataType == TOM.DataType.Double,\n"," 'The \"Double\" floating point data type should be avoided, as it can result in unpredictable roundoff errors and decreased performance in certain scenarios. 
Use \"Int64\" or \"Decimal\" where appropriate (but note that \"Decimal\" is limited to 4 digits after the decimal sign).',\n"," )\n"," ],\n"," columns=[\n"," \"Category\",\n"," \"Scope\",\n"," \"Severity\",\n"," \"Rule Name\",\n"," \"Expression\",\n"," \"Description\",\n"," \"URL\",\n"," ],\n",")\n","\n","labs.run_model_bpa(dataset=dataset_name, workspace=workspace_name, rules=rules)"]},{"cell_type":"markdown","id":"8126a1a1","metadata":{},"source":["### Direct Lake\n","\n","Check if any lakehouse tables will hit the [Direct Lake guardrails](https://learn.microsoft.com/power-bi/enterprise/directlake-overview#fallback)."]},{"cell_type":"code","execution_count":null,"id":"e7397b15","metadata":{},"outputs":[],"source":["lake.get_lakehouse_tables(lakehouse = None, workspace = None, extended = True, count_rows = False)"]},{"cell_type":"code","execution_count":null,"id":"b30074cf","metadata":{},"outputs":[],"source":["lake.get_lakehouse_tables(lakehouse = None, workspace = None, extended = True, count_rows = False, export = True)"]},{"cell_type":"markdown","id":"99b84f2b","metadata":{},"source":["Check if any tables in a Direct Lake semantic model will fall back to DirectQuery."]},{"cell_type":"code","execution_count":null,"id":"f837be58","metadata":{},"outputs":[],"source":["directlake.check_fallback_reason(dataset = dataset_name, workspace = workspace_name)"]},{"cell_type":"markdown","id":"8f6df93e","metadata":{},"source":["### [OPTIMIZE](https://docs.delta.io/latest/optimizations-oss.html) your lakehouse delta tables."]},{"cell_type":"code","execution_count":null,"id":"e0262c9e","metadata":{},"outputs":[],"source":["lake.optimize_lakehouse_tables(tables = ['', ''], lakehouse = None, workspace = None)"]},{"cell_type":"markdown","id":"0091d6a0","metadata":{},"source":["Refresh/reframe your Direct Lake semantic model and restore the columns which were in memory prior to the 
refresh."]},{"cell_type":"code","execution_count":null,"id":"77eef082","metadata":{},"outputs":[],"source":["directlake.warm_direct_lake_cache_isresident(dataset = dataset_name, workspace = workspace_name)"]},{"cell_type":"markdown","id":"dae1a210","metadata":{},"source":["Ensure a warm cache for your users by putting the columns of a Direct Lake semantic model into memory based on the contents of a [perspective](https://learn.microsoft.com/analysis-services/tabular-models/perspectives-ssas-tabular?view=asallproducts-allversions).\n","\n","Perspectives can be created either in [Tabular Editor 3](https://github.com/TabularEditor/TabularEditor3/releases/latest) or in [Tabular Editor 2](https://github.com/TabularEditor/TabularEditor/releases/latest) using the [Perspective Editor](https://www.elegantbi.com/post/perspectiveeditor)."]},{"cell_type":"code","execution_count":null,"id":"43297001","metadata":{},"outputs":[],"source":["directlake.warm_direct_lake_cache_perspective(dataset = dataset_name, workspace = workspace_name, perspective = '', add_dependencies = True)"]}],"metadata":{"kernel_info":{"name":"synapse_pyspark"},"kernelspec":{"display_name":"Synapse PySpark","language":"Python","name":"synapse_pyspark"},"language_info":{"name":"python"},"microsoft":{"language":"python"},"nteract":{"version":"nteract-front-end@1.0.0"},"spark_compute":{"compute_id":"/trident/default"},"synapse_widget":{"state":{},"version":"0.1"},"widgets":{}},"nbformat":4,"nbformat_minor":5} +{ + "cells": [ + { + "cell_type": "markdown", + "id": "5c27dfd1-4fe0-4a97-92e6-ddf78889aa93", + "metadata": { + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "### Install the latest .whl package\n", + "\n", + "Check [here](https://pypi.org/project/semantic-link-labs/) to see the latest version." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d5cae9db-cef9-48a8-a351-9c5fcc99645c", + "metadata": { + "jupyter": { + "outputs_hidden": true, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "%pip install semantic-link-labs" + ] + }, + { + "cell_type": "markdown", + "id": "cd8de5a0", + "metadata": {}, + "source": [ + "### Import the library" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5cc6eedf", + "metadata": {}, + "outputs": [], + "source": [ + "import sempy_labs as labs\n", + "from sempy_labs import lakehouse as lake\n", + "from sempy_labs import directlake\n", + "import sempy_labs.report as rep\n", + "\n", + "dataset_name = ''\n", + "workspace_name = None" + ] + }, + { + "cell_type": "markdown", + "id": "5a3fe6e8-b8aa-4447-812b-7931831e07fe", + "metadata": { + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "### Vertipaq Analyzer" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cde43b47-4ecc-46ae-9125-9674819c7eab", + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "labs.vertipaq_analyzer(dataset = dataset_name, workspace = workspace_name)" + ] + }, + { + "cell_type": "markdown", + "id": "419a348f", + "metadata": {}, + "source": [ + "Export the Vertipaq Analyzer results to a .zip file in your lakehouse" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8aa239b3", + "metadata": {}, + "outputs": [], + "source": [ + "labs.vertipaq_analyzer(dataset = dataset_name, workspace = workspace_name, export = 'zip')" + ] + }, + { + "cell_type": "markdown", + "id": "2dce0f4f", + "metadata": {}, + "source": [ + "Export the Vertipaq Analyzer results to append to delta tables in your lakehouse." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aef93fc8", + "metadata": {}, + "outputs": [], + "source": [ + "labs.vertipaq_analyzer(dataset = dataset_name, workspace = workspace_name, export = 'table')" + ] + }, + { + "cell_type": "markdown", + "id": "1c62a802", + "metadata": {}, + "source": [ + "Visualize the contents of an exported Vertipaq Analyzer .zip file." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9e349954", + "metadata": {}, + "outputs": [], + "source": [ + "labs.import_vertipaq_analyzer(folder_path = '', file_name = '')" + ] + }, + { + "cell_type": "markdown", + "id": "456ce0ff", + "metadata": {}, + "source": [ + "### Best Practice Analyzer\n", + "\n", + "This runs the [standard rules](https://github.com/microsoft/Analysis-Services/tree/master/BestPracticeRules) for semantic models posted on Microsoft's GitHub." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0a3616b5-566e-414e-a225-fb850d6418dc", + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "labs.run_model_bpa(dataset = dataset_name, workspace = workspace_name)" + ] + }, + { + "cell_type": "markdown", + "id": "6fb32a58", + "metadata": {}, + "source": [ + "This runs the Best Practice Analyzer and exports the results to the 'modelbparesults' delta table in your Fabric lakehouse." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "677851c3", + "metadata": {}, + "outputs": [], + "source": [ + "labs.run_model_bpa(dataset = dataset_name, workspace = workspace_name, export = True)" + ] + }, + { + "cell_type": "markdown", + "id": "64968a31", + "metadata": {}, + "source": [ + "This runs the Best Practice Analyzer with the rules translated into Italian."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3c7d89e2", + "metadata": {}, + "outputs": [], + "source": [ + "labs.run_model_bpa(dataset = dataset_name, workspace = workspace_name, language = 'it-IT')" + ] + }, + { + "cell_type": "markdown", + "id": "255c30bb", + "metadata": {}, + "source": [ + "
\n", + "Note: For analyzing model BPA results at scale, see the Best Practice Analyzer Report notebook (link below).\n", + "
\n", + "\n", + "[Best Practice Analyzer Notebook](https://github.com/microsoft/semantic-link-labs/blob/main/notebooks/Best%20Practice%20Analyzer%20Report.ipynb)" + ] + }, + { + "cell_type": "markdown", + "id": "bab18a61", + "metadata": {}, + "source": [ + "### Run BPA using your own best practice rules" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "59b89387", + "metadata": {}, + "outputs": [], + "source": [ + "import sempy\n", + "sempy.fabric._client._utils._init_analysis_services()\n", + "import Microsoft.AnalysisServices.Tabular as TOM\n", + "import pandas as pd\n", + "\n", + "dataset_name = ''\n", + "workspace_name = ''\n", + "\n", + "rules = pd.DataFrame(\n", + " [\n", + " (\n", + " \"Performance\",\n", + " \"Table\",\n", + " \"Warning\",\n", + " \"Rule name...\",\n", + " lambda obj, tom: tom.is_calculated_table(table_name=obj.Name),\n", + " 'Rule description...',\n", + " '',\n", + " ),\n", + " (\n", + " \"Performance\",\n", + " \"Column\",\n", + " \"Warning\",\n", + " \"Do not use floating point data types\",\n", + " lambda obj, tom: obj.DataType == TOM.DataType.Double,\n", + " 'The \"Double\" floating point data type should be avoided, as it can result in unpredictable roundoff errors and decreased performance in certain scenarios. Use \"Int64\" or \"Decimal\" where appropriate (but note that \"Decimal\" is limited to 4 digits after the decimal sign).',\n", + " )\n", + " ],\n", + " columns=[\n", + " \"Category\",\n", + " \"Scope\",\n", + " \"Severity\",\n", + " \"Rule Name\",\n", + " \"Expression\",\n", + " \"Description\",\n", + " \"URL\",\n", + " ],\n", + ")\n", + "\n", + "labs.run_model_bpa(dataset=dataset_name, workspace=workspace_name, rules=rules)" + ] + }, + { + "cell_type": "markdown", + "id": "d5933de1", + "metadata": {}, + "source": [ + "### Tracing\n", + "\n", + "Trace a set of DAX queries and capture the result of the DAX queries in dataframes." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1d04c58f", + "metadata": {}, + "outputs": [], + "source": [ + "trace_result, query_result = labs.trace_dax(\n", + " dataset = dataset_name,\n", + " dax_queries = {\n", + " \"Query1\": \"\"\" EVALUATE SUMMARIZECOLUMNS('DimProduct'[Color], \"1\", [Sales Amount]) \"\"\",\n", + " \"Query2\": \"\"\" EVALUATE SUMMARIZECOLUMNS(\"1\", [Sales Amount]) \"\"\",\n", + " },\n", + " workspace = workspace_name,\n", + " clear_cache_before_run=False,\n", + " clear_cache_before_each_query=False,\n", + " rest_time=2,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "f2586a1b", + "metadata": {}, + "source": [ + "Show the trace results of the DAX queries in a dataframe." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e49d9043", + "metadata": {}, + "outputs": [], + "source": [ + "trace_result\n", + "#trace_result.sort_values(by='Duration', ascending=False) # Order the trace results by the highest duration queries." + ] + }, + { + "cell_type": "markdown", + "id": "0d42a984", + "metadata": {}, + "source": [ + "Show the result of each DAX query in its own dataframe." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1df254c6", + "metadata": {}, + "outputs": [], + "source": [ + "for query, query_value in query_result.items():\n", + " print(query)\n", + " display(query_value)" + ] + }, + { + "cell_type": "markdown", + "id": "8126a1a1", + "metadata": {}, + "source": [ + "### Direct Lake\n", + "\n", + "Check if any lakehouse tables will hit the [Direct Lake guardrails](https://learn.microsoft.com/power-bi/enterprise/directlake-overview#fallback)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e7397b15", + "metadata": {}, + "outputs": [], + "source": [ + "lake.get_lakehouse_tables(lakehouse = None, workspace = None, extended = True, count_rows = False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b30074cf", + "metadata": {}, + "outputs": [], + "source": [ + "lake.get_lakehouse_tables(lakehouse = None, workspace = None, extended = True, count_rows = False, export = True)" + ] + }, + { + "cell_type": "markdown", + "id": "99b84f2b", + "metadata": {}, + "source": [ + "Check if any tables in a Direct Lake semantic model will fall back to DirectQuery." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f837be58", + "metadata": {}, + "outputs": [], + "source": [ + "directlake.check_fallback_reason(dataset = dataset_name, workspace = workspace_name)" + ] + }, + { + "cell_type": "markdown", + "id": "8f6df93e", + "metadata": {}, + "source": [ + "### [OPTIMIZE](https://docs.delta.io/latest/optimizations-oss.html) your lakehouse delta tables." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e0262c9e", + "metadata": {}, + "outputs": [], + "source": [ + "lake.optimize_lakehouse_tables(tables = ['', ''], lakehouse = None, workspace = None)" + ] + }, + { + "cell_type": "markdown", + "id": "0091d6a0", + "metadata": {}, + "source": [ + "Refresh/reframe your Direct Lake semantic model and restore the columns which were in memory prior to the refresh." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "77eef082", + "metadata": {}, + "outputs": [], + "source": [ + "directlake.warm_direct_lake_cache_isresident(dataset = dataset_name, workspace = workspace_name)" + ] + }, + { + "cell_type": "markdown", + "id": "dae1a210", + "metadata": {}, + "source": [ + "Ensure a warm cache for your users by putting the columns of a Direct Lake semantic model into memory based on the contents of a [perspective](https://learn.microsoft.com/analysis-services/tabular-models/perspectives-ssas-tabular?view=asallproducts-allversions).\n", + "\n", + "Perspectives can be created either in [Tabular Editor 3](https://github.com/TabularEditor/TabularEditor3/releases/latest) or in [Tabular Editor 2](https://github.com/TabularEditor/TabularEditor/releases/latest) using the [Perspective Editor](https://www.elegantbi.com/post/perspectiveeditor)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "43297001", + "metadata": {}, + "outputs": [], + "source": [ + "directlake.warm_direct_lake_cache_perspective(dataset = dataset_name, workspace = workspace_name, perspective = '', add_dependencies = True)" + ] + } + ], + "metadata": { + "kernel_info": { + "name": "synapse_pyspark" + }, + "kernelspec": { + "display_name": "Synapse PySpark", + "language": "Python", + "name": "synapse_pyspark" + }, + "language_info": { + "name": "python" + }, + "microsoft": { + "language": "python" + }, + "nteract": { + "version": "nteract-front-end@1.0.0" + }, + "spark_compute": { + "compute_id": "/trident/default" + }, + "synapse_widget": { + "state": {}, + "version": "0.1" + }, + "widgets": {} + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/pyproject.toml b/pyproject.toml index 20746045..eaa24e1c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ name="semantic-link-labs" authors = [ { name = "Microsoft Corporation" }, ] -version="0.8.3" +version="0.8.11" description="Semantic Link Labs for Microsoft 
Fabric" readme="README.md" requires-python=">=3.10,<3.12" @@ -23,7 +23,7 @@ classifiers = [ license= { text = "MIT License" } dependencies = [ - "semantic-link-sempy>=0.8.0", + "semantic-link-sempy>=0.8.3", "anytree", "powerbiclient", "polib", diff --git a/src/sempy_labs/__init__.py b/src/sempy_labs/__init__.py index 71866616..f7015193 100644 --- a/src/sempy_labs/__init__.py +++ b/src/sempy_labs/__init__.py @@ -1,3 +1,6 @@ +from sempy_labs._documentation import ( + save_semantic_model_metadata, +) from sempy_labs._external_data_shares import ( list_external_data_shares_in_item, create_external_data_share, @@ -135,7 +138,12 @@ # create_connection_vnet, # create_connection_on_prem ) -from sempy_labs._dax import evaluate_dax_impersonation +from sempy_labs._dax import ( + evaluate_dax_impersonation, + trace_dax, + dax_perf_test, + run_benchmark, +) from sempy_labs._generate_semantic_model import ( create_blank_semantic_model, create_semantic_model_from_bim, @@ -373,4 +381,8 @@ "revoke_external_data_share", "migrate_fabric_trial_capacity", "create_resource_group", + "trace_dax", + "save_semantic_model_metadata", + "dax_perf_test", + "run_benchmark", ] diff --git a/src/sempy_labs/_dax.py b/src/sempy_labs/_dax.py index 798194cd..45e7015c 100644 --- a/src/sempy_labs/_dax.py +++ b/src/sempy_labs/_dax.py @@ -1,11 +1,24 @@ +import sempy import sempy.fabric as fabric import pandas as pd +import datetime +from typing import Optional, Tuple +from sempy._utils._log import log +import time +import sempy_labs._icons as icons from sempy_labs._helper_functions import ( resolve_dataset_id, resolve_workspace_name_and_id, + _get_max_run_id, + resolve_lakehouse_name, + save_as_delta_table, + _resolve_workspace_capacity_name_id_sku, + format_dax_object_name, ) -from typing import Optional -from sempy._utils._log import log +from sempy_labs.lakehouse._lakehouse import lakehouse_attached +from sempy_labs._clear_cache import clear_cache +from sempy_labs.lakehouse._get_lakehouse_tables import 
def _get_dax_query_dependencies_all(
    dataset: str,
    dax_string: str,
    workspace: Optional[str] = None,
) -> pd.DataFrame:
    """
    Obtains every object on which a DAX query depends, including model dependencies.

    Parameters
    ----------
    dataset : str
        Name of the semantic model.
    dax_string : str
        The DAX query.
    workspace : str, default=None
        The Fabric workspace name.
        Defaults to None which resolves to the workspace of the attached lakehouse
        or if no lakehouse attached, resolves to the workspace of the notebook.

    Returns
    -------
    pandas.DataFrame
        A de-duplicated pandas dataframe with the columns 'Object Type', 'Table' and 'Object'.
        Objects whose name starts with 'RowNumber-' are excluded.
    """

    from sempy_labs._model_dependencies import get_model_calc_dependencies

    if workspace is None:
        workspace = fabric.resolve_workspace_name(workspace)

    # Escape quotes in dax (the query is embedded in a DAX string literal below)
    dax_string = dax_string.replace('"', '""')
    final_query = f"""
        EVALUATE
        VAR source_query = "{dax_string}"
        VAR all_dependencies = SELECTCOLUMNS(
            INFO.CALCDEPENDENCY("QUERY", source_query),
                "Referenced Object Type",[REFERENCED_OBJECT_TYPE],
                "Referenced Table", [REFERENCED_TABLE],
                "Referenced Object", [REFERENCED_OBJECT]
            )
        RETURN all_dependencies
        """
    dep = fabric.evaluate_dax(
        dataset=dataset, workspace=workspace, dax_string=final_query
    )

    # Clean up column names and values (remove outside square brackets, underscores in object type)
    dep.columns = dep.columns.map(lambda x: x[1:-1])
    dep["Referenced Object Type"] = (
        dep["Referenced Object Type"].str.replace("_", " ").str.title()
    )

    cd = get_model_calc_dependencies(dataset=dataset, workspace=workspace)
    referenced_cols = [
        "Referenced Object Type",
        "Referenced Table",
        "Referenced Object",
    ]

    # Start with the objects used directly in the query, then add the model
    # dependencies of each of those objects (i.e. relationship etc.).
    # The pieces are collected first and concatenated once, rather than
    # re-copying a growing dataframe on every iteration.
    frames = [dep]
    for _, r in dep.iterrows():
        cd_filt = cd[
            (cd["Object Type"] == r["Referenced Object Type"])
            & (cd["Object Name"] == r["Referenced Object"])
            & (cd["Table Name"] == r["Referenced Table"])
        ]
        if not cd_filt.empty:
            frames.append(cd_filt[referenced_cols])

    df = pd.concat(frames, ignore_index=True)
    df.columns = df.columns.map(lambda x: x.replace("Referenced ", ""))
    df = df[~df["Object"].str.startswith("RowNumber-")]
    # Remove duplicates
    df = df.drop_duplicates().reset_index(drop=True)

    return df


def _get_column_vertipaq_stats(
    dataset: str, workspace: Optional[str], full_objects
) -> pd.DataFrame:
    """
    Returns the Vertipaq statistics of the model columns whose fully qualified
    name (as produced by format_dax_object_name) is contained in full_objects.
    """

    stat_cols = [
        "Table Name",
        "Column Name",
        "Total Size",
        "Data Size",
        "Dictionary Size",
        "Hierarchy Size",
        "Is Resident",
        "Full Object",
    ]

    dfC = fabric.list_columns(dataset=dataset, workspace=workspace, extended=True)
    dfC["Full Object"] = format_dax_object_name(dfC["Table Name"], dfC["Column Name"])

    return dfC[dfC["Full Object"].isin(full_objects)][stat_cols].reset_index(
        drop=True
    )


@log
def get_dax_query_dependencies(
    dataset: str,
    dax_string: str,
    put_in_memory: bool = False,
    show_vertipaq_stats: bool = True,
    workspace: Optional[str] = None,
) -> pd.DataFrame:
    """
    Obtains the columns on which a DAX query depends, including model dependencies. Shows Vertipaq statistics (i.e. Total Size, Data Size, Dictionary Size, Hierarchy Size) for easy prioritizing.

    Parameters
    ----------
    dataset : str
        Name of the semantic model.
    dax_string : str
        The DAX query.
    put_in_memory : bool, default=False
        If True, ensures that the dependent columns are put into memory in order to give realistic Vertipaq stats (i.e. Total Size etc.).
        Setting this to True also forces show_vertipaq_stats to True.
    show_vertipaq_stats : bool, default=True
        If True, shows Vertipaq statistics.
    workspace : str, default=None
        The Fabric workspace name.
        Defaults to None which resolves to the workspace of the attached lakehouse
        or if no lakehouse attached, resolves to the workspace of the notebook.

    Returns
    -------
    pandas.DataFrame
        A pandas dataframe showing the dependent columns of a given DAX query including model dependencies.
        With Vertipaq statistics the columns are 'Table Name', 'Column Name', 'Total Size', 'Data Size',
        'Dictionary Size', 'Hierarchy Size' and 'Is Resident'; without them the columns are
        'Object Type', 'Table' and 'Object'.
    """

    # The module-level name `tqdm` is the package itself (`import tqdm`), which
    # is not callable; import the progress-bar class explicitly.
    from tqdm import tqdm

    if workspace is None:
        workspace = fabric.resolve_workspace_name(workspace)

    if put_in_memory:
        show_vertipaq_stats = True

    # Shared dependency resolution (query dependencies + model dependencies,
    # de-duplicated, without the RowNumber column).
    df = _get_dax_query_dependencies_all(
        dataset=dataset, dax_string=dax_string, workspace=workspace
    )
    # Only show columns
    df = df[df["Object Type"].isin(["Column", "Calc Column"])].reset_index(drop=True)

    if not show_vertipaq_stats:
        # Previously this path failed with an UnboundLocalError because the
        # statistics dataframe was never created.
        return df

    # Get vertipaq stats, filter to just the objects in the df dataframe
    df["Full Object"] = format_dax_object_name(df["Table"], df["Object"])
    full_objects = df["Full Object"].values
    dfC_filtered = _get_column_vertipaq_stats(dataset, workspace, full_objects)

    if put_in_memory:
        not_in_memory = dfC_filtered[dfC_filtered["Is Resident"].eq(False)]

        if not not_in_memory.empty:
            tbls = not_in_memory["Table Name"].unique()

            # Run basic query to get columns into memory; completed one table at a time (so as not to overload the capacity)
            for table_name in (bar := tqdm(tbls)):
                bar.set_description(f"Warming the '{table_name}' table...")
                css = ", ".join(
                    not_in_memory[not_in_memory["Table Name"] == table_name][
                        "Full Object"
                    ]
                    .astype(str)
                    .tolist()
                )
                dax = f"""EVALUATE TOPN(1,SUMMARIZECOLUMNS({css}))"""
                fabric.evaluate_dax(
                    dataset=dataset, dax_string=dax, workspace=workspace
                )

            # Get column stats again (sizes change once the columns are resident)
            dfC_filtered = _get_column_vertipaq_stats(
                dataset, workspace, full_objects
            )

    return dfC_filtered.drop(columns=["Full Object"])


@log
def get_dax_query_memory_size(
    dataset: str, dax_string: str, workspace: Optional[str] = None
) -> int:
    """
    Obtains the total size, in bytes, used by all columns that a DAX query depends on.

    Parameters
    ----------
    dataset : str
        Name of the semantic model.
    dax_string : str
        The DAX query.
    workspace : str, default=None
        The Fabric workspace name.
        Defaults to None which resolves to the workspace of the attached lakehouse
        or if no lakehouse attached, resolves to the workspace of the notebook.

    Returns
    -------
    int
        The total size, in bytes, used by all columns that the DAX query depends on.
    """

    if workspace is None:
        workspace = fabric.resolve_workspace_name(workspace)

    # put_in_memory=True so that the reported sizes reflect resident columns
    df = get_dax_query_dependencies(
        dataset=dataset, workspace=workspace, dax_string=dax_string, put_in_memory=True
    )

    # Convert the numpy scalar to a plain int to match the annotated return type
    return int(df["Total Size"].sum())


def _label_trace_queries(event_class: pd.Series, query_names: list) -> pd.Series:
    """
    Maps each trace row to the name of the query it belongs to.

    Every 'QueryBegin' event starts the next query in query_names; all rows up
    to the following 'QueryBegin' belong to that query. Rows which cannot be
    attributed (before the first 'QueryBegin', or beyond the supplied names)
    are labelled None.
    """

    ordinal = (event_class == "QueryBegin").cumsum()

    return ordinal.map(
        lambda i: query_names[i - 1] if 0 < i <= len(query_names) else None
    )


@log
def trace_dax(
    dataset: str,
    dax_queries: dict,
    rest_time: int = 2,
    clear_cache_before_run: bool = False,
    clear_cache_before_each_query: bool = False,
    trace_vertipaq_se: bool = False,
    trace_direct_query: bool = False,
    enable_execution_metrics: bool = False,
    workspace: Optional[str] = None,
) -> Tuple[pd.DataFrame, dict]:
    """
    Runs a SQL Profiler trace over a set of DAX queries.

    Parameters
    ----------
    dataset : str
        Name of the semantic model.
    dax_queries : dict
        The dax queries to run in a dictionary format (query name -> DAX query). Here is an example:
        {
            "Sales Amount Test": \"\"\" EVALUATE SUMMARIZECOLUMNS("Sales Amount", [Sales Amount]) \"\"\",
            "Order Quantity with Product": \"\"\" EVALUATE SUMMARIZECOLUMNS('Product'[Color], "Order Qty", [Order Qty]) \"\"\",
        }
    rest_time : int, default=2
        Rest time (in seconds) between the execution of each DAX query.
    clear_cache_before_run : bool, default=False
        If True, clears the cache before running any DAX queries.
    clear_cache_before_each_query : bool, default=False
        If True, clears the cache before running each DAX query.
    trace_vertipaq_se : bool, default=False
        If True, adds the following events to the trace: VertiPaq SE Query Begin, VertiPaq SE Query End, VertiPaq SE Query Cache Match
    trace_direct_query : bool, default=False
        If True, adds the following events to the trace: Direct Query Begin, Direct Query End
    enable_execution_metrics : bool, default=False
        If True, adds the Execution Metrics event to the trace.
    workspace : str, default=None
        The Fabric workspace name.
        Defaults to None which resolves to the workspace of the attached lakehouse
        or if no lakehouse attached, resolves to the workspace of the notebook.

    Returns
    -------
    Tuple[pandas.DataFrame, dict]
        A pandas dataframe showing the SQL profiler trace results of the DAX queries.
        A dictionary of the query results in pandas dataframes.
    """

    if workspace is None:
        workspace = fabric.resolve_workspace_name()

    base_cols = ["EventClass", "EventSubclass", "CurrentTime", "NTUserName", "TextData"]
    begin_cols = base_cols + ["StartTime"]
    end_cols = base_cols + ["StartTime", "EndTime", "Duration", "CpuTime", "Success"]
    dq_cols = [
        "EventClass",
        "CurrentTime",
        "StartTime",
        "EndTime",
        "Duration",
        "CpuTime",
        "Success",
        "Error",
        "TextData",
    ]

    event_schema = {
        "QueryBegin": begin_cols + ["ApplicationName"],
        "QueryEnd": end_cols + ["ApplicationName"],
    }

    if trace_vertipaq_se:
        event_schema["VertiPaqSEQueryBegin"] = begin_cols
        event_schema["VertiPaqSEQueryEnd"] = end_cols
        event_schema["VertiPaqSEQueryCacheMatch"] = base_cols
    if trace_direct_query:
        event_schema["DirectQueryBegin"] = dq_cols
        event_schema["DirectQueryEnd"] = dq_cols

    if enable_execution_metrics:
        event_schema["ExecutionMetrics"] = ["EventClass", "TextData", "ApplicationName"]

    query_results = {}

    if clear_cache_before_run:
        clear_cache(dataset=dataset, workspace=workspace)

    # Establish trace connection
    with fabric.create_trace_connection(
        dataset=dataset, workspace=workspace
    ) as trace_connection:
        with trace_connection.create_trace(event_schema) as trace:
            trace.start()
            # Loop through DAX queries
            for i, (name, dax) in enumerate(dax_queries.items()):
                # Clear cache for each query but not if done already before the run began
                if clear_cache_before_each_query and not (
                    i == 0 and clear_cache_before_run
                ):
                    clear_cache(dataset=dataset, workspace=workspace)

                result = fabric.evaluate_dax(
                    dataset=dataset, workspace=workspace, dax_string=dax
                )
                # Add results to output
                query_results[name] = result

                time.sleep(rest_time)
                print(f"{icons.green_dot} The '{name}' query has completed.")

            # Allow time to collect trace results. The wait must happen while
            # the trace is still running; waiting after stop() has no effect
            # on what was captured.
            time.sleep(5)
            df = trace.stop()

    # Name queries per dictionary
    df["Query Name"] = _label_trace_queries(
        df["Event Class"], list(dax_queries.keys())
    )

    return df, query_results
+ """ + from sempy_labs._refresh_semantic_model import refresh_semantic_model + from sempy_labs._clear_cache import clear_cache + + if workspace is None: + workspace = fabric.resolve_workspace_name() + + base_cols = ["EventClass", "EventSubclass", "CurrentTime", "NTUserName", "TextData"] + begin_cols = base_cols + ["StartTime"] + end_cols = base_cols + ["StartTime", "EndTime", "Duration", "CpuTime", "Success"] + + event_schema = { + "QueryBegin": begin_cols + ["ApplicationName"], + "QueryEnd": end_cols + ["ApplicationName"], + } + + event_schema["VertiPaqSEQueryBegin"] = begin_cols + event_schema["VertiPaqSEQueryEnd"] = end_cols + event_schema["VertiPaqSEQueryCacheMatch"] = base_cols + + query_results = {} + + # Establish trace connection + with fabric.create_trace_connection( + dataset=dataset, workspace=workspace + ) as trace_connection: + with trace_connection.create_trace(event_schema) as trace: + trace.start() + print(f"{icons.in_progress} Starting performance testing...") + # Loop through DAX queries + for i, (name, dax) in enumerate(dax_queries.items()): + + if clear_cache_before_run: + clear_cache(dataset=dataset, workspace=workspace) + if refresh_type is not None: + refresh_semantic_model( + dataset=dataset, workspace=workspace, refresh_type=refresh_type + ) + + fabric.evaluate_dax( + dataset=dataset, workspace=workspace, dax_string="""EVALUATE {1}""" + ) + # Run DAX Query + result = fabric.evaluate_dax( + dataset=dataset, workspace=workspace, dax_string=dax + ) + + # Add results to output + query_results[name] = result + + time.sleep(rest_time) + print(f"{icons.green_dot} The '{name}' query has completed.") + + df = trace.stop() + # Allow time to collect trace results + time.sleep(5) + + # Step 1: Filter out unnecessary operations + query_names = list(dax_queries.keys()) + df = df[ + ~df["Application Name"].isin(["PowerBI", "PowerBIEIM"]) + & (~df["Text Data"].str.startswith("EVALUATE {1}")) + ] + query_begin = df["Event Class"] == "QueryBegin" + # Step 
def _validate_cache_type(cache_type: str) -> str:
    """
    Normalizes a cache type to lower case and checks that it is supported.

    Parameters
    ----------
    cache_type : str
        The cache type. Valid options (case-insensitive): 'warm', 'cold'.

    Returns
    -------
    str
        The lower-cased cache type.

    Raises
    ------
    ValueError
        If the cache type is not one of the valid options.
    """

    cache_type = cache_type.lower()
    cache_types = ["warm", "cold"]
    if cache_type not in cache_types:
        raise ValueError(
            f"{icons.red_dot} Invalid cache type. Valid options: {cache_types}."
        )
    return cache_type


def _summarize_query_trace(df_trace: pd.DataFrame) -> dict:
    """
    Aggregates the trace events of a single query into benchmark metrics.

    Parameters
    ----------
    df_trace : pandas.DataFrame
        The trace events of one query. Must contain the columns
        'Event Class', 'Event Subclass', 'Duration' and 'Cpu Time'.

    Returns
    -------
    dict
        The keys 'Duration', 'SE_Duration', 'SE_CPU', 'SE_Cache' and 'SE_Queries'.
    """

    event_class = df_trace["Event Class"]
    # na=False: an event without a subclass is not an internal event
    is_internal = df_trace["Event Subclass"].str.endswith("Internal", na=False)
    is_end = event_class.str.endswith("End", na=False) & ~is_internal
    is_query_end = is_end & (event_class == "QueryEnd")
    # Every non-internal '...End' event other than QueryEnd (VertiPaq / DirectQuery)
    is_se_end = is_end & (event_class != "QueryEnd")

    return {
        # Total time -> QueryEnd duration
        "Duration": df_trace.loc[is_query_end, "Duration"].sum(),
        "SE_Duration": df_trace.loc[is_se_end, "Duration"].sum(),
        "SE_CPU": df_trace.loc[is_se_end, "Cpu Time"].sum(),
        # Cache match events do not end in 'End', so they have to be counted
        # on the unfiltered events.
        "SE_Cache": int((event_class == "VertiPaqSEQueryCacheMatch").sum()),
        "SE_Queries": int(is_se_end.sum()),
    }


def run_benchmark(
    dataset: str,
    dax_queries: dict,
    cache_type: str = "warm",
    workspace: Optional[str] = None,
):
    """
    Runs a performance test over a set of DAX queries and appends one row per
    query to the 'SLL_PerfBenchmark' delta table in the attached lakehouse.

    The semantic model metadata is saved first (via save_semantic_model_metadata)
    under the same run id and timestamp.

    Parameters
    ----------
    dataset : str
        Name of the semantic model.
    dax_queries : dict
        The DAX queries to run, as a dictionary mapping a query name to its DAX text.
    cache_type : str, default="warm"
        The cache type. Valid options (case-insensitive): 'warm', 'cold'.
    workspace : str, default=None
        The Fabric workspace name.
        Defaults to None which resolves to the workspace of the attached lakehouse
        or if no lakehouse attached, resolves to the workspace of the notebook.

    Raises
    ------
    ValueError
        If the cache type is invalid or no lakehouse is attached to the notebook.
    """

    from sempy_labs._documentation import save_semantic_model_metadata

    if workspace is None:
        workspace = fabric.resolve_workspace_name()

    workspace_id = fabric.resolve_workspace_id(workspace)
    capacity_id, capacity_name, sku, region = _resolve_workspace_capacity_name_id_sku(
        workspace
    )
    dataset_id = resolve_dataset_id(dataset, workspace)
    cache_type = _validate_cache_type(cache_type)

    # Get RunId
    table_name = "SLL_Measures"

    if not lakehouse_attached():
        raise ValueError(
            f"{icons.red_dot} A lakehouse must be attached to the notebook."
        )

    lakehouse_id = fabric.get_lakehouse_id()
    lakehouse_workspace = fabric.resolve_workspace_name()
    lakehouse_name = resolve_lakehouse_name(lakehouse_id, lakehouse_workspace)

    dfLT = get_lakehouse_tables(lakehouse_name, lakehouse_workspace)
    if (dfLT["Table Name"] == table_name).any():
        run_id = _get_max_run_id(lakehouse=lakehouse_name, table_name=table_name) + 1
    else:
        run_id = 1
    time_stamp = datetime.datetime.now()

    dfC = save_semantic_model_metadata(
        dataset=dataset, workspace=workspace, run_id=run_id, time_stamp=time_stamp
    )

    # Run and save trace data.
    # dax_perf_test has no 'cache_type' parameter; a cold-cache run is requested
    # through its 'clear_cache_before_run' flag instead.
    trace_result, _ = dax_perf_test(
        dataset=dataset,
        workspace=workspace,
        dax_queries=dax_queries,
        clear_cache_before_run=cache_type == "cold",
    )

    trace_schema = {
        "Capacity_Name": "string",
        "Capacity_Id": "string",
        "SKU": "string",
        "Region": "string",
        "Workspace_Name": "string",
        "Workspace_Id": "string",
        "Dataset_Name": "string",
        "Dataset_Id": "string",
        "Query_Name": "string",
        "Query_Text": "string",
        "Cache_Type": "string",
        "Duration": "long",
        "SE_Duration": "long",
        "SE_CPU": "long",
        "SE_Cache": "long",
        "SE_Queries": "long",
        "Column_Dependencies": "string",
        "Column_Dependencies_Size": "long",
        "Measure_Dependencies": "string",
        "Relationship_Dependencies": "string",
        "RunId": "long",
        "Timestamp": "timestamp",
    }

    # NOTE(review): the underscore column names assume save_as_delta_table
    # renamed the columns of dfC in place - confirm.
    dfC["Object"] = format_dax_object_name(dfC["Table_Name"], dfC["Column_Name"])

    rows = []
    for query_name in trace_result["Query Name"].unique().tolist():
        df_trace = trace_result[trace_result["Query Name"] == query_name]
        # Capture Query Text
        query_begin = df_trace[df_trace["Event Class"] == "QueryBegin"]
        query_text = query_begin["Text Data"].iloc[0]

        metrics = _summarize_query_trace(df_trace)

        # Collect query dependencies
        dep = _get_dax_query_dependencies_all(
            dataset=dataset,
            workspace=workspace,
            dax_string=query_text,
        )

        # Column dependencies
        filtered_dep = dep[dep["Object Type"].isin(["Column", "Calc Column"])][
            ["Table", "Object"]
        ]
        columns_used = [
            f"'{table}'[{obj}]"
            for table, obj in zip(filtered_dep["Table"], filtered_dep["Object"])
        ]
        total_size = dfC[dfC["Object"].isin(columns_used)]["Total_Size"].sum()

        # Measure dependencies
        measures_used = dep[dep["Object Type"] == "Measure"]["Object"].tolist()

        # Relationship dependencies
        relationships_used = dep[dep["Object Type"] == "Relationship"][
            "Object"
        ].tolist()

        rows.append(
            {
                "Capacity_Name": capacity_name,
                "Capacity_Id": capacity_id,
                "SKU": sku,
                "Region": region,
                "Workspace_Name": workspace,
                "Workspace_Id": workspace_id,
                "Dataset_Name": dataset,
                "Dataset_Id": dataset_id,
                "Query_Name": str(query_name),
                "Query_Text": query_text,
                "Cache_Type": cache_type,
                "Duration": metrics["Duration"],
                "SE_Duration": metrics["SE_Duration"],
                "SE_CPU": metrics["SE_CPU"],
                "SE_Cache": metrics["SE_Cache"],
                "SE_Queries": metrics["SE_Queries"],
                "Column_Dependencies": str(columns_used),
                "Column_Dependencies_Size": total_size,
                "Measure_Dependencies": str(measures_used),
                "Relationship_Dependencies": str(relationships_used),
                "RunId": run_id,
                "Timestamp": time_stamp,
            }
        )

    # Build the frame once, in schema column order
    df = pd.DataFrame(rows, columns=list(trace_schema.keys()))
    df["Query_Text"] = df["Query_Text"].astype(str)

    save_as_delta_table(
        dataframe=df,
        delta_table_name="SLL_PerfBenchmark",
        write_mode="append",
        schema=trace_schema,
    )
print(f"{icons.in_progress} Collecting semantic model metadata...") + dfM = fabric.list_measures(dataset=dataset, workspace=workspace)[ + ["Table Name", "Measure Name", "Measure Expression"] + ] + dfC = fabric.list_columns(dataset=dataset, workspace=workspace, extended=True)[ + [ + "Table Name", + "Column Name", + "Type", + "Data Type", + "Column Cardinality", + "Total Size", + "Data Size", + "Dictionary Size", + "Hierarchy Size", + "Encoding", + ] + ] + + total_size = dfC["Total Size"].sum() + total_size = _conv_model_size(db_total_size=total_size) + dfModel = pd.DataFrame({"Model Size": [total_size]}) + + dfC = dfC[dfC["Type"] != "RowNumber"] + dfT = list_tables(dataset=dataset, workspace=workspace, extended=True)[ + ["Name", "Type", "Row Count"] + ] + dfT = dfT.rename(columns={"Name": "Table Name"}) + dfR = fabric.list_relationships(dataset=dataset, workspace=workspace) + dfP = fabric.list_partitions(dataset=dataset, workspace=workspace, extended=True)[ + [ + "Table Name", + "Partition Name", + "Mode", + "Source Type", + "Query", + "Refreshed Time", + "Modified Time", + "Record Count", + "Records per Segment", + "Segment Count", + ] + ] + + dfRLS = fabric.get_row_level_security_permissions( + dataset=dataset, workspace=workspace + ) + + dfH = fabric.list_hierarchies(dataset=dataset, workspace=workspace) + dfCI = fabric.list_calculation_items(dataset=dataset, workspace=workspace) + + def add_cols(df, run_id, time_stamp): + df.insert(0, "Capacity Name", capacity_name) + df.insert(1, "Capacity Id", capacity_id) + df.insert(2, "SKU", sku) + df.insert(3, "Region", region) + df.insert(4, "Workspace Name", workspace) + df.insert(5, "Workspace Id", workspace_id) + df.insert(6, "Dataset Name", dataset) + df.insert(7, "Dataset Id", dataset_id) + df["RunId"] = run_id + df["Timestamp"] = time_stamp + + return df + + dataframes = [dfM, dfC, dfT, dfR, dfP, dfRLS, dfModel, dfH, dfCI] + dataframes = [add_cols(df, run_id, time_stamp) for df in dataframes] + dfM, dfC, dfT, dfR, 
dfP, dfRLS, dfModel, dfH, dfCI = dataframes + + dfModel_schema = { + "Capacity_Name": "string", + "Capacity_Id": "string", + "SKU": "string", + "Region": "string", + "Workspace_Name": "string", + "Workspace_Id": "string", + "Dataset_Name": "string", + "Dataset_Id": "string", + "Model_Size": "long", + "RunId": "long", + "Timestamp": "timestamp", + } + dfM_schema = { + "Capacity_Name": "string", + "Capacity_Id": "string", + "SKU": "string", + "Region": "string", + "Workspace_Name": "string", + "Workspace_Id": "string", + "Dataset_Name": "string", + "Dataset_Id": "string", + "Table_Name": "string", + "Measure_Name": "string", + "Measure_Expression": "string", + "RunId": "long", + "Timestamp": "timestamp", + } + dfC_schema = { + "Capacity_Name": "string", + "Capacity_Id": "string", + "SKU": "string", + "Region": "string", + "Workspace_Name": "string", + "Workspace_Id": "string", + "Dataset_Name": "string", + "Dataset_Id": "string", + "Table_Name": "string", + "Column_Name": "string", + "Type": "string", + "Data_Type": "string", + "Column_Cardinality": "long", + "Total_Size": "long", + "Data_Size": "long", + "Dictionary_Size": "long", + "Hierarchy_Size": "long", + "Encoding": "string", + "RunId": "long", + "Timestamp": "timestamp", + } + dfT_schema = { + "Capacity_Name": "string", + "Capacity_Id": "string", + "SKU": "string", + "Region": "string", + "Workspace_Name": "string", + "Workspace_Id": "string", + "Dataset_Name": "string", + "Dataset_Id": "string", + "Table_Name": "string", + "Type": "string", + "Row_Count": "long", + "Table_Size": "long", + "RunId": "long", + "Timestamp": "timestamp", + } + dfP_schema = { + "Capacity_Name": "string", + "Capacity_Id": "string", + "SKU": "string", + "Region": "string", + "Workspace_Name": "string", + "Workspace_Id": "string", + "Dataset_Name": "string", + "Dataset_Id": "string", + "Table_Name": "string", + "Partition_Name": "string", + "Mode": "string", + "Source_Type": "string", + "Query": "string", + "Refreshed_Time": 
"timestamp", + "Modified_Time": "timestamp", + "Record_Count": "long", + "Records_per_Segment": "double", + "Segment_Count": "long", + "RunId": "long", + "Timestamp": "timestamp", + } + dfH_schema = { + "Capacity_Name": "string", + "Capacity_Id": "string", + "SKU": "string", + "Region": "string", + "Workspace_Name": "string", + "Workspace_Id": "string", + "Dataset_Name": "string", + "Dataset_Id": "string", + "Table_Name": "string", + "Column_Name": "string", + "Hierarchy_Name": "string", + "Hierarchy_Description": "string", + "Hierarchy_State": "string", + "Level_Name": "string", + "Level_Description": "string", + "Level_Ordinal": "long", + "RunId": "long", + "Timestamp": "timestamp", + } + dfCI_schema = { + "Capacity_Name": "string", + "Capacity_Id": "string", + "SKU": "string", + "Region": "string", + "Workspace_Name": "string", + "Workspace_Id": "string", + "Dataset_Name": "string", + "Dataset_Id": "string", + "Calculation_Group_Name": "string", + "Hidden": "bool", + "Precedence": "long", + "Description": "string", + "Calculation_Item_Name": "string", + "Ordinal": "long", + "Expression": "string", + "Format_String_Expression": "string", + "State": "string", + "Error_Message": "string", + "RunId": "long", + "Timestamp": "timestamp", + } + + dfs = { + "Measures": [dfM, dfM_schema], + "Columns": [dfC, dfC_schema], + "Tables": [dfT, dfT_schema], + "Relationships": [dfR, None], + "Partitions": [dfP, dfP_schema], + "RowLevelSecurity": [dfRLS, None], + "Model": [dfModel, dfModel_schema], + "Hierarchies": [dfH, dfH_schema], + "CalculationItems": [dfCI, dfCI_schema], + } + print(f"{icons.in_progress} Saving semantic model metadata...") + for name, (df, df_schema) in dfs.items(): + if not df.empty: + save_as_delta_table( + dataframe=df, + delta_table_name=f"SLL_{name}", + write_mode="append", + schema=df_schema, + ) + else: + print( + f"{icons.yellow_dot} The '{dataset}' semantic model within the '{workspace}' contains no {name.lower()}." 
def save_semantic_model_metadata_bulk(workspace: Optional[str | List[str]] = None):
    """
    Saves the metadata of every semantic model in one or more workspaces.

    All models are saved under a single run id and timestamp.

    Parameters
    ----------
    workspace : str | List[str], default=None
        The Fabric workspace name(s).
        Defaults to None which processes every workspace returned by
        fabric.list_workspaces.
    """

    time_stamp = datetime.datetime.now()
    run_id = _get_run_id(table_name="SLL_Measures")
    if isinstance(workspace, str):
        workspace = [workspace]

    dfW = fabric.list_workspaces()
    if workspace is None:
        workspaces = dfW["Name"].tolist()
    else:
        workspaces = dfW[dfW["Name"].isin(workspace)]["Name"].tolist()

    for w in workspaces:
        dfD = fabric.list_datasets(workspace=w, mode="rest")
        for d_name in dfD["Dataset Name"]:
            save_semantic_model_metadata(
                dataset=d_name,
                # The workspace being iterated, not the caller's list (or None)
                workspace=w,
                run_id=run_id,
                time_stamp=time_stamp,
            )


def _get_run_id(table_name: str) -> int:
    """
    Obtains the next run id for a delta table in the attached lakehouse.

    Parameters
    ----------
    table_name : str
        Name of the delta table holding the 'RunId' values.

    Returns
    -------
    int
        1 if the table does not exist yet, otherwise the table's maximum run id plus 1.

    Raises
    ------
    ValueError
        If no lakehouse is attached to the notebook.
    """

    from sempy_labs.lakehouse import get_lakehouse_tables

    if not lakehouse_attached():
        raise ValueError(
            f"{icons.red_dot} A lakehouse must be attached to the notebook."
        )

    dfLT = get_lakehouse_tables()
    dfLT_filt = dfLT[dfLT["Table Name"] == table_name]
    if len(dfLT_filt) == 0:
        run_id = 1
    else:
        lakehouse_id = fabric.get_lakehouse_id()
        lakehouse_name = fabric.resolve_item_name(
            item_id=lakehouse_id, type="Lakehouse", workspace=None
        )

        run_id = _get_max_run_id(lakehouse=lakehouse_name, table_name=table_name) + 1

    return run_id
spark_df.write.mode(write_mode).format("delta").save(filePath) - print( - f"{icons.green_dot} The dataframe has been saved as the '{delta_table_name}' table in the '{lakehouse}' lakehouse within the '{workspace}' workspace." - ) + + if write_mode == "append": + print( + f"{icons.green_dot} The dataframe has been appended to the '{delta_table_name}' table in the '{lakehouse}' lakehouse within the '{workspace}' workspace." + ) + else: + print( + f"{icons.green_dot} The dataframe has been saved as the '{delta_table_name}' table in the '{lakehouse}' lakehouse within the '{workspace}' workspace." + ) def language_validate(language: str): @@ -542,13 +544,15 @@ def language_validate(language: str): return lang -def resolve_workspace_name_and_id(workspace: Optional[str] = None) -> Tuple[str, str]: +def resolve_workspace_name_and_id( + workspace: Optional[str | UUID] = None, +) -> Tuple[str, UUID]: """ Obtains the name and ID of the Fabric workspace. Parameters ---------- - workspace : str, default=None + workspace : str | UUID, default=None The Fabric workspace name. Defaults to None which resolves to the workspace of the attached lakehouse or if no lakehouse attached, resolves to the workspace of the notebook. 
def _resolve_workspace_capacity_name_id_sku(
    workspace: Optional[str | UUID] = None,
) -> Tuple[UUID, str, str, str]:
    """
    Obtains the capacity id, capacity name, SKU and region of a workspace.

    Parameters
    ----------
    workspace : str | UUID, default=None
        The Fabric workspace, passed to resolve_workspace_name_and_id.

    Returns
    -------
    Tuple[UUID, str, str, str]
        The capacity id, capacity name, SKU and region. Name, SKU and region
        are None unless exactly one capacity matches the capacity id.
    """

    workspace_name, _ = resolve_workspace_name_and_id(workspace)

    # Look the workspace up by its (URL-quoted) name to read its capacity id
    encoded_name = urllib.parse.quote(workspace_name)
    workspaces = fabric.list_workspaces(filter=f"name eq '{encoded_name}'")
    capacity_id = workspaces["Capacity Id"].iloc[0]

    capacities = fabric.list_capacities()
    matches = capacities[capacities["Id"] == capacity_id]
    if len(matches) != 1:
        return capacity_id, None, None, None

    return (
        capacity_id,
        matches["Display Name"].iloc[0],
        matches["Sku"].iloc[0],
        matches["Region"].iloc[0],
    )
def _conv_model_size(db_total_size: int):
    """
    Rescales a model size for display in KB/MB/GB.

    The largest decimal threshold (10^9, 10^6, 10^3) that the size reaches
    decides the unit; the size is divided by the matching power of 1024 and
    multiplied back by that threshold. Sizes below 1000 are only rounded.

    Parameters
    ----------
    db_total_size : int
        The total size of the model.

    Returns
    -------
    int
        The rescaled size, rounded to the nearest integer.
    """

    # (decimal threshold, binary divisor) pairs, largest unit first
    units = (
        (1000000000, 1024**3),
        (1000000, 1024**2),
        (1000, 1024),
    )
    for threshold, divisor in units:
        if db_total_size >= threshold:
            return round(db_total_size / divisor * threshold)

    return round(db_total_size)