Skip to content

Commit

Permalink
Remove Scarf analytics from Airflow Webserver (apache#43346) (apache#…
Browse files Browse the repository at this point in the history
…43348)

Since we have already made decisions about what we want to do for Plugins in Airflow 3 and are revamping the entire UI, this data isn't that important. There were also concerns raised about de-duplication and other things.

(cherry picked from commit dc25301)
  • Loading branch information
kaxil authored Oct 24, 2024
1 parent 0c6c850 commit 2860f81
Show file tree
Hide file tree
Showing 7 changed files with 1 addition and 146 deletions.
24 changes: 0 additions & 24 deletions airflow/utils/usage_data_collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@

from airflow import __version__ as airflow_version, settings
from airflow.configuration import conf
from airflow.plugins_manager import get_plugin_info


def usage_data_collection():
Expand Down Expand Up @@ -97,26 +96,3 @@ def get_executor() -> str:
def get_python_version() -> str:
    """Return the running interpreter's version as ``major.minor`` (e.g. 3.10.12 --> 3.10)."""
    # Keep only the first two dot-separated components; the patch level is dropped.
    major_minor = platform.python_version().split(".")[0:2]
    return ".".join(major_minor)


def get_plugin_counts() -> dict[str, int]:
    """
    Summarise the result of ``get_plugin_info()`` as plain counts.

    :return: the number of entries under ``"plugins"``, plus, for each of ``flask_blueprints``,
        ``appbuilder_views``, ``appbuilder_menu_items`` and ``timetables``, the combined length
        of that field across all entries
    """
    plugins = get_plugin_info()
    counted_fields = ("flask_blueprints", "appbuilder_views", "appbuilder_menu_items", "timetables")

    counts = {"plugins": len(plugins)}
    for field in counted_fields:
        # Each entry holds a sized collection under this key; add their lengths together.
        counts[field] = sum(len(plugin[field]) for plugin in plugins)
    return counts


def to_bucket(counter: int) -> str:
    """
    Map an exact count onto a coarse range label so that precise numbers are never reported.

    :param counter: the exact count to place into a bucket
    :return: ``"0"`` for a zero or negative count, ``"<low>-<high>"`` for counts up to 2000
        (e.g. ``"1-5"``, ``"6-10"``, ``"1001-2000"``) and ``"2000+"`` for anything larger
    """
    # A negative count is meaningless; without this guard it would match none of the
    # ranges below and be reported as the largest bucket.
    if counter <= 0:
        return "0"
    buckets = [0, 5, 10, 20, 50, 100, 200, 500, 1000, 2000]
    # Walk adjacent boundaries as (lower, upper] ranges; the label starts one above the lower bound.
    for lower, upper in zip(buckets, buckets[1:]):
        if lower < counter <= upper:
            return f"{lower + 1}-{upper}"
    return f"{buckets[-1]}+"
3 changes: 0 additions & 3 deletions airflow/www/templates/airflow/dags.html
Original file line number Diff line number Diff line change
Expand Up @@ -488,7 +488,4 @@ <h2>{{ page_title }}</h2>
return false;
}
</script>
{% if scarf_url %}
<img referrerpolicy="no-referrer" src="{{ scarf_url }}" width="0" height="0" alt="" style="display:none;" />
{% endif %}
{% endblock %}
47 changes: 1 addition & 46 deletions airflow/www/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@
from airflow.timetables._cron import CronMixin
from airflow.timetables.base import DataInterval, TimeRestriction
from airflow.timetables.simple import ContinuousTimetable
from airflow.utils import json as utils_json, timezone, usage_data_collection, yaml
from airflow.utils import json as utils_json, timezone, yaml
from airflow.utils.airflow_flask_app import get_airflow_app
from airflow.utils.dag_edges import dag_edges
from airflow.utils.db import get_query_count
Expand Down Expand Up @@ -219,45 +219,6 @@ def get_safe_url(url):
return redirect_url.geturl()


def build_scarf_url(dags_count: int) -> str:
    """
    Assemble the Scarf gateway URL that carries the webserver usage data.

    Returns an empty string when usage data collection is disabled in settings.

    :meta private:
    """
    if not settings.is_usage_data_collection_enabled():
        return ""

    scarf_domain = "https://apacheairflow.gateway.scarf.sh"

    platform_sys, platform_arch = usage_data_collection.get_platform_info()
    db_version = usage_data_collection.get_database_version()
    db_name = usage_data_collection.get_database_name()
    executor = usage_data_collection.get_executor()
    python_version = usage_data_collection.get_python_version()

    counts = usage_data_collection.get_plugin_counts()
    plugins_total = counts["plugins"]
    flask_blueprints_total = counts["flask_blueprints"]
    appbuilder_views_total = counts["appbuilder_views"]
    appbuilder_menu_items_total = counts["appbuilder_menu_items"]
    timetables_total = counts["timetables"]

    # DAG, plugin and timetable counts are sent as range labels rather than exact numbers.
    dag_bucket = usage_data_collection.to_bucket(dags_count)
    plugins_bucket = usage_data_collection.to_bucket(plugins_total)
    timetable_bucket = usage_data_collection.to_bucket(timetables_total)

    # Path segments, in order:
    # {version}/{python_version}/{platform}/{arch}/{database}/{db_version}/{executor}/{num_dags}
    # /{plugin_count}/{flask_blueprint_count}/{appbuilder_view_count}/{appbuilder_menu_item_count}/{timetables}
    #
    # This path redirects to a Pixel tracking URL
    path_segments = (
        version,
        python_version,
        platform_sys,
        platform_arch,
        db_name,
        db_version,
        executor,
        dag_bucket,
        plugins_bucket,
        flask_blueprints_total,
        appbuilder_views_total,
        appbuilder_menu_items_total,
        timetable_bucket,
    )
    path = "/".join(str(segment) for segment in path_segments)
    return f"{scarf_domain}/webserver/{path}"


def get_date_time_num_runs_dag_runs_form_data(www_request, session, dag):
"""Get Execution Data, Base Date & Number of runs from a Request."""
date_time = www_request.args.get("execution_date")
Expand Down Expand Up @@ -1129,11 +1090,6 @@ def _iter_parsed_moved_data_table_names():
"warning",
)

try:
scarf_url = build_scarf_url(dags_count=all_dags_count)
except Exception:
scarf_url = ""

return self.render_template(
"airflow/dags.html",
dags=dags,
Expand Down Expand Up @@ -1173,7 +1129,6 @@ def _iter_parsed_moved_data_table_names():
sorting_direction=arg_sorting_direction,
auto_refresh_interval=conf.getint("webserver", "auto_refresh_interval"),
dataset_triggered_next_run_info=dataset_triggered_next_run_info,
scarf_url=scarf_url,
file_tokens=file_tokens,
)

Expand Down
3 changes: 0 additions & 3 deletions docs/apache-airflow/faq.rst
Original file line number Diff line number Diff line change
Expand Up @@ -545,6 +545,3 @@ The telemetry data collected is limited to the following:
- Operating system & machine architecture
- Executor
- Metadata DB type & its version
- Number of DAGs
- Number of Airflow plugins
- Number of timetables, Flask blueprints, Flask AppBuilder views, and Flask Appbuilder menu items from Airflow plugins
18 changes: 0 additions & 18 deletions tests/utils/test_usage_data_collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@
from airflow.utils.usage_data_collection import (
get_database_version,
get_python_version,
to_bucket,
usage_data_collection,
)

Expand Down Expand Up @@ -101,20 +100,3 @@ def test_get_database_version(version_info, expected_version):
def test_get_python_version(version_info, expected_version):
with mock.patch("platform.python_version", return_value=version_info):
assert get_python_version() == expected_version


@pytest.mark.parametrize(
    "counter, expected_bucket",
    [
        (0, "0"),
        (1, "1-5"),
        (5, "1-5"),
        (6, "6-10"),
        (11, "11-20"),
        (20, "11-20"),
        (21, "21-50"),
        (10000, "2000+"),
    ],
)
def test_to_bucket(counter, expected_bucket):
    """Each count must be mapped to the bucket label listed alongside it."""
    actual_bucket = to_bucket(counter)
    assert actual_bucket == expected_bucket
38 changes: 0 additions & 38 deletions tests/www/views/test_views.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
import pytest
from markupsafe import Markup

from airflow import __version__ as airflow_version
from airflow.configuration import (
initialize_config,
write_default_airflow_configuration_if_needed,
Expand All @@ -36,7 +35,6 @@
from airflow.utils.task_group import TaskGroup
from airflow.www.views import (
ProviderView,
build_scarf_url,
get_key_paths,
get_safe_url,
get_task_stats_from_query,
Expand Down Expand Up @@ -597,39 +595,3 @@ def test_invalid_dates(app, admin_client, url, content):

assert resp.status_code == 400
assert re.search(content, resp.get_data().decode())


@pytest.mark.parametrize("enabled", [False, True])
@patch("airflow.utils.usage_data_collection.get_platform_info", return_value=("Linux", "x86_64"))
@patch("airflow.utils.usage_data_collection.get_database_version", return_value="12.3")
@patch("airflow.utils.usage_data_collection.get_database_name", return_value="postgres")
@patch("airflow.utils.usage_data_collection.get_executor", return_value="SequentialExecutor")
@patch("airflow.utils.usage_data_collection.get_python_version", return_value="3.8")
@patch("airflow.utils.usage_data_collection.get_plugin_counts")
def test_build_scarf_url(
    get_plugin_counts,
    get_python_version,
    get_executor,
    get_database_name,
    get_database_version,
    get_platform_info,
    enabled,
):
    """The URL is built from the patched values when enabled and is empty when disabled."""
    get_plugin_counts.return_value = {
        "plugins": 10,
        "flask_blueprints": 15,
        "appbuilder_views": 20,
        "appbuilder_menu_items": 25,
        "timetables": 30,
    }
    with patch("airflow.settings.is_usage_data_collection_enabled", return_value=enabled):
        result = build_scarf_url(5)

    if not enabled:
        assert result == ""
        return

    # 5 DAGs -> "1-5", 10 plugins -> "6-10", 30 timetables -> "21-50"; other counts are sent as-is.
    expected_url = (
        "https://apacheairflow.gateway.scarf.sh/webserver/"
        f"{airflow_version}/3.8/Linux/x86_64/postgres/12.3/SequentialExecutor/1-5"
        f"/6-10/15/20/25/21-50"
    )
    assert result == expected_url
14 changes: 0 additions & 14 deletions tests/www/views/test_views_home.py
Original file line number Diff line number Diff line change
Expand Up @@ -454,20 +454,6 @@ def test_sorting_home_view(url, lower_key, greater_key, user_client, working_dag
assert lower_index < greater_index


@pytest.mark.parametrize("is_enabled, should_have_pixel", [(False, False), (True, True)])
def test_analytics_pixel(user_client, is_enabled, should_have_pixel):
    """
    Test that the home page contains the analytics pixel only when the feature is enabled
    """
    with mock.patch("airflow.settings.is_usage_data_collection_enabled", return_value=is_enabled):
        resp = user_client.get("home", follow_redirects=True)

    # Pick the presence or absence check according to the expected outcome.
    check = check_content_in_response if should_have_pixel else check_content_not_in_response
    check("apacheairflow.gateway.scarf.sh", resp)


@pytest.mark.parametrize(
"url, filter_tags_cookie_val, filter_lastrun_cookie_val, expected_filter_tags, expected_filter_lastrun",
[
Expand Down

0 comments on commit 2860f81

Please sign in to comment.