diff --git a/README.md b/README.md
index 55ae1f2..738ff25 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,9 @@
-# scikit-learn analysis
+# Getting started
+
+Follow these steps to run the project on your local machine:
+
+1. Install the MyST Markdown Command Line Interface (CLI). Installation instructions: https://mystmd.org/guide/installing
+
+2. Open a new terminal in the "sklearn-survey-report-2024" directory.
+
+3. Run the command "myst start".
diff --git a/images/chart1.png b/images/chart1.png
new file mode 100644
index 0000000..129ed99
Binary files /dev/null and b/images/chart1.png differ
diff --git a/images/chart10.png b/images/chart10.png
index cd8863f..0a2ffe7 100644
Binary files a/images/chart10.png and b/images/chart10.png differ
diff --git a/images/chart12.png b/images/chart12.png
deleted file mode 100644
index 6613155..0000000
Binary files a/images/chart12.png and /dev/null differ
diff --git a/images/chart15.png b/images/chart15.png
deleted file mode 100644
index 4d884a0..0000000
Binary files a/images/chart15.png and /dev/null differ
diff --git a/images/chart16.png b/images/chart16.png
deleted file mode 100644
index c12676b..0000000
Binary files a/images/chart16.png and /dev/null differ
diff --git a/images/chart17.png b/images/chart17.png
deleted file mode 100644
index 015ddc6..0000000
Binary files a/images/chart17.png and /dev/null differ
diff --git a/images/chart18.png b/images/chart18.png
deleted file mode 100644
index 51516f4..0000000
Binary files a/images/chart18.png and /dev/null differ
diff --git a/images/chart2-2.png b/images/chart2-2.png
deleted file mode 100644
index 1b323ca..0000000
Binary files a/images/chart2-2.png and /dev/null differ
diff --git a/images/chart2.png b/images/chart2.png
index f8e30ed..3759528 100644
Binary files a/images/chart2.png and b/images/chart2.png differ
diff --git a/images/chart20.png b/images/chart20.png
deleted file mode 100644
index 8c83054..0000000
Binary files a/images/chart20.png and /dev/null differ
diff --git a/images/chart3.png b/images/chart3.png
new file mode 100644
index 0000000..146fdaf
Binary files /dev/null and b/images/chart3.png differ
diff --git a/images/chart4.png b/images/chart4.png
new file mode 100644
index 0000000..d037f4e
Binary files /dev/null and b/images/chart4.png differ
diff --git a/images/chart5-2.png b/images/chart5-2.png
deleted file mode 100644
index 9a0eb8e..0000000
Binary files a/images/chart5-2.png and /dev/null differ
diff --git a/images/chart5.png b/images/chart5.png
index ba75871..f2b010c 100644
Binary files a/images/chart5.png and b/images/chart5.png differ
diff --git a/images/chart6.png b/images/chart6.png
index b2711cf..ed44298 100644
Binary files a/images/chart6.png and b/images/chart6.png differ
diff --git a/images/chart7.png b/images/chart7.png
index 3254c59..856b13b 100644
Binary files a/images/chart7.png and b/images/chart7.png differ
diff --git a/images/chart8.png b/images/chart8.png
new file mode 100644
index 0000000..8b4af2a
Binary files /dev/null and b/images/chart8.png differ
diff --git a/images/chart9.png b/images/chart9.png
index be1053c..9bb7ad7 100644
Binary files a/images/chart9.png and b/images/chart9.png differ
diff --git a/scikit-learn-analysis.ipynb b/scikit-learn-analysis.ipynb
index e0f64c0..c2a7444 100644
--- a/scikit-learn-analysis.ipynb
+++ b/scikit-learn-analysis.ipynb
@@ -18,20 +18,24 @@ "cell_type": "markdown", "metadata": {}, "source": [
- "```{dropdown} Libraries and data\n",
+ "# Libraries and Data\n",
+ "```{tip} Show code\n",
+ ":class: dropdown \n",
"\n",
"```python\n",
- "\n",
- "import pandas as pd\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
+ "\n",
+ "import pandas as pd\n",
+ "import matplotlib.colors as mcolors\n",
"import numpy as np\n",
+ "import altair as alt\n",
"\n",
"url = 'https://raw.githubusercontent.com/Auslum/scikit_learn_survey/refs/heads/main/scikit-learn-survey-master-dataset.csv'\n",
"\n",
"df = pd.read_csv(url)\n",
"\n",
- "#print(df.head())"
+ "#print(df.head()) "
] },
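Before the chart cells, a note on the color logic they all share: the new code samples a matplotlib colormap to get one hex color per priority level. A minimal, self-contained sketch of that step, with the endpoint colors hard-coded as in the diff and the 7 levels mirroring the survey's 1-7 scale:

```python
import numpy as np
import matplotlib.colors as mcolors

# Interpolate between the two scikit-learn brand colors; sampling the
# colormap at 7 evenly spaced points yields one hex color per priority level
scikit_learn_colors = ["#0072B2", "#FF9900"]
cmap = mcolors.LinearSegmentedColormap.from_list("ScikitLearn", scikit_learn_colors)
priority_colors = [mcolors.to_hex(c) for c in cmap(np.linspace(0, 1, 7))]
print(priority_colors)  # 7 hex codes, from '#0072b2' to '#ff9900'
```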
code\n", + ":class: dropdown \n", "\n", "```python\n", - "\n", - "import pandas as pd\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", + "\n", + "import pandas as pd\n", + "import matplotlib.colors as mcolors\n", "import numpy as np\n", + "import altair as alt\n", "\n", "url = 'https://raw.githubusercontent.com/Auslum/scikit_learn_survey/refs/heads/main/scikit-learn-survey-master-dataset.csv'\n", "\n", "df = pd.read_csv(url)\n", "\n", - "#print(df.head())" + "#print(df.head()) " ] }, { @@ -40,19 +44,16 @@ "source": [ "# Project Future Direction and Priorities\n", "\n", - "```{dropdown} Show code\n", + "In the survey, new features, performance, and technical documentation have the highest priority for the users. While website redesign and other have the lowest.\n", "\n", - "```python\n", - "# Scikit-learn logo colors (blue and orange)\n", - "scikit_learn_colors = [\"#0072B2\", \"#FF9900\"]\n", + "```{tip} Show code\n", + ":class: dropdown \n", "\n", - "# Generate a color interpolation for the priority levels\n", - "priority_colors = [\n", - " mcolors.to_hex(c)\n", - " for c in mcolors.LinearSegmentedColormap.from_list(\"ScikitLearn\", scikit_learn_colors)(np.linspace(0, 1, len(priority_levels)))\n", - "]\n", + "```python\n", + "# Define priority levels from 1 to 7\n", + "priority_levels = list(range(1, 8))\n", "\n", - "# Filter the columns related to the question\n", + "# Filter columns related to the question\n", "priority_columns = [col for col in df.columns if \"PROJECT FUTURE DIRECTION AND PRIORITIES\" in col]\n", "priority_data = df[priority_columns].dropna()\n", "\n", @@ -64,89 +65,56 @@ "]\n", "priority_data.columns = renamed_columns\n", "\n", - "# Prepare data for a stacked bar chart\n", - "stacked_bar_data = pd.DataFrame({\n", - " category: priority_data[category].value_counts().sort_index()\n", - " for category in renamed_columns\n", - "}).fillna(0).astype(int).T\n", - "\n", - "priority_levels = [int(level) for level in stacked_bar_data.columns]\n", - "categories = stacked_bar_data.index\n", - "\n", - "# Create the stacked bar chart\n", - "plt.figure(figsize=(12, 6))\n", - "bottoms = np.zeros(len(categories))\n", + "# Convert the data to a long format for Altair\n", + "priority_data_melted = priority_data.melt(var_name=\"Category\", value_name=\"Priority\")\n", "\n", - "for level, color in zip(priority_levels, priority_colors):\n", - " plt.bar(categories, stacked_bar_data[level], bottom=bottoms, label=f'Priority {level}', color=color)\n", - " bottoms += stacked_bar_data[level]\n", - "\n", - "# Customize the chart\n", - "plt.title(\"PROJECT FUTURE DIRECTION AND PRIORITIES\", fontsize=14, fontweight='bold')\n", - "plt.xlabel(\"Categories\")\n", - "plt.ylabel(\"Frequency\")\n", - "plt.xticks(rotation=45, ha='right')\n", - "\n", - "# Customize the legend with explanations\n", - "priority_labels = [\n", - " \"1 (Lowest Priority)\", \"2\", \"3\", \"4\", \"5\", \"6\", \"7 (Highest Priority)\"\n", + "# Create interpolated colors from blue to orange\n", + "scikit_learn_colors = [\"#0072B2\", \"#FF9900\"]\n", + "priority_colors = [\n", + " mcolors.to_hex(c)\n", + " for c in mcolors.LinearSegmentedColormap.from_list(\"ScikitLearn\", scikit_learn_colors)(\n", + " np.linspace(0, 1, len(priority_levels))\n", + " )\n", "]\n", - "plt.legend(\n", - " labels=priority_labels,\n", - " title=\"Level of Priority\", bbox_to_anchor=(1.05, 1), loc=\"upper left\", fontsize=10\n", + "\n", + "# Create custom labels for the legend\n", + "priority_labels = {1: \"1 (Lowest priority)\", 2: 
\"2\", 3: \"3\", 4: \"4\", 5: \"5\", 6: \"6\", 7: \"7 (Highest priority)\"}\n", + "priority_data_melted['Priority Label'] = priority_data_melted['Priority'].map(priority_labels)\n", + "\n", + "# Create the stacked bar chart with Altair\n", + "chart = alt.Chart(priority_data_melted).mark_bar().encode(\n", + " x=alt.X('Category:N', title='Categories', sort=renamed_columns),\n", + " y=alt.Y(\n", + " 'count()',\n", + " title='Frequency',\n", + " scale=alt.Scale(domain=[0, 350])\n", + " ),\n", + " color=alt.Color(\n", + " 'Priority Label:N',\n", + " scale=alt.Scale(\n", + " domain=list(priority_labels.values()),\n", + " range=priority_colors\n", + " ),\n", + " title='Priority Level'\n", + " ),\n", + " order=alt.Order('Priority:Q', sort='ascending')\n", + ").properties(\n", + " title=\"Project Future Direction And Priorities\",\n", + " width=600,\n", + " height=400\n", + ").configure_axis(\n", + " labelAngle=45\n", ")\n", "\n", - "# Adjust and display the chart\n", - "plt.tight_layout()\n", - "plt.show()\n", - "\n", - "# Spider chart\n", - "# Prepare data for calculation of weighted averages\n", - "stacked_bar_data = pd.DataFrame({\n", - " category: priority_data[category].value_counts().sort_index()\n", - " for category in renamed_columns\n", - "}).fillna(0).astype(int).T\n", - "\n", - "# Calculate weighted averages for each category\n", - "priority_levels = np.array([1, 2, 3, 4, 5, 6, 7, 8])\n", - "weighted_scores = (stacked_bar_data * priority_levels).sum(axis=1) / stacked_bar_data.sum(axis=1)\n", - "\n", - "# Verify new data\n", - "# print(\"Weighted averages per category:\\n\", weighted_scores)\n", - "\n", - "# Prepare the spider chart\n", - "labels = weighted_scores.index\n", - "values = weighted_scores.values\n", - "num_vars = len(labels)\n", - "\n", - "# Ensure that the graph is closed (the first value is repeated at the end)\n", - "angles = np.linspace(0, 2 * np.pi, num_vars, endpoint=False).tolist()\n", - "values = np.concatenate((values, [values[0]]))\n", - "angles += angles[:1]\n", - "\n", - "# Create the spider chart\n", - "fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(polar=True))\n", - "ax.fill(angles, values, color='blue', alpha=0.25)\n", - "ax.plot(angles, values, color='blue', linewidth=2)\n", - "\n", - "# Adjust the tags\n", - "ax.set_yticks([])\n", - "ax.set_xticks(angles[:-1])\n", - "ax.set_xticklabels(labels, fontsize=10)\n", - "ax.set_title(\"PROJECT FUTURE DIRECTION AND PRIORITIES\", fontsize=14, fontweight='bold', pad=20)\n", - "\n", - "# Show the chart\n", - "plt.tight_layout()\n", - "plt.show()" + "# Display the chart\n", + "chart.show()\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "![Project Future Direction and Priorities](images/chart2.png)\n", - "***\n", - "![Project Future Direction and Priorities](images/chart2-2.png)\n", + "![Project Future Direction and Priorities](images/chart1.png)\n", "***" ] }, @@ -156,79 +124,78 @@ "source": [ "# ML Tasks: Priority Levels\n", "\n", - "```{dropdown} Show code\n", + "Talking about Machine Learning tasks, classification and regression have the highest priorities, while other, outlier/anomaly detection, and clustering have the lowest ones.\n", "\n", - "```python\n", - "\n", - "# Scikit-learn logo colors (blue and orange)\n", - "scikit_learn_colors = [\"#0072B2\", \"#FF9900\"]\n", + "```{tip} Show code\n", + ":class: dropdown \n", "\n", - "# Generate a color interpolation for the priority levels\n", - "priority_colors = [\n", - " mcolors.to_hex(c)\n", - " for c in 
- " for c in mcolors.LinearSegmentedColormap.from_list(\"ScikitLearn\", scikit_learn_colors)(np.linspace(0, 1, len(priority_levels)))\n",
- "]\n",
+ "```python\n",
"\n",
- "# Identify the columns related to the question about priorities for ML tasks\n",
- "ml_task_columns = [\n",
- " col for col in df.columns\n",
- " if \"Please order the following ML tasks in order of priority to you\" in col\n",
- "]\n",
+ "# Define priority levels from 1 to 7\n",
+ "priority_levels = list(range(1, 8))\n",
"\n",
- "# Filter relevant data\n",
- "ml_task_data = df[ml_task_columns].dropna()\n",
+ "# Filter columns related to the question\n",
+ "priority_columns = [col for col in df.columns if \"Please order the following ML tasks in order of priority to you\" in col]\n",
+ "priority_data = df[priority_columns].dropna()\n",
"\n",
- "# Rename columns\n",
- "renamed_ml_task_columns = [\n",
+ "# Rename the categories\n",
+ "renamed_columns = [\n",
" \"Regression\", \"Classification\", \"Forecasting\",\n",
" \"Outlier/anomaly detection\", \"Dimensionality reduction\",\n",
" \"Clustering\", \"Other\"\n",
"]\n",
- "ml_task_data.columns = renamed_ml_task_columns\n",
- "\n",
- "# Stacked bar chart\n",
- "# Reindex with available priority levels (1 to 7)\n",
- "priority_levels = [1, 2, 3, 4, 5, 6, 7]\n",
- "\n",
- "# Prepare data for the stacked bar chart\n",
- "stacked_bar_data = pd.DataFrame({\n",
- " category: ml_task_data[category].value_counts().reindex(priority_levels, fill_value=0)\n",
- " for category in ml_task_data.columns\n",
- "}).T\n",
- "\n",
- "# Create the stacked bar chart\n",
- "plt.figure(figsize=(12, 6))\n",
- "bottoms = np.zeros(len(stacked_bar_data))\n",
- "\n",
- "# Loop through priority levels and apply custom colors\n",
- "for level, color in zip(priority_levels, priority_colors):\n",
- " plt.bar(stacked_bar_data.index, stacked_bar_data[level], bottom=bottoms, label=f'Priority {level}', color=color)\n",
- " bottoms += stacked_bar_data[level]\n",
- "\n",
- "# Customize the chart\n",
- "plt.title(\"ML Tasks: Priority Levels\", fontsize=14, fontweight='bold')\n",
- "plt.xlabel(\"Categories\")\n",
- "plt.ylabel(\"Number of Responses\")\n",
- "\n",
- "priority_labels = [\n",
- " \"1 (Lowest Priority)\", \"2\", \"3\", \"4\", \"5\", \"6\", \"7 (Highest Priority)\"\n",
+ "priority_data.columns = renamed_columns\n",
+ "\n",
+ "# Convert the data to a long format for Altair\n",
+ "priority_data_melted = priority_data.melt(var_name=\"Category\", value_name=\"Priority\")\n",
+ "\n",
+ "# Create interpolated colors from blue to orange\n",
+ "scikit_learn_colors = [\"#0072B2\", \"#FF9900\"]\n",
+ "priority_colors = [\n",
+ " mcolors.to_hex(c)\n",
+ " for c in mcolors.LinearSegmentedColormap.from_list(\"ScikitLearn\", scikit_learn_colors)(\n",
+ " np.linspace(0, 1, len(priority_levels))\n",
+ " )\n",
"]\n",
- "plt.legend(\n",
- " labels=priority_labels,\n",
- " title=\"Priority Level\", bbox_to_anchor=(1.05, 1), loc=\"upper left\", fontsize=10\n",
+ "\n",
+ "# Create custom labels for the legend\n",
+ "priority_labels = {1: \"1 (Lowest priority)\", 2: \"2\", 3: \"3\", 4: \"4\", 5: \"5\", 6: \"6\", 7: \"7 (Highest priority)\"}\n",
+ "priority_data_melted['Priority Label'] = priority_data_melted['Priority'].map(priority_labels)\n",
+ "\n",
+ "# Create the stacked bar chart with Altair\n",
+ "chart = alt.Chart(priority_data_melted).mark_bar().encode(\n",
+ " x=alt.X('Category:N', title='Categories', sort=renamed_columns),\n",
+ " y=alt.Y(\n",
+ " 'count()',\n",
+ " title='Frequency',\n",
+ " scale=alt.Scale(domain=[0, 350])\n",
+ " ),\n",
+ " color=alt.Color(\n",
+ " 'Priority Label:N',\n",
+ " scale=alt.Scale(\n",
+ " domain=list(priority_labels.values()),\n",
+ " range=priority_colors\n",
+ " ),\n",
+ " title='Priority Level'\n",
+ " ),\n",
+ " order=alt.Order('Priority:Q', sort='ascending')\n",
+ ").properties(\n",
+ " title=\"ML Tasks: Priority Levels\",\n",
+ " width=600,\n",
+ " height=400\n",
+ ").configure_axis(\n",
+ " labelAngle=45\n",
")\n",
"\n",
- "# Rotate category labels and adjust layout\n",
- "plt.xticks(rotation=45, ha='right')\n",
- "plt.tight_layout()\n",
- "plt.show()"
+ "# Display the chart\n",
+ "chart.show()"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [
- "![ML Tasks: Priority Levels](images/chart5.png)\n",
+ "![ML Tasks: Priority Levels](images/chart2.png)\n",
"***"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [
"# Visualizations used to evaluate models\n",
"\n",
- "```{dropdown} Show code\n",
+ "The confusion matrix is the visualization respondents use most to evaluate models; feature importance and ROC curves are also commonly used. Residual plots and reliability diagrams are the least used.\n",
+ "\n",
+ "```{tip} Show code\n",
+ ":class: dropdown \n",
"\n",
"```python\n",
"\n",
- "def plot_response_counts2(df, column_name, mapping_dict, title, color, y_ax_name):\n",
- " \"\"\"\n",
- " General function to plot response counts for survey questions with multiple answers.\n",
- "\n",
- " Parameters:\n",
- " df (DataFrame): The survey data.\n",
- " column_name (str): The column name containing the responses.\n",
- " mapping_dict (dict): Dictionary mapping responses in various languages to English categories.\n",
- " title (str): The title for the graph.\n",
- " color (str): The color of the bars.\n",
- " y_ax_name (str): Label for the y-axis.\n",
- " \"\"\"\n",
- " # Function to normalize responses using the mapping dictionary\n",
- " def normalize_responses(response):\n",
- " if isinstance(response, str):\n",
- " # Split the responses by comma, strip extra spaces, and map to English categories\n",
- " response_split = [r.strip() for r in response.split(',')]\n",
- " normalized = [mapping_dict.get(r, None) for r in response_split]\n",
- " # Filter out None values (unmapped responses)\n",
- " return [r for r in normalized if r is not None]\n",
- " return []\n",
- "\n",
- " # Apply the normalization to the responses\n",
- " df['Normalized_Responses'] = df[column_name].apply(normalize_responses)\n",
- "\n",
- " # Flatten the normalized response lists\n",
- " all_responses = [item for sublist in df['Normalized_Responses'].dropna() for item in sublist]\n",
- "\n",
- " # Count the answers and show the number of times each option is chosen\n",
- " response_counts = pd.Series(all_responses).value_counts()\n",
- "\n",
- " # Sort the responses from largest to smallest\n",
- " response_counts = response_counts.sort_values(ascending=True)\n",
- "\n",
- " # Create the horizontal bar chart\n",
- " plt.figure(figsize=(14, 6))\n",
- " ax = response_counts.plot(kind='barh', color=color)\n",
- "\n",
- " # Add data tags\n",
- " for index, value in enumerate(response_counts):\n",
- " ax.text(value + 2, index, str(value), va='center', ha='left', fontsize=12, fontweight='regular')\n",
- "\n",
- " # Title and labels\n",
- " plt.title(title, fontsize=16, fontweight='bold')\n",
- " plt.xlabel('Number of Responses', fontsize=12)\n",
- " plt.ylabel(y_ax_name, fontsize=12)\n",
- "\n",
- " # Adjust and show the chart\n",
- " plt.tight_layout()\n",
- " plt.show()\n",
- "\n",
- " mapping_dict = {\n",
+ "# Mapping dictionary\n",
+ "mapping_dict = {\n",
" # Confusion matrix responses\n",
" \"Confusion matrix\": \"Confusion matrix\",\n",
" \"Matriz de confusão\": \"Confusion matrix\",\n",
@@ -351,21 +271,39 @@ " \"أخرى\": \"Other\"\n",
"}\n",
"\n",
- "plot_response_counts2(\n",
- " df=df,\n",
- " column_name='What visualizations do you use to evaluate your models? Select all that apply.',\n",
- " mapping_dict=mapping_dict,\n",
+ "# Function to normalize responses\n",
+ "def normalize_responses(response):\n",
+ " if isinstance(response, str):\n",
+ " response_split = [r.strip() for r in response.split(',')]\n",
+ " normalized = [mapping_dict.get(r, None) for r in response_split]\n",
+ " return [r for r in normalized if r is not None]\n",
+ " return []\n",
+ "\n",
+ "# Apply normalization and count responses\n",
+ "df['Normalized_Responses'] = df['What visualizations do you use to evaluate your models? Select all that apply.'].apply(normalize_responses)\n",
+ "all_responses = [item for sublist in df['Normalized_Responses'].dropna() for item in sublist]\n",
+ "response_counts = pd.Series(all_responses).value_counts().reset_index()\n",
+ "response_counts.columns = ['Visualization', 'Count']\n",
+ "\n",
+ "# Chart using Altair with orange color\n",
+ "chart = alt.Chart(response_counts).mark_bar(color='#F7931E').encode(\n",
+ " x='Count:Q',\n",
+ " y=alt.Y('Visualization:N', sort='-x'),\n",
+ " tooltip=['Visualization', 'Count']\n",
+ ").properties(\n",
" title='Visualizations used to evaluate models',\n",
- " color='green',\n",
- " y_ax_name='Visualizations'\n",
- ")"
+ " width=500,\n",
+ " height=300\n",
+ ")\n",
+ "\n",
+ "chart.show()"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [
- "![Visualizations used to evaluate models](images/chart6.png)\n",
+ "![Visualizations used to evaluate models](images/chart3.png)\n",
"***"
] },
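The cell above introduces the normalize-and-count pattern that the rest of the notebook reuses: each multi-select answer is stored as one comma-separated string, in whatever language the respondent chose, and the mapping dict collapses the fragments onto English labels. A self-contained sketch with a deliberately tiny two-entry mapping (the real dict in the diff is far larger):

```python
# Two-entry mapping for illustration only
mapping_dict = {
    "Confusion matrix": "Confusion matrix",
    "Matriz de confusão": "Confusion matrix",
}

def normalize_responses(response):
    # Multi-select answers arrive as one comma-separated string
    if isinstance(response, str):
        response_split = [r.strip() for r in response.split(',')]
        normalized = [mapping_dict.get(r) for r in response_split]
        return [r for r in normalized if r is not None]
    return []

print(normalize_responses("Confusion matrix, Matriz de confusão, N/A"))
# ['Confusion matrix', 'Confusion matrix']  (unmapped fragments are dropped)
```

One caveat worth keeping in mind with this approach: splitting on ',' would break any survey option whose label itself contains a comma.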
mapping_dict = {\n", + "# Mapping dictionary\n", + "mapping_dict = {\n", " # Confusion matrix responses\n", " \"Confusion matrix\": \"Confusion matrix\",\n", " \"Matriz de confusão\": \"Confusion matrix\",\n", @@ -351,21 +271,39 @@ " \"أخرى\": \"Other\"\n", "}\n", "\n", - "plot_response_counts2(\n", - " df=df,\n", - " column_name='What visualizations do you use to evaluate your models? Select all that apply.',\n", - " mapping_dict=mapping_dict,\n", + "# Function to normalize responses\n", + "def normalize_responses(response):\n", + " if isinstance(response, str):\n", + " response_split = [r.strip() for r in response.split(',')]\n", + " normalized = [mapping_dict.get(r, None) for r in response_split]\n", + " return [r for r in normalized if r is not None]\n", + " return []\n", + "\n", + "# Apply normalization and count responses\n", + "df['Normalized_Responses'] = df['What visualizations do you use to evaluate your models? Select all that apply.'].apply(normalize_responses)\n", + "all_responses = [item for sublist in df['Normalized_Responses'].dropna() for item in sublist]\n", + "response_counts = pd.Series(all_responses).value_counts().reset_index()\n", + "response_counts.columns = ['Visualization', 'Count']\n", + "\n", + "# Chart using Altair with orange color\n", + "chart = alt.Chart(response_counts).mark_bar(color='#F7931E').encode(\n", + " x='Count:Q',\n", + " y=alt.Y('Visualization:N', sort='-x'),\n", + " tooltip=['Visualization', 'Count']\n", + ").properties(\n", " title='Visualizations used to evaluate models',\n", - " color='green',\n", - " y_ax_name='Visualizations'\n", - ")" + " width=500,\n", + " height=300\n", + ")\n", + "\n", + "chart.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "![Visualizations used to evaluate models](images/chart6.png)\n", + "![Visualizations used to evaluate models](images/chart3.png)\n", "***" ] }, @@ -375,10 +313,14 @@ "source": [ "# Dataframe libraries used\n", "\n", - "```{dropdown} Show code\n", + "Pandas is the most used library for the majority of respondents. Other libraries are not so popular compared to pandas, being Modin the least used.\n", + "\n", + "```{tip} Show code\n", + ":class: dropdown \n", "\n", "```python\n", "\n", + "# Mapping dictionary\n", "mapping_dict = {\n", " # cudf responses\n", " \"cudf\": \"cudf\",\n", @@ -413,21 +355,39 @@ " \"أخرى\": \"Other\"\n", "}\n", "\n", - "plot_response_counts2(\n", - " df=df,\n", - " column_name='Which DataFrame libraries do you use? Select all that apply.',\n", - " mapping_dict=mapping_dict,\n", + "# Function to normalize responses\n", + "def normalize_responses(response):\n", + " if isinstance(response, str):\n", + " response_split = [r.strip() for r in response.split(',')]\n", + " normalized = [mapping_dict.get(r, None) for r in response_split]\n", + " return [r for r in normalized if r is not None]\n", + " return []\n", + "\n", + "# Apply normalization and count responses\n", + "df['Normalized_Responses'] = df['Which DataFrame libraries do you use? 
{ "cell_type": "markdown", "metadata": {}, "source": [
"# Machine Learning libraries used\n",
"\n",
- "```{dropdown} Show code\n",
+ "For machine learning, XGBoost and PyTorch are the most used libraries, while JAX is the least used.\n",
+ "\n",
+ "```{tip} Show code\n",
+ ":class: dropdown \n",
"\n",
"```python\n",
"\n",
+ "# Mapping dictionary\n",
"mapping_dict = {\n",
" # CatBoost responses\n",
" \"CatBoost\": \"CatBoost\",\n",
@@ -472,21 +436,39 @@ " \"أخرى\": \"Other\"\n",
"}\n",
"\n",
- "plot_response_counts2(\n",
- " df=df,\n",
- " column_name='Which other Machine Learning libraries do you use? Select all that apply.',\n",
- " mapping_dict=mapping_dict,\n",
+ "# Function to normalize responses\n",
+ "def normalize_responses(response):\n",
+ " if isinstance(response, str):\n",
+ " response_split = [r.strip() for r in response.split(',')]\n",
+ " normalized = [mapping_dict.get(r, None) for r in response_split]\n",
+ " return [r for r in normalized if r is not None]\n",
+ " return []\n",
+ "\n",
+ "# Apply normalization and count responses\n",
+ "df['Normalized_Responses'] = df['Which other Machine Learning libraries do you use? Select all that apply.'].apply(normalize_responses)\n",
+ "all_responses = [item for sublist in df['Normalized_Responses'].dropna() for item in sublist]\n",
+ "response_counts = pd.Series(all_responses).value_counts().reset_index()\n",
+ "response_counts.columns = ['Libraries', 'Count']\n",
+ "\n",
+ "# Chart using Altair with orange color\n",
+ "chart = alt.Chart(response_counts).mark_bar(color='#F7931E').encode(\n",
+ " x='Count:Q',\n",
+ " y=alt.Y('Libraries:N', sort='-x'),\n",
+ " tooltip=['Libraries', 'Count']\n",
+ ").properties(\n",
" title='Machine Learning libraries used',\n",
- " color='orange',\n",
- " y_ax_name='Libraries'\n",
- ")"
+ " width=500,\n",
+ " height=300\n",
+ ")\n",
+ "\n",
+ "chart.show()"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [
- "![Machine Learning libraries used](images/chart9.png)\n",
+ "![Machine Learning libraries used](images/chart5.png)\n",
"***"
] },
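All of these horizontal bar charts use the same Altair encoding; the one non-obvious piece is `sort='-x'`, which orders the categories by descending count. A minimal standalone version under invented numbers:

```python
import altair as alt
import pandas as pd

data = pd.DataFrame({"Libraries": ["XGBoost", "PyTorch", "JAX"],
                     "Count": [60, 55, 10]})  # invented counts

chart = alt.Chart(data).mark_bar(color='#F7931E').encode(
    x='Count:Q',
    y=alt.Y('Libraries:N', sort='-x'),  # longest bar on top
    tooltip=['Libraries', 'Count'],
)
# In a notebook the chart renders by evaluating `chart`; chart.show()
# may need extra setup (e.g. altair_viewer on older Altair versions).
```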
{ "cell_type": "markdown", "metadata": {}, "source": [
"# Estimators regularly used\n",
"\n",
- "```{dropdown} Show code\n",
+ "RandomForestClassifier and RandomForestRegressor are the most used estimators; estimators not listed among the survey options (\"Other\"), HistGradientBoostingRegressor, and HistGradientBoostingClassifier are the least used.\n",
+ "\n",
+ "```{tip} Show code\n",
+ ":class: dropdown \n",
"\n",
"```python\n",
+ "# Mapping dictionary\n",
"mapping_dict = {\n",
- " # LogisticRegression responses\n",
+ " # LogisticRegression responses\n",
" \"LogisticRegression\": \"LogisticRegression\",\n",
" \"RandomForestClassifier أو RandomForestRegressorLogisticRegression الانحدار اللوجستي\": \"LogisticRegression\",\n",
" # RandomForestClassifier or RandomForestRegressor responses\n",
@@ -530,141 +516,39 @@ " \"أخرى\": \"Other\"\n",
"}\n",
"\n",
- "plot_response_counts2(\n",
- " df=df,\n",
- " column_name='Which estimators do you regularly use? Select all that apply.',\n",
- " mapping_dict=mapping_dict,\n",
+ "# Function to normalize responses\n",
+ "def normalize_responses(response):\n",
+ " if isinstance(response, str):\n",
+ " response_split = [r.strip() for r in response.split(',')]\n",
+ " normalized = [mapping_dict.get(r, None) for r in response_split]\n",
+ " return [r for r in normalized if r is not None]\n",
+ " return []\n",
+ "\n",
+ "# Apply normalization and count responses\n",
+ "df['Normalized_Responses'] = df['Which estimators do you regularly use? Select all that apply.'].apply(normalize_responses)\n",
+ "all_responses = [item for sublist in df['Normalized_Responses'].dropna() for item in sublist]\n",
+ "response_counts = pd.Series(all_responses).value_counts().reset_index()\n",
+ "response_counts.columns = ['Estimators', 'Count']\n",
+ "\n",
+ "# Chart using Altair with orange color\n",
+ "chart = alt.Chart(response_counts).mark_bar(color='#F7931E').encode(\n",
+ " x='Count:Q',\n",
+ " y=alt.Y('Estimators:N', sort='-x'),\n",
+ " tooltip=['Estimators', 'Count']\n",
+ ").properties(\n",
" title='Estimators Regularly Used',\n",
- " color='purple',\n",
- " y_ax_name='Estimators'\n",
- ")"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [
"![Estimators regularly used](images/chart10.png)\n",
"***"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [
"# Ever written an estimator\n",
"\n",
"```{dropdown} Show code\n",
"\n",
"```python"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [
"(image here)"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [
"# Important ML features for use case\n",
"\n",
"```{dropdown} Show code\n",
"\n",
"```python\n",
"\n",
"# Define the relevant question\n",
"question_column = \"What ML features are important for your use case? Select all that apply.\"\n",
"\n",
"# Check if the column exists in the dataset\n",
"if question_column in data.columns:\n",
" # Count occurrences of each response\n",
" response_counts = data[question_column].value_counts()\n",
"\n",
" # Sort responses in the desired order\n",
" response_order = [\n",
" \"Calibration of probabilistic classifiers\",\n",
" \"Calibration of regressors\",\n",
" \"Uncertainty estimates for prediction\",\n",
" \"Cost-sensitive learning\",\n",
" \"Feature importances\",\n",
" \"Sample weights\",\n",
" \"Metadata routing\",\n",
" \"Non-euclidean metrics\"\n",
" ]\n",
" response_counts = response_counts.reindex(response_order, fill_value=0)\n",
"\n",
" # Plot the bar graph\n",
" plt.figure(figsize=(12, 8))\n",
" response_counts.plot(kind='bar', color='purple')\n",
"\n",
" # Add titles and labels\n",
" plt.title(\"Responses to the Importance of Open Source ML/AI Frameworks\", fontsize=14, fontweight=\"bold\")\n",
" plt.xlabel(\"Response\", fontsize=12)\n",
" plt.ylabel(\"Number of Respondents\", fontsize=12)\n",
" plt.xticks(rotation=45, ha='right')\n",
"\n",
" # Annotate the bars\n",
" for i, count in enumerate(response_counts):\n",
" plt.text(i, count + 0.5, str(count), ha='center', fontsize=10)\n",
"\n",
" # Show the plot\n",
" plt.tight_layout()\n",
" plt.show()\n",
"else:\n",
" print(f\"Column '{question_column}' not found in the dataset.\")"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [
"(image here)"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [
"# Additional information to pass to an estimator\n",
"\n",
"```{dropdown} Show code\n",
"\n",
"```python\n",
"\n",
"# Extract responses to the question\n",
"question_column = \"Is there additional information you want to pass to an estimator that is not X and Y?\"\n",
"if question_column in data.columns:\n",
" response_counts = data[question_column].value_counts()\n",
"\n",
" # Plot the bar graph\n",
" plt.figure(figsize=(8, 6))\n",
" response_counts.plot(kind='bar', color=['teal', 'orange'], )\n",
"\n", - " # Add titles and labels\n", - " plt.title(\"Responses to Additional Information for Estimator (X and Y)\", fontsize=14, fontweight='bold')\n", - " plt.xlabel(\"Response\", fontsize=12)\n", - " plt.ylabel(\"Count\", fontsize=12)\n", - "\n", - " # Annotate bars\n", - " for i, count in enumerate(response_counts):\n", - " plt.text(i, count + 0.5, str(count), ha='center', fontsize=10)\n", - "\n", - " # Show the plot\n", - " plt.tight_layout()\n", - " plt.show()\n", - "else:\n", - " print(f\"Column '{question_column}' not found in the dataset.\")" + "chart.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "![Additional information to pass to an estimator](images/chart12.png)\n", + "![Estimators regularly used](images/chart6.png)\n", "***" ] }, @@ -672,67 +556,107 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# How critical would GPU capabilities within sk learn be\n", - "\n", - "```{dropdown} Show code\n", - "\n", - "```python\n", - "\n", - "#print(df.head())\n", - "\n", + "# Important ML features\n", "\n", - "# Identify the columns related to the question about priorities for Deployment\n", - "deployment_columns = [\n", - " col for col in df.columns\n", - " if \"Considering your current machine learning projects, how critical would GPU capabilities within scikit-learn be?\" in col\n", - "]\n", - "\n", - "# print(\"Identified columns for Deployment:\\n\",deploy_columns)\n", + "The respondents consider that the most important Machine Learning features for use case are feature importances, and uncertainity estimates for prediction, and they consider that the least important are metadata routing and non-euclidean metrics.\n", "\n", - "# Filter relevant data\n", - "deploy_data = df[deployment_columns].dropna()\n", + "```{tip} Show code\n", + ":class: dropdown \n", "\n", - "# Rename columns\n", - "renamed_deploy_columns = [\n", - " \"\"\n", - "]\n", - "deploy_data.columns = renamed_deploy_columns\n", - "\n", - "# Stacked bar chart\n", - "# Reindex with available priority levels (1 to 5)\n", - "priority_levels = [1, 2, 3, 4, 5]\n", - "\n", - "# Prepare data for the stacked bar chart\n", - "stacked_bar_data = pd.DataFrame({\n", - " category: deploy_data[category].value_counts().reindex(priority_levels, fill_value=0)\n", - " for category in deploy_data.columns\n", - "}).T\n", - "\n", - "# Print to verify processed data\n", - "#print(\"\\nPrepared data for the stacked bar chart:\")\n", - "#print(stacked_bar_data)\n", + "```python\n", + "# Mapping dictionary\n", + "mapping_dict = {\n", + " # Calibration of probabilistic classifiers responses\n", + " \"Calibration of probabilistic classifiers\": \"Calibration of probabilistic classifiers\",\n", + " \"Calibração de classificadores probabilísticos\": \"Calibration of probabilistic classifiers\",\n", + " \"Calibración de clasificadores probabilísticos\": \"Calibration of probabilistic classifiers\",\n", + " \"概率分类器(probabilistic classifiers)的校准\": \"Calibration of probabilistic classifiers\",\n", + " \"Calibration des classificateurs probabilistes\": \"Calibration of probabilistic classifiers\",\n", + " \"معايرة المصنفات الاحتمالية\": \"Calibration of probabilistic classifiers\",\n", + " # Calibration of regressors responses\n", + " \"Calibration of regressors\": \"Calibration of regressors\",\n", + " \"Calibração de regressores\": \"Calibration of regressors\",\n", + " \"Calibración de regresores\": \"Calibration of regressors\",\n", + " \"回归子(regressor)的校准\": \"Calibration of regressors\",\n", + " \"Calibration des 
régressions\": \"Calibration of regressors\",\n", + " \"معايرة نماذج الانحدار\": \"Calibration of regressors\",\n", + " # Uncertainty estimates for prediction responses\n", + " \"Uncertainty estimates for prediction\": \"Uncertainty estimates for prediction\",\n", + " \"Estimativas de incerteza para previsão\": \"Uncertainty estimates for prediction\",\n", + " \"Estimaciones de incertidumbre para la predicción\": \"Uncertainty estimates for prediction\",\n", + " \"对预测的不确定性估计\": \"Uncertainty estimates for prediction\",\n", + " \"Estimations d'incertitude pour les prédictions\": \"Uncertainty estimates for prediction\",\n", + " \"تقديرات عدم اليقين للتنبؤ\": \"Uncertainty estimates for prediction\",\n", + " # Cost-sensitive learning responses\n", + " \"Cost-sensitive learning\": \"Cost-sensitive learning\",\n", + " \"Aprendizado sensível a custo\": \"Cost-sensitive learning\",\n", + " \"Aprendizaje sensible al costo (cost-sensitive learning)\": \"Cost-sensitive learning\",\n", + " \"代价敏感学习\": \"Cost-sensitive learning\",\n", + " \"Apprentissage sensible aux coûts (Cost-sensitive learning)\": \"Cost-sensitive learning\",\n", + " \"التعلم الحساس للتكلفة\": \"Cost-sensitive learning\",\n", + " # Feature importances responses\n", + " \"Feature importances\": \"Feature importances\",\n", + " \"Importância das características\": \"Feature importances\",\n", + " \"Importancia de variables\": \"Feature importances\",\n", + " \"特征重要性\": \"Feature importances\",\n", + " \"Importance des caractéristiques (features)\": \"Feature importances\",\n", + " \"الأهمية النسبية للخواص\": \"Feature importances\",\n", + " # Sample weights responses\n", + " \"Sample weights\": \"Sample weights\",\n", + " \"Pesos de amostra\": \"Sample weights\",\n", + " \"Pesos de muestra (sample weights)\": \"Sample weights\",\n", + " \"样本权重\": \"Sample weights\",\n", + " \"Poids des échantillons(Sample Weights)\": \"Sample weights\",\n", + " \"أوزان العينات\": \"Sample weights\",\n", + " # Metadata routing responses\n", + " \"Metadata routing\": \"Metadata routing\",\n", + " \"Roteamento de metadados\": \"Metadata routing\",\n", + " \"Enrutamiento de metadatos\": \"Metadata routing\",\n", + " \"元数据路由(Metadata routing)\": \"Metadata routing\",\n", + " \"Routage des métadonnées\": \"Metadata routing\",\n", + " \"توجيه البيانات الوصفية\": \"Metadata routing\",\n", + " # Non-euclidean metrics responses\n", + " \"Non-euclidean metrics\": \"Non-euclidean metrics\",\n", + " \"Métricas não-euclidianas\": \"Non-euclidean metrics\",\n", + " \"Métricas no-euclidianas\": \"Non-euclidean metrics\",\n", + " \"非欧几里得度量(Non-Euclidean Metric)\": \"Non-euclidean metrics\",\n", + " \"Métriques non-euclidiennes\": \"Non-euclidean metrics\",\n", + " \"المقاييس غير الإقليدية\": \"Non-euclidean metrics\",\n", + "}\n", "\n", - "# Create the stacked bar chart\n", - "stacked_bar_data.plot(\n", - " kind=\"bar\", stacked=True, figsize=(12, 6), colormap=\"viridis\", edgecolor=\"none\"\n", + "# Function to normalize responses\n", + "def normalize_responses(response):\n", + " if isinstance(response, str):\n", + " response_split = [r.strip() for r in response.split(',')]\n", + " normalized = [mapping_dict.get(r, None) for r in response_split]\n", + " return [r for r in normalized if r is not None]\n", + " return []\n", + "\n", + "# Apply normalization and count responses\n", + "df['Normalized_Responses'] = df['What ML features are important for your use case? 
"all_responses = [item for sublist in df['Normalized_Responses'].dropna() for item in sublist]\n",
"response_counts = pd.Series(all_responses).value_counts().reset_index()\n",
"response_counts.columns = ['Features', 'Count']\n",
"\n",
"# Chart using Altair with orange color\n",
"chart = alt.Chart(response_counts).mark_bar(color='#F7931E').encode(\n",
" x='Count:Q',\n",
" y=alt.Y('Features:N', sort='-x'),\n",
" tooltip=['Features', 'Count']\n",
").properties(\n",
" title='Important ML features',\n",
" width=500,\n",
" height=300\n",
")\n",
"\n",
"chart.show()"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [
"![Important ML features](images/chart7.png)\n",
"***"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [
"# Tools used for model registry and experiment tracking\n",
"\n",
- "```{dropdown} Show code\n",
+ "The most popular tool is MLFlow; respondents do not use the other tools nearly as much, with Neptune the least popular among them.\n",
+ "\n",
+ "```{tip} Show code\n",
+ ":class: dropdown \n",
"\n",
"```python\n",
+ "# Mapping dictionary\n",
+ "mapping_dict = {\n",
+ " # DVC responses\n",
+ " \"DVC\": \"DVC\",\n",
+ " \"DVC دي في سي\": \"DVC\",\n",
+ " # Neptune responses\n",
+ " \"Neptune\": \"Neptune\",\n",
+ " \"Neptune نبتون\": \"Neptune\",\n",
+ " # MLFlow responses\n",
+ " \"MLFlow\": \"MLFlow\",\n",
+ " \"MLFlow ام ال فلو\": \"MLFlow\",\n",
+ " # Weight and biases responses\n",
+ " \"Weight and biases\": \"Weight and biases\",\n",
+ " \"Weights and Biases\": \"Weight and biases\",\n",
+ " \"Weight and biases الوزن و الانحيازات\": \"Weight and biases\",\n",
+ " # Custom tool responses\n",
+ " \"Custom tool\": \"Custom tool\",\n",
+ " \"Ferramenta personalizada\": \"Custom tool\",\n",
+ " \"Herramientas personalizadas\": \"Custom tool\",\n",
+ " \"自定义工具\": \"Custom tool\",\n",
+ " \"Outil personnalisé\": \"Custom tool\",\n",
+ " \"أداة مخصصة\": \"Custom tool\",\n",
+ " # Other responses\n",
+ " \"Other\": \"Other\",\n",
+ " \"Outro\": \"Other\",\n",
+ " \"Otro\": \"Other\",\n",
+ " \"其它\": \"Other\",\n",
+ " \"Autre\": \"Other\",\n",
+ " \"أخرى\": \"Other\"\n",
+ "}\n",
"\n",
- "# Define the relevant question\n",
- "question_column = \"For model registry and experiment tracking, do you use any of the following tools? Select all that apply.\"\n",
- "\n",
- "# Check if the column exists in the dataset\n",
- "if question_column in data.columns:\n",
- " # Count occurrences of each response\n",
- " response_counts = data[question_column].value_counts()\n",
- "\n",
- " # Sort responses in the desired order\n",
- " response_order = [\n",
- " \"DVC\",\n",
- " \"Neptune\",\n",
- " \"MlFlow\",\n",
- " \"Weight and biases\",\n",
- " \"Custom tool\",\n",
- " \"Other\"\n",
- " ]\n",
- " response_counts = response_counts.reindex(response_order, fill_value=0)\n",
- "\n",
- " # Plot the bar graph\n",
- " plt.figure(figsize=(12, 8))\n",
- " response_counts.plot(kind='bar', color='green')\n",
- "\n",
- " # Add titles and labels\n",
- " plt.title(\"Responses to the Importance of Open Source ML/AI Frameworks\", fontsize=14, fontweight=\"bold\")\n",
- " plt.xlabel(\"Response\", fontsize=12)\n",
- " plt.ylabel(\"Number of Respondents\", fontsize=12)\n",
- " plt.xticks(rotation=45, ha='right')\n",
- "\n",
- " # Annotate the bars\n",
- " for i, count in enumerate(response_counts):\n",
- " plt.text(i, count + 0.5, str(count), ha='center', fontsize=10)\n",
- "\n",
- " # Show the plot\n",
- " plt.tight_layout()\n",
- " plt.show()\n",
- "else:\n",
- " print(f\"Column '{question_column}' not found in the dataset.\")"
+ "# Function to normalize responses\n",
+ "def normalize_responses(response):\n",
+ " if isinstance(response, str):\n",
+ " response_split = [r.strip() for r in response.split(',')]\n",
+ " normalized = [mapping_dict.get(r, None) for r in response_split]\n",
+ " return [r for r in normalized if r is not None]\n",
+ " return []\n",
+ "\n",
+ "# Apply normalization and count responses\n",
+ "df['Normalized_Responses'] = df['For model registry and experiment tracking, do you use any of the following tools? Select all that apply.'].apply(normalize_responses)\n",
+ "all_responses = [item for sublist in df['Normalized_Responses'].dropna() for item in sublist]\n",
+ "response_counts = pd.Series(all_responses).value_counts().reset_index()\n",
+ "response_counts.columns = ['Tools', 'Count']\n",
+ "\n",
+ "# Chart using Altair with orange color\n",
+ "chart = alt.Chart(response_counts).mark_bar(color='#F7931E').encode(\n",
+ " x='Count:Q',\n",
+ " y=alt.Y('Tools:N', sort='-x'),\n",
+ " tooltip=['Tools', 'Count']\n",
+ ").properties(\n",
+ " title='Tools used for model registry and experiment tracking',\n",
+ " width=500,\n",
+ " height=300\n",
+ ")\n",
+ "\n",
+ "chart.show()"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [
- "![Tools used for model registry and experiment tracking](images/chart16.png)\n",
+ "![Tools used for model registry and experiment tracking](images/chart8.png)\n",
"***"
] },
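One property of these multi-select questions worth keeping in mind when reading the bars: the counts sum to more than the number of respondents, because each selected option contributes one entry. A tiny sketch with hypothetical answers:

```python
import pandas as pd

# Hypothetical multi-select answers, one comma-separated string per respondent
answers = pd.Series(["MLFlow, DVC", "MLFlow", "Custom tool"])
selections = answers.str.split(", ").explode()
print(len(answers), len(selections))  # 3 respondents, 4 selections
```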
Select all that apply.\"\n", - "\n", - "# Check if the column exists in the dataset\n", - "if question_column in data.columns:\n", - " # Count occurrences of each response\n", - " response_counts = data[question_column].value_counts()\n", - "\n", - " # Sort responses in the desired order\n", - " response_order = [\n", - " \"DVC\",\n", - " \"Neptune\",\n", - " \"MlFlow\",\n", - " \"Weight and biases\",\n", - " \"Custom tool\",\n", - " \"Other\"\n", - " ]\n", - " response_counts = response_counts.reindex(response_order, fill_value=0)\n", - "\n", - " # Plot the bar graph\n", - " plt.figure(figsize=(12, 8))\n", - " response_counts.plot(kind='bar', color='green')\n", - "\n", - " # Add titles and labels\n", - " plt.title(\"Responses to the Importance of Open Source ML/AI Frameworks\", fontsize=14, fontweight=\"bold\")\n", - " plt.xlabel(\"Response\", fontsize=12)\n", - " plt.ylabel(\"Number of Respondents\", fontsize=12)\n", - " plt.xticks(rotation=45, ha='right')\n", - "\n", - " # Annotate the bars\n", - " for i, count in enumerate(response_counts):\n", - " plt.text(i, count + 0.5, str(count), ha='center', fontsize=10)\n", - "\n", - " # Show the plot\n", - " plt.tight_layout()\n", - " plt.show()\n", - "else:\n", - " print(f\"Column '{question_column}' not found in the dataset.\")" + "# Function to normalize responses\n", + "def normalize_responses(response):\n", + " if isinstance(response, str):\n", + " response_split = [r.strip() for r in response.split(',')]\n", + " normalized = [mapping_dict.get(r, None) for r in response_split]\n", + " return [r for r in normalized if r is not None]\n", + " return []\n", + "\n", + "# Apply normalization and count responses\n", + "df['Normalized_Responses'] = df['For model registry and experiment tracking, do you use any of the following tools? Select all that apply.'].apply(normalize_responses)\n", + "all_responses = [item for sublist in df['Normalized_Responses'].dropna() for item in sublist]\n", + "response_counts = pd.Series(all_responses).value_counts().reset_index()\n", + "response_counts.columns = ['Tools', 'Count']\n", + "\n", + "# Chart using Altair with orange color\n", + "chart = alt.Chart(response_counts).mark_bar(color='#F7931E').encode(\n", + " x='Count:Q',\n", + " y=alt.Y('Tools:N', sort='-x'),\n", + " tooltip=['Tools', 'Count']\n", + ").properties(\n", + " title='Tools used for model registry and experiment tracking',\n", + " width=500,\n", + " height=300\n", + ")\n", + "\n", + "chart.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "![Tools used for model registry and experiment tracking](images/chart16.png)\n", + "![Tools used for model registry and experiment tracking](images/chart8.png)\n", "***" ] }, @@ -800,52 +745,81 @@ "source": [ "# Tools used for scheduling\n", "\n", - "```{dropdown} Show code\n", + "For scheduling, the majority of respondants have chosen Airflow. Also, there are other tools that were not listed in the survey that are popular among the users. 
+ " # Dagster responses\n",
+ " \"Dagster\": \"Dagster\",\n",
+ " \"Dagster داجستر\": \"Dagster\",\n",
+ " # Kubeflow responses\n",
+ " \"Kubeflow\": \"Kubeflow\",\n",
+ " \"Kubeflow كيوب فلو\": \"Kubeflow\",\n",
+ " # Metaflow (outerbounds) responses\n",
+ " \"Metaflow (outerbounds)\": \"Metaflow (outerbounds)\",\n",
+ " \"Metaflow (outerbounds)(اوت باندز) ميتا فلو\": \"Metaflow (outerbounds)\",\n",
+ " # Custom tool responses\n",
+ " \"Custom tool\": \"Custom tool\",\n",
+ " \"Ferramenta personalizada\": \"Custom tool\",\n",
+ " \"Herramientas personalizadas\": \"Custom tool\",\n",
+ " \"自定义工具\": \"Custom tool\",\n",
+ " \"Outil personnalisé\": \"Custom tool\",\n",
+ " \"أداة مخصصة\": \"Custom tool\",\n",
+ " # Other responses\n",
+ " \"Other\": \"Other\",\n",
+ " \"Outro\": \"Other\",\n",
+ " \"Otro\": \"Other\",\n",
+ " \"其它\": \"Other\",\n",
+ " \"Autre\": \"Other\",\n",
+ " \"أخرى\": \"Other\"\n",
+ "}\n",
+ "\n",
+ "# Function to normalize responses\n",
+ "def normalize_responses(response):\n",
+ " if isinstance(response, str):\n",
+ " response_split = [r.strip() for r in response.split(',')]\n",
+ " normalized = [mapping_dict.get(r, None) for r in response_split]\n",
+ " return [r for r in normalized if r is not None]\n",
+ " return []\n",
+ "\n",
+ "# Apply normalization and count responses\n",
+ "df['Normalized_Responses'] = df['For scheduling, do you use any of the following tools? Select all that apply.'].apply(normalize_responses)\n",
+ "all_responses = [item for sublist in df['Normalized_Responses'].dropna() for item in sublist]\n",
+ "response_counts = pd.Series(all_responses).value_counts().reset_index()\n",
+ "response_counts.columns = ['Tools', 'Count']\n",
+ "\n",
+ "# Chart using Altair with orange color\n",
+ "chart = alt.Chart(response_counts).mark_bar(color='#F7931E').encode(\n",
+ " x='Count:Q',\n",
+ " y=alt.Y('Tools:N', sort='-x'),\n",
+ " tooltip=['Tools', 'Count']\n",
+ ").properties(\n",
+ " title='Tools used for scheduling',\n",
+ " width=500,\n",
+ " height=300\n",
+ ")\n",
"\n",
- "# Define the relevant question\n",
- "question_column = \"For scheduling, do you use any of the following tools? Select all that apply.\"\n",
Select all that apply.\"\n", - "\n", - "# Check if the column exists in the dataset\n", - "if question_column in data.columns:\n", - " # Count occurrences of each response\n", - " response_counts = data[question_column].value_counts()\n", - "\n", - " # Sort responses in the desired order\n", - " response_order = [\n", - " \"Airflow\",\n", - " \"Argo\",\n", - " \"Coiled\",\n", - " \"Dagster\",\n", - " \"Kubeflow\",\n", - " \"Metaflow (outerbounds)\",\n", - " \"Custom tool\",\n", - " \"Other\"\n", - " ]\n", - " response_counts = response_counts.reindex(response_order, fill_value=0)\n", - "\n", - " # Plot the pie chart\n", - " plt.figure(figsize=(15, 15))\n", - " plt.pie(response_counts, labels=response_counts.index, autopct='%1.1f%%', startangle=90, colors=plt.cm.Paired.colors)\n", - "\n", - " # Add title\n", - " plt.title(\"Responses to the Importance of Open Source ML/AI Features\", fontsize=14, fontweight=\"bold\")\n", - "\n", - " # Show the plot\n", - " plt.axis('equal') # Equal aspect ratio ensures that pie chart is drawn as a circle.\n", - " plt.tight_layout()\n", - " plt.show()\n", - "\n", - "else:\n", - " print(f\"Column '{question_column}' not found in the dataset.\")" + "chart.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "![Tools used for scheduling](images/chart17.png)\n", + "![Tools used for scheduling](images/chart9.png)\n", "***" ] }, @@ -855,179 +829,89 @@ "source": [ "# Time that a typical model training takes in ML projects\n", "\n", - "```{dropdown} Show code\n", + "Respondents usually take from some minutes to a day for a typical model training. It is not very common that they take less than a minute or more than a day.\n", "\n", - "```python\n", - "\n", - "# Define the relevant question\n", - "question_column = \"How long does a typical model training take in your ML projects?\"\n", - "\n", - "# Check if the column exists in the dataset\n", - "if question_column in data.columns:\n", - " # Count occurrences of each response\n", - " response_counts = data[question_column].value_counts()\n", - "\n", - " # Sort responses in the desired order\n", - " response_order = [\n", - " \"less than 10 seconds\",\n", - " \"less than a minute\",\n", - " \"less than 10 minutes\",\n", - " \"less than an hour\",\n", - " \"less than a day\",\n", - " \"more than a day\",\n", - " ]\n", - " response_counts = response_counts.reindex(response_order, fill_value=0)\n", - "\n", - " # Plot the bar graph\n", - " plt.figure(figsize=(12, 8))\n", - " response_counts.plot(kind='bar', color='blue')\n", - "\n", - " # Add titles and labels\n", - " plt.title(\"Responses to the Importance of Open Source ML/AI Frameworks\", fontsize=14, fontweight=\"bold\")\n", - " plt.xlabel(\"Response\", fontsize=12)\n", - " plt.ylabel(\"Number of Respondents\", fontsize=12)\n", - " plt.xticks(rotation=45, ha='right')\n", - "\n", - " # Annotate the bars\n", - " for i, count in enumerate(response_counts):\n", - " plt.text(i, count + 0.5, str(count), ha='center', fontsize=10)\n", - "\n", - " # Show the plot\n", - " plt.tight_layout()\n", - " plt.show()\n", - "else:\n", - " print(f\"Column '{question_column}' not found in the dataset.\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![Time that a typical model training takes in ML projects](images/chart18.png)\n", - "***" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Number of deployed models currently mantaining\n", - "\n", - "```{dropdown} Show code\n", + "```{tip} Show code\n", + ":class: 
dropdown \n", "\n", "```python\n", + "# Mapping dictionary\n", + "mapping_dict = {\n", + " # less than 10 seconds responses\n", + " \"less than 10 seconds\": \"less than 10 seconds\",\n", + " \"menos de 10 segundos\": \"less than 10 seconds\",\n", + " \"少于10秒\": \"less than 10 seconds\",\n", + " \"moins de 10 secondes\": \"less than 10 seconds\",\n", + " \"أقل من ١٠ ثوانٍ\": \"less than 10 seconds\",\n", + " # less than a minute responses\n", + " \"less than a minute\": \"less than a minute\",\n", + " \"menos de um minuto\": \"less than a minute\",\n", + " \"menos de un minuto\": \"less than a minute\",\n", + " \"少于1分钟\": \"less than a minute\",\n", + " \"moins d'une minute\": \"less than a minute\",\n", + " \"أقل من دقيقة\": \"less than a minute\",\n", + " # less than 10 minutes responses\n", + " \"less than 10 minutes\": \"less than 10 minutes\",\n", + " \"menos de 10 minutos\": \"less than 10 minutes\",\n", + " \"少于10分钟\": \"less than 10 minutes\",\n", + " \"moins de 10 minutes\": \"less than 10 minutes\",\n", + " \"أقل من ١٠ دقائق\": \"less than 10 minutes\",\n", + " # less than an hour responses\n", + " \"less than an hour\": \"less than an hour\",\n", + " \"menos de uma hora\": \"less than an hour\",\n", + " \"menos de una hora\": \"less than an hour\",\n", + " \"少于1小时\": \"less than an hour\",\n", + " \"moins d'une heure\": \"less than an hour\",\n", + " \"أقل من ساعة\": \"less than an hour\",\n", + " # less than a day responses\n", + " \"less than a day\": \"less than a day\",\n", + " \"menos de um dia\": \"less than a day\",\n", + " \"menos de un día\": \"less than a day\",\n", + " \"少于1天\": \"less than a day\",\n", + " \"moins d'une journée\": \"less than a day\",\n", + " \"أقل من يوم\": \"less than a day\",\n", + " # more than a day responses\n", + " \"more than a day\": \"more than a day\",\n", + " \"mais de um dia\": \"more than a day\",\n", + " \"más de un día\": \"more than a day\",\n", + " \"多于1天\": \"more than a day\",\n", + " \"plus d'une journée\": \"more than a day\",\n", + " \"أكثر من يوم\": \"more than a day\"\n", + "}\n", "\n", - "#print(df.head())\n", - "\n", - "# Identify the columns related to the question about priorities for Deployed Models\n", - "deployed_models_columns = [\n", - " col for col in df.columns\n", - " if \"How many deployed models are you (and your team) currently maintaining?\" in col\n", - "]\n", - "\n", - "# print(\"Identified columns for Deployed Models:\\n\",deploy_columns)\n", - "\n", - "# Filter relevant data\n", - "deployed_data = df[deployed_models_columns].dropna()\n", - "\n", - "# Rename columns\n", - "renamed_deployed_columns = [\n", - " \"\"\n", - "]\n", - "deployed_data.columns = renamed_deployed_columns\n", - "\n", - "# Stacked bar chart\n", - "# Reindex with available priority levels (1 to 5 and more than 5)\n", - "priority_levels = [1, 2, 3, 4, 5, float('inf')] # [1, 2, 3, 4, 5, >5]\n", - "\n", - "\n", - "# Prepare data for the stacked bar chart\n", - "stacked_bar_data = pd.DataFrame({\n", - " category: deployed_data[category].value_counts().reindex(priority_levels, fill_value=0)\n", - " for category in deployed_data.columns\n", - "}).T\n", - "\n", - "# Print to verify processed data\n", - "#print(\"\\nPrepared data for the stacked bar chart:\")\n", - "#print(stacked_bar_data)\n", - "\n", - "# Create the stacked bar chart\n", - "stacked_bar_data.plot(\n", - " kind=\"bar\", stacked=True, figsize=(12, 6), colormap=\"viridis\", edgecolor=\"none\"\n", - ")\n", - "plt.title(\"Deployment: Priority Levels\", fontsize=14, 
fontweight='bold')\n", - "plt.xlabel(\"\")\n", - "plt.ylabel(\"Number of Responses\")\n", - "plt.legend(\n", - " title=\"Priority Level\", bbox_to_anchor=(1.05, 1), loc=\"upper left\", fontsize=10\n", + "# Function to normalize responses\n", + "def normalize_responses(response):\n", + " if isinstance(response, str):\n", + " response_split = [r.strip() for r in response.split(',')]\n", + " normalized = [mapping_dict.get(r, None) for r in response_split]\n", + " return [r for r in normalized if r is not None]\n", + " return []\n", + "\n", + "# Apply normalization and count responses\n", + "df['Normalized_Responses'] = df['How long does a typical model training take in your ML projects?'].apply(normalize_responses)\n", + "all_responses = [item for sublist in df['Normalized_Responses'].dropna() for item in sublist]\n", + "response_counts = pd.Series(all_responses).value_counts().reset_index()\n", + "response_counts.columns = ['Time', 'Count']\n", + "\n", + "# Chart using Altair with orange color\n", + "chart = alt.Chart(response_counts).mark_bar(color='#F7931E').encode(\n", + " x='Count:Q',\n", + " y=alt.Y('Time:N', sort='-x'),\n", + " tooltip=['Time', 'Count']\n", + ").properties(\n", + " title='Time taken for a typical model training',\n", + " width=500,\n", + " height=300\n", ")\n", - "plt.xticks(rotation=45, ha='right')\n", - "\n", - "plt.tight_layout()\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "(image here)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Open source ML & AI frameworks and libraries are crucial for transparency and reproducibility\n", - "\n", - "```{dropdown} Show code\n", - "\n", - "```python\n", "\n", - "# Define the relevant question\n", - "question_column = \"To what extent do you agree with the following statement?\\nOpen source ML & AI frameworks and libraries are crucial for ensuring transparency and the reproducibility of AI research and development.\"\n", - "\n", - "# Check if the column exists in the dataset\n", - "if question_column in data.columns:\n", - " # Count occurrences of each response\n", - " response_counts = data[question_column].value_counts()\n", - "\n", - " # Sort responses in the desired order\n", - " response_order = [\n", - " \"Strongly agree\",\n", - " \"Agree\",\n", - " \"Neither agree nor disagree\",\n", - " \"Disagree\",\n", - " \"Strongly disagree\"\n", - " ]\n", - " response_counts = response_counts.reindex(response_order, fill_value=0)\n", - "\n", - " # Plot the bar graph\n", - " plt.figure(figsize=(10, 6))\n", - " response_counts.plot(kind='bar', color='red')\n", - "\n", - " # Add titles and labels\n", - " plt.title(\"Responses to the Importance of Open Source ML/AI Frameworks\", fontsize=14, fontweight=\"bold\")\n", - " plt.xlabel(\"Response\", fontsize=12)\n", - " plt.ylabel(\"Number of Respondents\", fontsize=12)\n", - " plt.xticks(rotation=45, ha='right')\n", - "\n", - " # Annotate the bars\n", - " for i, count in enumerate(response_counts):\n", - " plt.text(i, count + 0.5, str(count), ha='center', fontsize=10)\n", - "\n", - " # Show the plot\n", - " plt.tight_layout()\n", - " plt.show()\n", - "else:\n", - " print(f\"Column '{question_column}' not found in the dataset.\")" + "chart.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "![Open source ML & AI frameworks and libraries are crucial for transparency and reproducibility](images/chart20.png)\n", + "![Time that a typical model training takes in ML projects](images/chart10.png)\n", 
"***" ] }