diff --git a/career-model/JupyterNotebook.ipynb b/career-model/JupyterNotebook.ipynb
index db2cf10..910b407 100644
--- a/career-model/JupyterNotebook.ipynb
+++ b/career-model/JupyterNotebook.ipynb
@@ -452,35 +452,32 @@
    "source": [
     "# Count of unique categories\n",
     "gender_counts = df['Gender'].value_counts()\n",
+    "print(\"Gender Counts:\")\n",
+    "print(gender_counts)\n",
     "major_counts = df['Major'].value_counts()\n",
+    "print(\"\\nMajor Counts:\")\n",
+    "print(major_counts)\n",
     "extra_curricular_counts = df['Extra Curricular'].value_counts()\n",
+    "print(\"\\nExtra Curricular Counts:\")\n",
+    "print(extra_curricular_counts)\n",
     "\n",
     "# Unique categories\n",
     "unique_genders = df['Gender'].unique()\n",
+    "print(\"Unique Genders:\")\n",
+    "print(unique_genders)\n",
     "unique_majors = df['Major'].unique()\n",
+    "print(\"Unique Majors:\")\n",
+    "print(unique_majors)\n",
     "unique_extra_curricular = df['Extra Curricular'].unique()\n",
+    "print(\"Unique Extra Curricular Activities:\")\n",
+    "print(unique_extra_curricular)\n",
     "\n",
     "# Mode\n",
     "mode_gender = df['Gender'].mode().values[0]\n",
-    "mode_major = df['Major'].mode().values[0]\n",
-    "mode_extra_curricular = df['Extra Curricular'].mode().values[0]\n",
-    "\n",
-    "print(\"Gender Counts:\")\n",
-    "print(gender_counts)\n",
-    "print(\"Unique Genders:\")\n",
-    "print(unique_genders)\n",
     "print(\"Mode Gender:\", mode_gender)\n",
-    "\n",
-    "print(\"\\nMajor Counts:\")\n",
-    "print(major_counts)\n",
-    "print(\"Unique Majors:\")\n",
-    "print(unique_majors)\n",
+    "mode_major = df['Major'].mode().values[0]\n",
     "print(\"Mode Major:\", mode_major)\n",
-    "\n",
-    "print(\"\\nExtra Curricular Counts:\")\n",
-    "print(extra_curricular_counts)\n",
-    "print(\"Unique Extra Curricular Activities:\")\n",
-    "print(unique_extra_curricular)\n",
+    "mode_extra_curricular = df['Extra Curricular'].mode().values[0]\n",
     "print(\"Mode Extra Curricular:\", mode_extra_curricular)\n"
    ]
   },
@@ -577,7 +574,6 @@
     }
    ],
    "source": [
-    "\n",
     "sns.countplot(data = df, y = 'Major', order = df['Major'].value_counts().index, hue = 'Major')\n",
     "plt.title(\"Distribution of Major\")"
    ]
   },
@@ -612,7 +608,6 @@
    ],
    "source": [
     "# Age\n",
-    "#df.groupby('Age').size().plot(kind='bar', title='Distribution of Age', ylabel='No. of Students')\n",
     "sns.histplot(data=df, x=\"Age\", kde=True, bins = 10)"
    ]
   },
@@ -646,7 +641,6 @@
    ],
    "source": [
     "# GPA\n",
-    "\n",
     "sns.histplot(data=df, x=\"GPA\", kde=True).set(title='Distribution of GPA')"
    ]
   },
@@ -682,7 +676,6 @@
     "# Extra Curricular\n",
     "color = ['black','red','green','orange','blue','limegreen','darkgreen','royalblue','navy','red','pink','orange']\n",
     "\n",
-    "\n",
     "sns.countplot(data = df, y = 'Extra Curricular', order = df['Extra Curricular'].value_counts().index, hue='Extra Curricular')\n",
     "plt.title(\"Distribution of Extra-Curriculars\")"
    ]
   },
@@ -1097,7 +1090,7 @@
    "source": [
     "from sklearn import metrics\n",
     "confusion_matrix = metrics.confusion_matrix(y, y_pred)\n",
-    "print(\"The confusion matrix is\")\n",
+    "print(\"Confusion Matrix: \")\n",
     "confusion_matrix"
    ]
   },
@@ -1172,9 +1165,8 @@
     }
    ],
    "source": [
-    "from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report, confusion_matrix\n",
+    "from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report\n",
     "\n",
-    "# Assuming you have already trained your model (replace 'model' with your model)\n",
     "# Make predictions on the test dataset\n",
     "y_pred = clf.predict(X)\n",
     "\n",
@@ -1182,20 +1174,15 @@
     "accuracy = accuracy_score(y_true, y_pred)\n",
     "print(f\"Accuracy: {accuracy:.2f}\")\n",
     "\n",
-    "# Calculate precision and recall, and F1-score\n",
+    "# Calculate precision and recall\n",
     "precision = precision_score(y_true, y_pred)\n",
     "recall = recall_score(y_true, y_pred)\n",
     "print(f\"Precision: {precision:.2f}\")\n",
     "print(f\"Recall: {recall:.2f}\")\n",
     "\n",
     "# Generate a classification report (includes precision, recall, and F1-score)\n",
-    "print(\"Classification Report:\")\n",
-    "print(classification_report(y_true, y_pred))\n",
-    "\n",
-    "# Create and display a confusion matrix\n",
-    "confusion = confusion_matrix(y_true, y_pred)\n",
-    "print(\"Confusion Matrix:\")\n",
-    "print(confusion)\n"
+    "print(\"Classification Report: \")\n",
+    "print(classification_report(y_true, y_pred))\n"
    ]
   },
@@ -1408,6 +1395,8 @@
     }
    ],
    "source": [
+    "import json\n",
+    "\n",
     "columns = ['Major', 'Age', 'Gender', 'Extra Curricular', 'Num Programming Languages', 'Num Past Internships']\n",
     "distribution_counts = dict()\n",
     "for index, row in df.iterrows():\n",
@@ -1419,7 +1408,6 @@
     "            distribution_counts[col][row[col]] = 0\n",
     "        distribution_counts[col][row[col]] += 1\n",
     "\n",
-    "import json\n",
     "print(json.dumps(distribution_counts, sort_keys=True, indent=4))"
    ]
   },
@@ -1436,6 +1424,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "# creds to Spring '23 team\n",
+    "\n",
     "import pandas as pd\n",
     "import joblib\n",
     "from pydantic import BaseModel, Field\n",
@@ -1608,7 +1598,6 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Keep track of outputs, predicted vs actual for accuracy metrics to be done later\n",
     "predicted_women = []\n",
     "actual_women = []\n",
     "\n",
@@ -1814,6 +1803,7 @@
    ],
    "source": [
     "# True positive rates for men and women\n",
+    "\n",
     "from sklearn.metrics import confusion_matrix, recall_score\n",
     "\n",
     "# Men\n",
@@ -1830,9 +1820,9 @@
     "FN_women = conf_matrix_women[1, 0]\n",
     "TPR_women = TP_women / (TP_women + FN_women)\n",
     "\n",
-    "# Print the TPR for men\n",
+    "#######\n",
+    "\n",
     "print(f\"True Positive Rate (Recall) for men: {TPR_men:.2f}\")\n",
-    "# Print the TPR for women\n",
     "print(f\"True Positive Rate (Recall) for women: {TPR_women:.2f}\")"
    ]
   },
@@ -1910,7 +1900,7 @@
     "total_men = d[0] + d[1]\n",
     "prediction_men = positive_men/total_men\n",
     "# number of predicted good candidates for men\n",
-    "print(f\"Good candidates for men (predicted): {prediction_men}\")\n",
+    "print(f\"Good candidates for men (predicted): {prediction_men}\")\n",
     "\n",
     "\n",
     "numpy_actual_men = numpy.array(actual_men)\n",
@@ -1920,7 +1910,7 @@
     "total_men1 = d[0] + d[1]\n",
     "actual_men = positive_men1/total_men1\n",
     "# number of actual good candidates for men\n",
-    "print(f\"Good candidates for men (actual): {actual_men}\")\n",
+    "print(f\"Good candidates for men (actual): {actual_men}\")\n",
     "\n",
     "\n",
     "numpy_predicted_women = numpy.array(predicted_women)\n",
@@ -1930,7 +1920,7 @@
     "total_women = d[0] + d[1]\n",
     "prediction_women = positive_women/total_women\n",
     "# number of predicted good candidates for women\n",
-    "print(f\"Good candidates for women (predicted): {prediction_women}\")\n",
+    "print(f\"Good candidates for women (predicted): {prediction_women}\")\n",
     "\n",
     "\n",
     "numpy_actual_women = numpy.array(actual_women)\n",
@@ -1940,7 +1930,7 @@
     "total_women1 = d[0] + d[1]\n",
     "actual_women = positive_women1/total_women1\n",
     "# number of actual good candidates for women\n",
-    "print(f\"Good candidates for women (actual): {actual_women}\")"
+    "print(f\"Good candidates for women (actual): {actual_women}\")"
    ]
   },
@@ -1998,7 +1988,6 @@
     "TNR_actual_men = 1.0 - TPR_actual_men\n",
     "TNR_actual_women = 1.0 - TPR_actual_women\n",
     "\n",
-    "\n",
     "print(\"Disparate Impact Predicted:\", DI_predicted)\n",
     "print(\"Disparate Impact Actual:\", DI_actual)\n",
     "\n",
@@ -2011,18 +2000,11 @@
     "print(\"Equalized Odds Difference Predicted:\", EOD_positive_predicted)\n",
     "print(\"Equalized Odds Difference Actual:\", EOD_positive_actual)\n"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
+   "display_name": "Python 3",
    "language": "python",
    "name": "python3"
   },
@@ -2036,7 +2018,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.8.13"
+   "version": "3.8.8"
   }
  },
 "nbformat": 4,
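
Note: the DI_* and EOD_* values printed in the final hunks are computed in cells outside this patch. As a minimal illustrative sketch (not the notebook's own code), the conventional formulas would look like the following, reusing the rate names the diff computes (prediction_men, prediction_women, TPR_men, TPR_women); the helper names and usage lines are hypothetical:

    # Illustrative sketch only; assumes the per-group rates from the notebook.
    def disparate_impact(rate_women, rate_men):
        # Ratio of positive-outcome rates between groups; 1.0 is parity,
        # and values below 0.8 are commonly flagged (four-fifths rule).
        return rate_women / rate_men

    def equalized_odds_difference(tpr_women, tpr_men):
        # Gap in true positive rates between groups; 0.0 is parity.
        return abs(tpr_women - tpr_men)

    # e.g. (names from the notebook, usage hypothetical):
    # DI_predicted = disparate_impact(prediction_women, prediction_men)
    # EOD_positive_predicted = equalized_odds_difference(TPR_women, TPR_men)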