added comments and fixed bugs

CMU-17313Q · Nov 13, 2023 · 7117dc9 · 7117dc9
1 parent e2d3706
commit 7117dc9
Showing 1 changed file with 32 additions and 50 deletions.
diff --git a/career-model/JupyterNotebook.ipynb b/career-model/JupyterNotebook.ipynb
@@ -452,35 +452,32 @@
    "source": [
     "# Count of unique categories\n",
     "gender_counts = df['Gender'].value_counts()\n",
+    "print(\"Gender Counts:\")\n",
+    "print(gender_counts)\n",
     "major_counts = df['Major'].value_counts()\n",
+    "print(\"\\nMajor Counts:\")\n",
+    "print(major_counts)\n",
     "extra_curricular_counts = df['Extra Curricular'].value_counts()\n",
+    "print(\"\\nExtra Curricular Counts:\")\n",
+    "print(extra_curricular_counts)\n",
     "\n",
     "# Unique categories\n",
     "unique_genders = df['Gender'].unique()\n",
+    "print(\"Unique Genders:\")\n",
+    "print(unique_genders)\n",
     "unique_majors = df['Major'].unique()\n",
+    "print(\"Unique Majors:\")\n",
+    "print(unique_majors)\n",
     "unique_extra_curricular = df['Extra Curricular'].unique()\n",
+    "print(\"Unique Extra Curricular Activities:\")\n",
+    "print(unique_extra_curricular)\n",
     "\n",
     "# Mode\n",
     "mode_gender = df['Gender'].mode().values[0]\n",
-    "mode_major = df['Major'].mode().values[0]\n",
-    "mode_extra_curricular = df['Extra Curricular'].mode().values[0]\n",
-    "\n",
-    "print(\"Gender Counts:\")\n",
-    "print(gender_counts)\n",
-    "print(\"Unique Genders:\")\n",
-    "print(unique_genders)\n",
     "print(\"Mode Gender:\", mode_gender)\n",
-    "\n",
-    "print(\"\\nMajor Counts:\")\n",
-    "print(major_counts)\n",
-    "print(\"Unique Majors:\")\n",
-    "print(unique_majors)\n",
+    "mode_major = df['Major'].mode().values[0]\n",
     "print(\"Mode Major:\", mode_major)\n",
-    "\n",
-    "print(\"\\nExtra Curricular Counts:\")\n",
-    "print(extra_curricular_counts)\n",
-    "print(\"Unique Extra Curricular Activities:\")\n",
-    "print(unique_extra_curricular)\n",
+    "mode_extra_curricular = df['Extra Curricular'].mode().values[0]\n",
     "print(\"Mode Extra Curricular:\", mode_extra_curricular)\n"
    ]
   },
@@ -577,7 +574,6 @@
     }
    ],
    "source": [
-    "\n",
     "sns.countplot(data = df, y = 'Major', order = df['Major'].value_counts().index, hue = 'Major')\n",
     "plt.title(\"Distribution of Major\")"
    ]
@@ -612,7 +608,6 @@
    ],
    "source": [
     "# Age\n",
-    "#df.groupby('Age').size().plot(kind='bar', title='Distribution of Age', ylabel='No. of Students')\n",
     "sns.histplot(data=df, x=\"Age\", kde=True, bins = 10)"
    ]
   },
@@ -646,7 +641,6 @@
    ],
    "source": [
     "# GPA\n",
-    "\n",
     "sns.histplot(data=df, x=\"GPA\", kde=True).set(title='Distribution of GPA')"
    ]
   },
@@ -682,7 +676,6 @@
     "# Extra Curricular\n",
     "color = ['black','red','green','orange','blue','limegreen','darkgreen','royalblue','navy','red','pink','orange']\n",
     "\n",
-    "\n",
     "sns.countplot(data = df, y = 'Extra Curricular', order = df['Extra Curricular'].value_counts().index, hue='Extra Curricular')\n",
     "plt.title(\"Distribution of Extra-Curriculars\")"
    ]
@@ -1097,7 +1090,7 @@
    "source": [
     "from sklearn import metrics\n",
     "confusion_matrix = metrics.confusion_matrix(y, y_pred)\n",
-    "print(\"The confusion matrix is\")\n",
+    "print(\"Confusion Matrix: \")\n",
     "confusion_matrix"
    ]
   },
@@ -1172,30 +1165,24 @@
     }
    ],
    "source": [
-    "from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report, confusion_matrix\n",
+    "from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report\n",
     "\n",
-    "# Assuming you have already trained your model (replace 'model' with your model)\n",
     "# Make predictions on the test dataset\n",
     "y_pred = clf.predict(X)\n",
     "\n",
     "# Calculate accuracy\n",
     "accuracy = accuracy_score(y_true, y_pred)\n",
     "print(f\"Accuracy: {accuracy:.2f}\")\n",
     "\n",
-    "# Calculate precision and recall, and F1-score\n",
+    "# Calculate precision and recall\n",
     "precision = precision_score(y_true, y_pred)\n",
     "recall = recall_score(y_true, y_pred)\n",
     "print(f\"Precision: {precision:.2f}\")\n",
     "print(f\"Recall: {recall:.2f}\")\n",
     "\n",
     "# Generate a classification report (includes precision, recall, and F1-score)\n",
-    "print(\"Classification Report:\")\n",
-    "print(classification_report(y_true, y_pred))\n",
-    "\n",
-    "# Create and display a confusion matrix\n",
-    "confusion = confusion_matrix(y_true, y_pred)\n",
-    "print(\"Confusion Matrix:\")\n",
-    "print(confusion)\n"
+    "print(\"Classification Report: \")\n",
+    "print(classification_report(y_true, y_pred))\n"
    ]
   },
   {
@@ -1408,6 +1395,8 @@
     }
    ],
    "source": [
+    "import json\n",
+    "\n",
     "columns = ['Major', 'Age', 'Gender', 'Extra Curricular', 'Num Programming Languages', 'Num Past Internships']\n",
     "distribution_counts = dict()\n",
     "for index, row in df.iterrows():\n",
@@ -1419,7 +1408,6 @@
     "      distribution_counts[col][row[col]] = 0\n",
     "    distribution_counts[col][row[col]] += 1\n",
     "\n",
-    "import json\n",
     "print(json.dumps(distribution_counts, sort_keys=True, indent=4))"
    ]
   },
@@ -1436,6 +1424,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "# Credit to the Spring '23 team\n",
+    "\n",
     "import pandas as pd\n",
     "import joblib\n",
     "from pydantic import BaseModel, Field\n",
@@ -1608,7 +1598,6 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Keep track of outputs, predicted vs actual for accuracy metrics to be done later\n",
     "predicted_women = []\n",
     "actual_women = []\n",
     "\n",
@@ -1814,6 +1803,7 @@
    ],
    "source": [
     "# True positive rates for men and women\n",
+    "\n",
     "from sklearn.metrics import confusion_matrix, recall_score\n",
     "\n",
     "# Men\n",
@@ -1830,9 +1820,9 @@
     "FN_women = conf_matrix_women[1, 0]\n",
     "TPR_women = TP_women / (TP_women + FN_women)\n",
     "\n",
-    "# Print the TPR for men\n",
+    "# Report the true positive rate (recall) for each group\n",
+    "\n",
     "print(f\"True Positive Rate (Recall) for men: {TPR_men:.2f}\")\n",
-    "# Print the TPR for women\n",
     "print(f\"True Positive Rate (Recall) for women: {TPR_women:.2f}\")"
    ]
   },
@@ -1910,7 +1900,7 @@
     "total_men = d[0] + d[1]\n",
     "prediction_men = positive_men/total_men\n",
     "# number of predicted good candidates for men\n",
-    "print(f\"Good candidates for men (predicted):  {prediction_men}\")\n",
+    "print(f\"Good candidates for men (predicted): {prediction_men}\")\n",
     "\n",
     "\n",
     "numpy_actual_men = numpy.array(actual_men)\n",
@@ -1920,7 +1910,7 @@
     "total_men1 = d[0] + d[1]\n",
     "actual_men = positive_men1/total_men1\n",
     "# number of actual good candidates for men\n",
-    "print(f\"Good candidates for men (actual):  {actual_men}\")\n",
+    "print(f\"Good candidates for men (actual): {actual_men}\")\n",
     "\n",
     "\n",
     "numpy_predicted_women = numpy.array(predicted_women)\n",
@@ -1930,7 +1920,7 @@
     "total_women = d[0] + d[1]\n",
     "prediction_women = positive_women/total_women\n",
     "# number of predicted good candidates for women\n",
-    "print(f\"Good candidates for women (predicted):  {prediction_women}\")\n",
+    "print(f\"Good candidates for women (predicted): {prediction_women}\")\n",
     "\n",
     "\n",
     "numpy_actual_women = numpy.array(actual_women)\n",
@@ -1940,7 +1930,7 @@
     "total_women1 = d[0] + d[1]\n",
     "actual_women = positive_women1/total_women1\n",
     "# number of actual good candidates for women\n",
-    "print(f\"Good candidates for women (actual):  {actual_women}\")"
+    "print(f\"Good candidates for women (actual): {actual_women}\")"
    ]
   },
   {
@@ -1998,7 +1988,6 @@
     "TNR_actual_men = 1.0 - TPR_actual_men\n",
     "TNR_actual_women = 1.0 - TPR_actual_women\n",
     "\n",
-    "\n",
     "print(\"Disparate Impact Predicted:\", DI_predicted)\n",
     "print(\"Disparate Impact Actual:\", DI_actual)\n",
     "\n",
@@ -2011,18 +2000,11 @@
     "print(\"Equalized Odds Difference Predicted:\", EOD_positive_predicted)\n",
     "print(\"Equalized Odds Difference Actual:\", EOD_positive_actual)\n"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
+   "display_name": "Python 3",
    "language": "python",
    "name": "python3"
   },
@@ -2036,7 +2018,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.13"
+   "version": "3.8.8"
   }
  },
  "nbformat": 4,