Merge pull request #52 from CMU-17313Q/jupyter-notebook

Created jupyter notebook
CMU-17313Q · Nov 4, 2023 · 0ba2674 · 0ba2674
2 parents e5e8bc3 + 66376b6
commit 0ba2674
Show file tree

Hide file tree

Showing 8 changed files with 1,727 additions and 0 deletions.
diff --git a/.eslintignore b/.eslintignore
@@ -30,3 +30,4 @@ test/files
 themes/
 
 report/
+career-model/
diff --git a/.gitignore b/.gitignore
@@ -82,3 +82,7 @@ theme/*.sublime-workspace
 theme/.idea
 theme/.vscode
 theme/node_modules/
+
+# Career Model: Python Ignores
+__pycache__
+.ipynb_checkpoints
diff --git a/career-model/JupyterNotebook.ipynb b/career-model/JupyterNotebook.ipynb
diff --git a/career-model/README.md b/career-model/README.md
@@ -0,0 +1,57 @@
+# Career Recruiter ML Model Framework
+
+## Overview
+
+This folder contains an ML model for predicting whether a student applicant would be a good employee, along with some basic starter code for how to interact with the model.
+
+This model should eventually be connected with the career page within NodeBB to allow recruiters to view a prediction of a student applicant's likelihood of being a good employee to hire.
+
+## Setup
+
+1. (Optional) Set up a [virtual environment](https://docs.python.org/3/library/venv.html) for Python
+2. Run `pip install -r requirements.txt` to install all dependencies
+
+## Running the Model
+
+The file `predict.py` contains a function `predict` which, given a student application input, returns a prediction of whether the student would be a good employee.
+
+Below is a sample run from the terminal:
+
+```
+% python3
+>>> from predict import predict
+>>> student = {
+        "student_id": "student1",
+        "major": "Computer Science",
+        "age": "20",
+        "gender": "M",
+        "gpa": "4.0",
+        "extra_curricular": "Men's Basketball",
+        "num_programming_languages": "1",
+        "num_past_internships": "2"
+    }
+>>> predict(student)
+{'good_employee': 1}
+```
+
+## Function Inputs
+
+The `predict` function takes in a student info dictionary that contains the following fields (note that all fields are taken as a `string` value and parsed by the model itself):
+
+- `student_id`: unique identifier for the student
+- `major`: major of the student
+  - Computer Science, Information Systems, Business, Math, Electrical and Computer Engineering, Statistics and Machine Learning
+- `age`: age of the student, [18, 25]
+- `gender`: gender of the student, M(ale)/F(emale)/O(ther)
+- `gpa`: gpa of the student, [0.0, 4.0]
+- `extra_curricular`: the most important extracurricular activity to the student
+  - Student Theatre, Buggy, Teaching Assistant, Student Government, Society of Women Engineers, Women in CS, Volleyball, Sorority, Men's Basketball, American Football, Men's Golf, Fraternity
+- `num_programming_languages`: number of programming languages that the student is familiar with, [1, 5]
+- `num_past_internships`: number of previous internships that the student has had, [0, 4]
+
+## Function Outputs
+
+The `predict` function returns a prediction result dictionary containing the following:
+
+- `good_employee`: numpy.int64, 1 if the student is predicted to be a good employee, 0 otherwise.
+  - **Dev Note:** If needed, this value is castable to an int via `.item()`
diff --git a/career-model/model.pkl b/career-model/model.pkl
diff --git a/career-model/predict.py b/career-model/predict.py
@@ -0,0 +1,51 @@
+import pandas as pd
+import joblib
+from pydantic import BaseModel, Field
+from pydantic.tools import parse_obj_as
+
+# Pydantic Models
+class Student(BaseModel):  # Input schema for predict(); every field is a required string
+    student_id: str = Field(alias="Student ID")  # aliases become the DataFrame column names predict() hands to the model
+    gender: str = Field(alias="Gender")
+    age: str = Field(alias="Age")
+    major: str = Field(alias="Major")
+    gpa: str = Field(alias="GPA")
+    extra_curricular: str = Field(alias="Extra Curricular")
+    num_programming_languages: str = Field(alias="Num Programming Languages")
+    num_past_internships: str = Field(alias="Num Past Internships")
+
+    class Config:
+        allow_population_by_field_name = True  # accept field names (e.g. "gpa") as well as aliases (e.g. "GPA") on input
+
+class PredictionResult(BaseModel):  # Describes the shape of the dict returned by predict()
+    good_employee: int  # 1 = predicted to be a good employee, 0 = predicted not to be
+
+
+# Main Functionality
+def predict(student):
+    '''
+    Returns a prediction on whether the student will be a good employee
+    based on given parameters by using the ML model
+
+    Parameters
+    ----------
+    student : dict
+        A dictionary that contains all fields in Student
+    
+    Returns
+    -------
+    dict
+        A dictionary satisfying type PredictionResult, contains a single field
+        'good_employee' which is either 1 (will be a good employee) or 0 (will
+        not be a good employee)
+    '''
+    # Use Pydantic to validate model fields exist
+    student = parse_obj_as(Student, student)
+
+    clf = joblib.load('./model.pkl')
+
+    student = student.dict(by_alias=True)
+    query = pd.DataFrame(student, index=[0])
+    prediction = clf.predict(query) # TODO: Error handling ??
+
+    return { 'good_employee': prediction[0] }
diff --git a/career-model/requirements.txt b/career-model/requirements.txt
@@ -0,0 +1,11 @@
+joblib==1.2.0
+numpy==1.24.2
+pandas==1.5.3
+pydantic==1.10.6
+python-dateutil==2.8.2
+pytz==2022.7.1
+scikit-learn==1.2.1
+scipy==1.10.1
+six==1.16.0
+threadpoolctl==3.1.0
+typing_extensions==4.5.0