Merge branch 'andrey/gsk96-refactoring'

Giskard-AI · Jun 9, 2022 · commit 54b504b
2 parents: 42fd7ba + 7e5af08
Show file tree

Hide file tree

Showing 3 changed files with 171 additions and 63 deletions.
diff --git a/giskard/giskard_client.py b/giskard/giskard_client.py
@@ -1,4 +1,5 @@
 """API Client to interact with the Giskard app"""
+import warnings
 from typing import List
 from urllib.parse import urljoin
 
@@ -9,7 +10,11 @@
 
 
 class GiskardError(Exception):
-    pass
+
+    def __init__(self, message: str, status: int, code: str) -> None:
+        super().__init__(message)
+        self.status = status
+        self.code = code
 
 
 class ErrorHandlingAdapter(HTTPAdapter):
@@ -21,7 +26,9 @@ def build_response(self, req, resp):
             try:
                 err_resp = response.json()
                 giskard_error = GiskardError(
-                    f"{err_resp.get('title', 'Unknown error')}: {err_resp.get('detail', 'no details')}")
+                    status=err_resp.get('status'),
+                    code=err_resp.get('message'),
+                    message=f"{err_resp.get('title', 'Unknown error')}: {err_resp.get('detail', 'no details')}")
             except:  # NOSONAR
                 response.raise_for_status()
             raise giskard_error
@@ -48,11 +55,17 @@ def get_project(self, project_key: str):
         return GiskardProject(self._session, response['key'])
 
     def create_project(self, project_key: str, name: str, description: str = None):
-        response = self._session.post('project', json={
-            "description": description,
-            "key": project_key,
-            "name": name
-        }).json()
+        try:
+            response = self._session.post('project', json={
+                "description": description,
+                "key": project_key,
+                "name": name
+            }).json()
+        except GiskardError as e:
+            if e.code == 'error.http.409':
+                warnings.warn("This project key already exists. "
+                              "If you want to reuse existing project use get_project(“project_key”) instead")
+            raise e
         actual_project_key = response.get('key')
         if actual_project_key != project_key:
             print(f"Project created with a key : {actual_project_key}")

diff --git a/giskard/project.py b/giskard/project.py
@@ -1,5 +1,5 @@
 import json
-import logging
+import warnings
 from typing import Callable, Dict, Iterable, List, Optional, Union
 
 import numpy as np
@@ -23,7 +23,6 @@ def _serialize(prediction_function: Callable[
         Iterable[Union[str, float, int]],
     ]) -> bytes:
         compressed_pickle: bytes = compress(pickle_dumps(prediction_function))
-        print(f'Compressed model size: {len(compressed_pickle)} bytes')
         return compressed_pickle
 
     def upload_model(
@@ -32,19 +31,24 @@ def upload_model(
             model_type: str,
             feature_names: List[str],
             name: str = None,
-            classification_threshold: Optional[float] = 0.5,
+            validate_df: pd.DataFrame = None,
+            target: Optional[List[str]] = None,
+            classification_threshold: Optional[float] = None,
             classification_labels: Optional[List[str]] = None,
-            validate_df: pd.DataFrame = None
     ):
-        print(f"Uploading model '{name}' to project '{self.project_key}'...")
-
-        self._validate_classification_threshold(classification_threshold)
         self._validate_model_type(model_type)
+        self._validate_features(feature_names=feature_names, validate_df=validate_df)
         self._validate_prediction_function(prediction_function)
         classification_labels = self._validate_classification_labels(classification_labels, model_type)
 
+        if model_type == SupportedModelTypes.CLASSIFICATION.value:
+            self._validate_classification_threshold_label(classification_labels, classification_threshold)
+
         if validate_df is not None:
-            self._validate_model_execution(prediction_function, validate_df)
+            self._validate_model_execution(prediction_function, validate_df, model_type, classification_labels)
+            if target is not None and model_type == SupportedModelTypes.CLASSIFICATION.value:
+                target_values = validate_df[target].unique()
+                self._validate_label_with_target(classification_labels, target_values)
 
         model = self._serialize(prediction_function)
         requirements = get_python_requirements()
@@ -65,23 +69,26 @@ def upload_model(
             ('requirementsFile', requirements)
         ]
         self.session.post('project/models/upload', data={}, files=files)
-        print(f"Uploading model '{name}' to project '{self.project_key}': Done!")
+        print(f"Successfully uploaded model to project '{self.project_key}'")
 
     def upload_df(
             self,
             df: pd.DataFrame,
-            feature_types: Dict[str, str],
+            column_types: Dict[str, str],
             target: str = None,
             name: str = None,
     ) -> requests.Response:
-        logging.info(f"Uploading dataset '{name}' to project '{self.project_key}'...")
-        self.validate_df(df, feature_types)
-        self._validate_input_types(feature_types)
+        self._validate_features(column_types=column_types)
+        if target is not None:
+            self._validate_target(target, df.keys())
+        self.validate_df(df, column_types)
+        self._validate_input_types(column_types)
+
         data = compress(save_df(df))
         params = {
             "projectKey": self.project_key,
             "name": name,
-            "featureTypes": feature_types,
+            "featureTypes": column_types,
             "target": target
         }
 
@@ -90,49 +97,50 @@ def upload_df(
             ('file', data)
         ]
 
-        logging.info(f"Uploading dataset '{name}' to project '{self.project_key}': Done!")
+        print(f"Successfully uploaded dataset to project '{self.project_key}'")
         return self.session.post("project/data/upload", data={}, files=files)
 
     def upload_model_and_df(
             self,
             prediction_function: Callable[[pd.DataFrame], Iterable[Union[str, float, int]]],
-            prediction_task: str,
-            feature_names: List[str],
+            model_type: str,
             df: pd.DataFrame,
-            feature_types: Dict[str, str],
-            target: str,
+            column_types: Dict[str, str],
+            feature_names: List[str] = None,
+            target: str = None,
             model_name: str = None,
             dataset_name: str = None,
-            classification_threshold: Optional[float] = 0.5,
+            classification_threshold: Optional[float] = None,
             classification_labels: Optional[List[str]] = None,
     ) -> None:
-        self.upload_model(prediction_function,
-                          prediction_task,
-                          feature_names,
-                          model_name,
-                          classification_threshold,
-                          classification_labels,
-                          df)
+        self.upload_model(prediction_function=prediction_function,
+                          model_type=model_type,
+                          feature_names=feature_names or list(column_types.keys()),
+                          name=model_name,
+                          classification_threshold=classification_threshold,
+                          classification_labels=classification_labels,
+                          validate_df=df,
+                          target=target)
         self.upload_df(
             df=df,
             name=dataset_name,
-            feature_types=feature_types,
+            column_types=column_types,
             target=target)
 
     @staticmethod
-    def _validate_model_type(prediction_task):
-        if prediction_task not in {task.value for task in SupportedModelTypes}:
+    def _validate_model_type(model_type):
+        if model_type not in {task.value for task in SupportedModelTypes}:
             raise ValueError(
-                f"Invalid prediction_task parameter: {prediction_task}. "
+                f"Invalid model_type parameter: {model_type}. "
                 + f"Please choose one of {[task.value for task in SupportedModelTypes]}."
             )
 
     @staticmethod
     def _validate_input_types(input_types):
         if input_types and type(input_types) is dict:
-            if set(input_types.values()) > {column_type.value for column_type in SupportedColumnType}:
+            if not set(input_types.values()).issubset(set(column_type.value for column_type in SupportedColumnType)):
                 raise ValueError(
-                    f"Invalid input_types parameter: {input_types}. "
+                    f"Invalid input_types parameter: "
                     + f"Please choose types among {[column_type.value for column_type in SupportedColumnType]}."
                 )
         else:
@@ -148,16 +156,62 @@ def _validate_prediction_function(prediction_function):
             )
 
     @staticmethod
-    def _validate_classification_threshold(classification_threshold):
+    def _validate_target(target, dataframe_keys):
+        if target is not None and target not in dataframe_keys:
+            raise ValueError(
+                f"Invalid target parameter: "
+                f" Select the target from the column names of the dataset: {dataframe_keys}")
+
+    @staticmethod
+    def _validate_features(feature_names=None, column_types=None, validate_df=None):
+        if feature_names is not None:
+            if not isinstance(feature_names, list):
+                raise ValueError(
+                    f"Invalid feature_names parameter. Please provide the feature names as a list."
+                )
+            if validate_df is not None:
+                if not set(feature_names).issubset(set(validate_df.columns)):
+                    missing_columns = set(feature_names) - set(validate_df.columns)
+                    raise ValueError(
+                        f"Value mentioned in feature_names is not available in validate_df: {missing_columns} ")
+
+        if column_types is not None and not isinstance(column_types, dict):
+            raise ValueError(
+                f"Invalid column_types parameter. Please provide the feature names as a dictionary."
+            )
+
+    @staticmethod
+    def _validate_classification_threshold_label(classification_labels, classification_threshold=None):
+        if classification_labels is None:
+            raise ValueError(
+                f"Missing classification_labels parameter for classification model."
+            )
         if classification_threshold is not None and not isinstance(classification_threshold, (int, float)):
             raise ValueError(
                 f"Invalid classification_threshold parameter: {classification_threshold}. Please specify valid number."
             )
 
+        if classification_threshold is not None:
+            if classification_threshold != 0.5:
+                if len(classification_labels) != 2:
+                    raise ValueError(
+                        f"Invalid classification_threshold parameter: {classification_threshold} value is applicable "
+                        f"only for binary classification. "
+                    )
+
+    @staticmethod
+    def _validate_label_with_target(classification_labels, target_values=None):
+        if target_values is not None:
+            if set(target_values) != set(classification_labels):
+                raise ValueError(
+                    f"Invalid classification_labels parameter: {classification_labels} do not match with"
+                    f" target column values{target_values}."
+                )
+
     @staticmethod
-    def _validate_classification_labels(classification_labels, prediction_task):
+    def _validate_classification_labels(classification_labels, model_type):
         res = None
-        if prediction_task == SupportedModelTypes.CLASSIFICATION.value:
+        if model_type == SupportedModelTypes.CLASSIFICATION.value:
             if (
                     classification_labels is not None
                     and hasattr(classification_labels, "__iter__")
@@ -167,30 +221,48 @@ def _validate_classification_labels(classification_labels, prediction_task):
                     res: Optional[List[str]] = [str(label) for label in classification_labels]
                 else:
                     raise ValueError(
-                        f"Invalid classification_labels parameter: {classification_labels}. Please specify more than 1 label."
+                        f"Invalid classification_labels parameter: {classification_labels}. "
+                        f"Please specify more than 1 label."
                     )
             else:
                 raise ValueError(
-                    f"Invalid classification_labels parameter: {classification_labels}. Please specify valid list of strings."
+                    f"Invalid classification_labels parameter: {classification_labels}. "
+                    f"Please specify valid list of strings."
                 )
+        if model_type == SupportedModelTypes.REGRESSION.value and classification_labels is not None:
+            warnings.warn("'classification_labels' parameter is ignored for regression model")
+            res = None
         return res
 
     @staticmethod
-    def _validate_model_execution(prediction_function, df: pd.DataFrame) -> None:
+    def _validate_model_execution(prediction_function, df: pd.DataFrame, model_type, classification_labels) -> None:
         prediction = prediction_function(df)
-        if not isinstance(prediction, np.ndarray):
-            raise ValueError("Model should return numpy array")
+        if isinstance(prediction, np.ndarray) or isinstance(prediction, list):
+            if model_type == SupportedModelTypes.CLASSIFICATION.value:
+                if not any(isinstance(y, float) for x in prediction for y in x):
+                    raise ValueError("Model prediction should return float values ")
+            if model_type == SupportedModelTypes.REGRESSION.value:
+                if not any(isinstance(x, float) for x in prediction):
+                    raise ValueError("Model prediction should return float values ")
+        else:
+            raise ValueError("Model should return numpy array or a list")
+
+        GiskardProject._validate_classification_prediction(classification_labels, model_type, prediction)
+
+    @staticmethod
+    def _validate_classification_prediction(classification_labels, model_type, prediction):
+        if model_type == SupportedModelTypes.CLASSIFICATION.value:
+            if np.all(np.round(np.sum(prediction, axis=1), 2) != 1):
+                raise ValueError("Invalid Classification Model prediction. Sum of all probabilities should be 1 ")
+            if prediction.shape[1] != len(classification_labels):
+                raise ValueError("Prediction output label shape and classification_labels shape do not match")
 
     @staticmethod
     def validate_df(df: pd.DataFrame, input_types) -> pd.DataFrame:
-        if set(input_types.values()) < set(df.columns):
-            missing_columns = set(df.columns) - set(input_types.values())
-            raise ValueError(f"Missing input_types for columns: {missing_columns}")
-        elif set(input_types.values()) > set(df.columns):
-            missing_columns = set(input_types.values()) - set(df.columns)
-            raise ValueError(
-                f"Missing columns in dataframe according to input_types: {missing_columns}"
-            )
+        if not set(input_types.keys()).issubset(set(df.columns)):
+            missing_columns = set(input_types.keys()) - set(df.columns)
+            raise ValueError(f"Value mentioned in column_types is not available in dataframe: {missing_columns} ")
+
         else:
             pandas_inferred_input_types = df.dtypes.to_dict()
             for column, dtype in pandas_inferred_input_types.items():

diff --git a/tests/test_upload.py b/tests/test_upload.py
@@ -1,6 +1,6 @@
 import re
 from io import BytesIO
-
+import pytest
 import httpretty
 import numpy as np
 import pandas as pd
@@ -18,11 +18,26 @@ def test_upload_df(diabetes_dataset):
         httpretty.POST,
         "http://giskard-host:12345/api/v2/project/data/upload"
     )
-
     df, input_types, target = diabetes_dataset
+    dataset_name = "diabetes dataset"
     client = GiskardClient("http://giskard-host:12345", "SECRET_TOKEN")
     project = GiskardProject(client.session, "test-project")
-    project.upload_df(df, input_types, target, name="diabetes dataset")
+
+    with pytest.raises(Exception):  # Error Scenario
+        project.upload_df(
+            df=df,
+            column_types=input_types,
+            target=target,
+            name=dataset_name)
+    with pytest.raises(Exception):  # Error Scenario
+        project.upload_df(df=df,
+                          column_types={"test":"test"},
+                          name=dataset_name)
+
+    project.upload_df(df=df,
+                      column_types=input_types,
+                      name=dataset_name)
+
     req = httpretty.last_request()
     assert req.headers.get('Authorization') == 'Bearer SECRET_TOKEN'
     assert int(req.headers.get('Content-Length')) > 0
@@ -47,12 +62,20 @@ def test_upload_model(linear_regression_diabetes: GiskardModel, diabetes_dataset
     client = GiskardClient("http://giskard-host:12345", "SECRET_TOKEN")
     project = GiskardProject(client.session, "test-project")
     project.upload_model(
-        model.prediction_function,
-        model.model_type,
-        model.feature_names,
-        "uploaded model",
+        prediction_function=model.prediction_function,
+        model_type=model.model_type,
+        feature_names=model.feature_names,
+        name="uploaded model",
         validate_df=df
     )
+    with pytest.raises(Exception):
+        project.upload_model(
+            prediction_function=model.prediction_function,
+            model_type=model.model_type,
+            feature_names=input_types,
+            name="uploaded model",
+            validate_df=df
+        )
     req = httpretty.last_request()
     assert req.headers.get('Authorization') == 'Bearer SECRET_TOKEN'
     assert int(req.headers.get('Content-Length')) > 0