
Commit 2c05210
feat: complete the verification part
larry-neil-1206 committed Nov 15, 2024
1 parent 68580c5 commit 2c05210
Showing 11 changed files with 546 additions and 47 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/build-and-release.yml
@@ -87,6 +87,8 @@ jobs:
if: github.event_name == 'push' && github.ref == 'refs/heads/main'
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
with:
tag_name: v${{ github.run_number }}
name: Release v${{ github.run_number }}
3 changes: 2 additions & 1 deletion .gitignore
@@ -11,4 +11,5 @@ demo/input/.DS_Store
.env
temp/
venv
input/
input/
temp.txt
4 changes: 2 additions & 2 deletions README.md
@@ -66,7 +66,7 @@ docker build -t my-proof .
```

```
docker run --rm --volume $(pwd)/demo/sealed:/sealed --volume $(pwd)/demo/input:/input --volume $(pwd)/demo/output:/output --env USER_EMAIL=<your-email> my-proof
docker run --rm --volume $(pwd)/demo/sealed:/sealed --volume $(pwd)/demo/input:/input --volume $(pwd)/demo/output:/output -e AWS_ACCESS_KEY_ID=<your-access-key-id> -e AWS_SECRET_ACCESS_KEY=<your-secret-access-key> my-proof
```

## Building and Releasing
@@ -119,7 +119,7 @@ curl -L https://address/of/gsc-my-proof.tar.gz | docker load
To run the image:

```
docker run --rm --volume /gsc-my-proof/input:/input --volume /gsc-my-proof/output:/output --device /dev/sgx_enclave:/dev/sgx_enclave --volume /var/run/aesmd:/var/run/aesmd --volume /mnt/gsc-my-proof/sealed:/sealed --env USER_EMAIL=<your-email> gsc-my-proof
docker run --rm --volume /gsc-my-proof/input:/input --volume /gsc-my-proof/output:/output --device /dev/sgx_enclave:/dev/sgx_enclave --volume /var/run/aesmd:/var/run/aesmd --volume /mnt/gsc-my-proof/sealed:/sealed gsc-my-proof
```

Remember to populate the `/input` directory with the files you want to process.
Binary file removed demo/input/archive.zip
6 changes: 4 additions & 2 deletions my_proof/__main__.py
@@ -19,7 +19,9 @@ def load_config() -> Dict[str, Any]:
'dlp_id': 5,
'use_sealing': os.path.isdir(SEALED_DIR),
'input_dir': INPUT_DIR,
'user_email': os.environ.get('USER_EMAIL', None),
'aws_access_key_id': os.environ.get('AWS_ACCESS_KEY_ID', None),
'aws_secret_access_key': os.environ.get('AWS_SECRET_ACCESS_KEY', None),
# 'user_email': os.environ.get('USER_EMAIL', None),
}
logging.info(f"Using config: {json.dumps(config, indent=2)}")
return config
@@ -32,7 +34,7 @@ def run() -> None:

if not input_files_exist:
raise FileNotFoundError(f"No input files found in {INPUT_DIR}")
extract_input()
# extract_input()

proof = Proof(config)
proof_response = proof.generate()
27 changes: 27 additions & 0 deletions my_proof/aws_interaction.py
@@ -0,0 +1,27 @@
import boto3
import json
import os
from datetime import datetime

def download_json_from_s3(bucket_name, file_key, aws_access_key_id, aws_secret_access_key):

# Initialize S3 client
s3 = boto3.client(
's3',
aws_access_key_id=aws_access_key_id,
aws_secret_access_key=aws_secret_access_key
)

try:
# Download the file from S3
response = s3.get_object(Bucket=bucket_name, Key=file_key)

# Read the content and parse it as JSON
content = response['Body'].read().decode('utf-8')
json_data = json.loads(content)

return json_data
except Exception as e:
print(f"Error downloading or parsing JSON from S3: {str(e)}")
return None
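
As rough orientation for reviewers, a minimal usage sketch of the helper above; the bucket and key names are hypothetical, and the credentials are assumed to come from the same `AWS_*` environment variables that `__main__.py` reads:

```
import os

from my_proof.aws_interaction import download_json_from_s3

# Hypothetical bucket/key, shown only to illustrate the call signature.
data = download_json_from_s3(
    bucket_name="example-bucket",
    file_key="path/to/data.json",
    aws_access_key_id=os.environ.get("AWS_ACCESS_KEY_ID"),
    aws_secret_access_key=os.environ.get("AWS_SECRET_ACCESS_KEY"),
)

if data is None:
    # The helper returns None when the download or the JSON parsing fails.
    print("Could not fetch or parse the remote JSON file")
```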

96 changes: 96 additions & 0 deletions my_proof/hash_manager.py
@@ -0,0 +1,96 @@
import boto3
import json
from datetime import datetime
import logging
import hashlib

class HashManager:
def __init__(self, bucket_name, remote_file_key, aws_access_key_id, aws_secret_access_key):
# Initialize S3 client with credentials
self.s3_client = boto3.client(
's3',
aws_access_key_id=aws_access_key_id,
aws_secret_access_key=aws_secret_access_key
)
self.bucket_name = bucket_name
self.remote_file_key = remote_file_key

def _initialize_empty_hash_file(self):
"""Initialize an empty hash file in S3"""
data = {
'hashes': [],
'lastUpdated': datetime.utcnow().isoformat() + 'Z'
}
self.s3_client.put_object(
Bucket=self.bucket_name,
Key=self.remote_file_key,
Body=json.dumps(data, indent=2),
ContentType='application/json'
)
return []

def get_remote_hashes(self):
"""Fetch hashes from remote S3 JSON file"""
try:
response = self.s3_client.get_object(
Bucket=self.bucket_name,
Key=self.remote_file_key
)
data = json.loads(response['Body'].read().decode('utf-8'))
return data.get('hashes', [])
except self.s3_client.exceptions.NoSuchKey:
# If file doesn't exist, create it and return empty list
return self._initialize_empty_hash_file()
except Exception as e:
logging.error(f"Error fetching remote hashes: {str(e)}")
raise

def update_remote_hashes(self, new_hashes):
"""Update remote JSON file with new hashes"""
try:
data = {
'hashes': new_hashes,
'lastUpdated': datetime.utcnow().isoformat() + 'Z'
}

self.s3_client.put_object(
Bucket=self.bucket_name,
Key=self.remote_file_key,
Body=json.dumps(data, indent=2),
ContentType='application/json'
)
return True
except Exception as e:
logging.error(f"Error updating remote hashes: {str(e)}")
raise

def add_hash(self, new_hash):
"""Add a single hash to the remote file"""
current_hashes = self.get_remote_hashes()
if new_hash not in current_hashes:
current_hashes.append(new_hash)
self.update_remote_hashes(current_hashes)
return True
return False

def remove_hash(self, hash_to_remove):
"""Remove a hash from the remote file"""
current_hashes = self.get_remote_hashes()
if hash_to_remove in current_hashes:
current_hashes.remove(hash_to_remove)
self.update_remote_hashes(current_hashes)
return True
return False

def generate_hash(self, input_string):
"""Generate a SHA-256 hash from an input string
Args:
input_string (str): The string to hash
Returns:
str: The hexadecimal representation of the hash
"""
# Encode the string to bytes and generate hash
hash_object = hashlib.sha256(str(input_string).encode())
return hash_object.hexdigest()
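
For orientation, a minimal sketch of how `HashManager` is meant to be driven; the bucket and key below mirror the values hard-coded later in `proof.py`'s `Uniqueness` check, and the credentials are assumed to come from the environment:

```
import os

from my_proof.hash_manager import HashManager

hash_manager = HashManager(
    bucket_name="vanatensordlp",
    remote_file_key="verified_hashes/hashes.json",
    aws_access_key_id=os.environ.get("AWS_ACCESS_KEY_ID"),
    aws_secret_access_key=os.environ.get("AWS_SECRET_ACCESS_KEY"),
)

# Hash an arbitrary submission payload and record it if it has not been seen before.
digest = hash_manager.generate_hash('{"example": "submission"}')
if digest in hash_manager.get_remote_hashes():
    print("Duplicate submission")
else:
    hash_manager.add_hash(digest)
```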
12 changes: 7 additions & 5 deletions my_proof/models/proof_response.py
@@ -24,9 +24,11 @@ class ProofResponse(BaseModel):
dlp_id: int
valid: bool = False
score: float = 0.0
authenticity: float = 0.0
ownership: float = 0.0
quality: float = 0.0
time_minimums: float = 0.0
time_correlation: float = 0.0
time_distribution: float = 0.0
repeat_anwsers: float = 0.0
both_sides: float = 0.0
model_distribution: float = 0.0
poison_data: float = 0.0
uniqueness: float = 0.0
attributes: Optional[Dict[str, Any]] = {}
metadata: Optional[Dict[str, Any]] = {}
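
To make the new fields concrete, a hedged sketch of a populated `ProofResponse`; every numeric value below is illustrative only, and the real scores come from the checks in `proof.py`:

```
from my_proof.models.proof_response import ProofResponse

example = ProofResponse(
    dlp_id=5,
    valid=True,
    score=0.82,
    time_minimums=1.0,
    time_correlation=0.9,
    time_distribution=0.8,
    repeat_anwsers=1.0,   # field name spelled as in the model
    both_sides=0.7,
    model_distribution=0.6,
    poison_data=1.0,
    uniqueness=1.0,
)

# Serialize for logging or for writing to the output directory
# (use .dict() on pydantic v1, .model_dump() on pydantic v2).
print(example.dict())
```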
171 changes: 135 additions & 36 deletions my_proof/proof.py
@@ -1,67 +1,166 @@
import json
import logging
import os
from typing import Dict, Any
from typing import Dict, Any, List
from my_proof.hash_manager import HashManager
from rich.console import Console
from rich.table import Table

import requests

from my_proof.models.proof_response import ProofResponse

from my_proof.tests import *

top_weights = {
'Authenticity':0.2,
'Quality':0.7,
'Uniquness':0.1
}
test_weights = {
'Time_Minimums':0.1,
'Time_Correlation':0.2,
'Time_Distribution':0.1,
'Repeat_Anwsers':0.15,
'Both_Sides':0.15,
'Model_Distribution':0.05,
'Poisin_Data':0.25,
}

class Proof:
def __init__(self, config: Dict[str, Any]):
self.config = config
logging.info(f"Config: {self.config}")
self.proof_response = ProofResponse(dlp_id=config['dlp_id'])
self.aws_access_key_id = config['aws_access_key_id']
self.aws_secret_access_key = config['aws_secret_access_key']

def generate(self) -> ProofResponse:
"""Generate proofs for all input files."""
logging.info("Starting proof generation")

# Iterate through files and calculate data validity
account_email = None
total_score = 0
# account_email = None
# total_score = 0

for input_filename in os.listdir(self.config['input_dir']):
input_file = os.path.join(self.config['input_dir'], input_filename)
if os.path.splitext(input_file)[1].lower() == '.json':
with open(input_file, 'r') as f:
input_data = json.load(f)

if input_filename == 'account.json':
account_email = input_data.get('email', None)
continue

elif input_filename == 'activity.json':
total_score = sum(item['score'] for item in input_data)
continue

email_matches = self.config['user_email'] == account_email
score_threshold = fetch_random_number()

# Calculate proof-of-contribution scores: https://docs.vana.org/vana/core-concepts/key-elements/proof-of-contribution/example-implementation
self.proof_response.ownership = 1.0 if email_matches else 0.0 # Does the data belong to the user? Or is it fraudulent?
self.proof_response.quality = max(0, min(total_score / score_threshold, 1.0)) # How high quality is the data?
self.proof_response.authenticity = 0 # How authentic is the data is (ie: not tampered with)? (Not implemented here)
self.proof_response.uniqueness = 0 # How unique is the data relative to other datasets? (Not implemented here)

# Calculate overall score and validity
self.proof_response.score = 0.6 * self.proof_response.quality + 0.4 * self.proof_response.ownership
self.proof_response.valid = email_matches and total_score >= score_threshold

# Additional (public) properties to include in the proof about the data
self.proof_response.attributes = {
'total_score': total_score,
'score_threshold': score_threshold,
'email_verified': email_matches,
}

# Additional metadata about the proof, written onchain
self.proof_response.metadata = {
'dlp_id': self.config['dlp_id'],
}
qualityRes = Quality(input_data, self.aws_access_key_id, self.aws_secret_access_key)
self.proof_response.score = qualityRes['score']
self.proof_response.valid = qualityRes['score'] > 0.65
self.proof_response.time_minimums = qualityRes['Time_Minimums']['score']
self.proof_response.time_correlation = qualityRes['Time_Correlation']['score']
self.proof_response.time_distribution = qualityRes['Time_Distribution']['score']
self.proof_response.repeat_anwsers = qualityRes['Repeat_Anwsers']['score']
self.proof_response.both_sides = qualityRes['Both_Sides']['score']
self.proof_response.model_distribution = qualityRes['Model_Distribution']['score']
self.proof_response.poison_data = qualityRes['Poisin_Data']['score']

self.proof_response.uniqueness = Uniqueness(input_data, self.aws_access_key_id, self.aws_secret_access_key)

return self.proof_response

def Quality(data_list: List[Dict[str, Any]], aws_access_key_id: str, aws_secret_access_key: str) -> Dict[str, Any]:
# All tests:
# - average time taken is less than 5 seconds
# - time taken correlates with response length
# - distribution in the times taken
# - answering repeat questions the same way (check for duplicate uniqueIDs with different 'chosen' values)
# - choosing both option 1 and option 2 (analyze the distribution of 'chosen' values)
# - check for model bias in 'chosen' responses; an even distribution is not expected, since the 70b model should outperform the 7b
# - randomness test using a Chi-squared test, to make sure there is some spread in what gets chosen
# - check poisoned data and make sure the expected response was chosen
# 8 separate tests
report = {
'Time_Minimums':Time_Minimums(data_list),
'Time_Correlation':Character_Timing(data_list),
'Time_Distribution':Time_Distribution(data_list),
'Repeat_Anwsers':Duplicate_ID_Check(data_list),
'Both_Sides':Choice_Distribution(data_list),
'Model_Distribution':Model_Bias(data_list),
'Poisin_Data':Poison_Consistency(data_list, aws_access_key_id, aws_secret_access_key),
'score':0
}
report['score'] = sum(test_weights[test] * report[test]['score'] for test in test_weights)
print(report)
display_report(report)
return report

def Uniqueness(data_list: List[Dict[str, Any]], aws_access_key_id: str, aws_secret_access_key: str) -> float:
hash_manager = HashManager(bucket_name="vanatensordlp", remote_file_key="verified_hashes/hashes.json", aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key)
hash_exists = hash_manager.generate_hash(data_list) in hash_manager.get_remote_hashes()
if hash_exists:
return 0.0
else:
hash_manager.add_hash(hash_manager.generate_hash(data_list))
return 1.0

def display_report(report: dict) -> None:
console = Console()

# Create main score table
main_score = Table(title="[bold magenta]Quality Assessment Report[/bold magenta]",
show_header=True,
header_style="bold cyan")
main_score.add_column("Overall Score", justify="center", style="bold")
main_score.add_row(f"{report['score']:.2%}")

# Create detailed results table
results = Table(show_header=True, header_style="bold cyan",
title="[bold magenta]Detailed Test Results[/bold magenta]")
results.add_column("Test", style="bold green")
results.add_column("Score", justify="center")
results.add_column("Status", justify="center")
results.add_column("Details", justify="left")

# Test result emojis
PASS = "✅"
PARTIAL = "⚠️"
FAIL = "❌"

# Mapping of score ranges to status
def get_status(score):
if score >= 0.8: return (PASS, "green")
if score >= 0.4: return (PARTIAL, "yellow")
return (FAIL, "red")

# Add each test result
for test_name, data in report.items():
if test_name == 'score':
continue

score = data['score']
status_emoji, color = get_status(score)

# Format comments as a single string with line breaks
comments = '\n'.join(data['comments'])

results.add_row(
test_name.replace('_', ' '),
f"[{color}]{score:.2%}[/{color}]",
status_emoji,
comments
)

# Print the report
console.print()
console.print(main_score, justify="center")
console.print()
console.print(results)
console.print()

# Add a summary footer
if report['score'] >= 0.8:
console.print("[bold green]Overall Assessment: EXCELLENT[/bold green]", justify="center")
elif report['score'] >= 0.6:
console.print("[bold yellow]Overall Assessment: GOOD[/bold yellow]", justify="center")
elif report['score'] >= 0.4:
console.print("[bold yellow]Overall Assessment: FAIR[/bold yellow]", justify="center")
else:
console.print("[bold red]Overall Assessment: NEEDS IMPROVEMENT[/bold red]", justify="center")

def fetch_random_number() -> float:
"""Demonstrate HTTP requests by fetching a random number from random.org."""