
Commit 2c05210
feat: complete the verification part
larry-neil-1206 committed Nov 15, 2024
1 parent 68580c5 commit 2c05210
Showing 11 changed files with 546 additions and 47 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/build-and-release.yml
@@ -87,6 +87,8 @@ jobs:
if: github.event_name == 'push' && github.ref == 'refs/heads/main'
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
with:
tag_name: v${{ github.run_number }}
name: Release v${{ github.run_number }}
3 changes: 2 additions & 1 deletion .gitignore
@@ -11,4 +11,5 @@ demo/input/.DS_Store
.env
temp/
venv
input/
input/
temp.txt
4 changes: 2 additions & 2 deletions README.md
@@ -66,7 +66,7 @@ docker build -t my-proof .
```

```
docker run --rm --volume $(pwd)/demo/sealed:/sealed --volume $(pwd)/demo/input:/input --volume $(pwd)/demo/output:/output --env USER_EMAIL=<your-email> my-proof
docker run --rm --volume $(pwd)/demo/sealed:/sealed --volume $(pwd)/demo/input:/input --volume $(pwd)/demo/output:/output -e AWS_ACCESS_KEY_ID=<your-access-key-id> -e AWS_SECRET_ACCESS_KEY=<your-secret-access-key> my-proof
```

## Building and Releasing
@@ -119,7 +119,7 @@ curl -L https://address/of/gsc-my-proof.tar.gz | docker load
To run the image:

```
docker run --rm --volume /gsc-my-proof/input:/input --volume /gsc-my-proof/output:/output --device /dev/sgx_enclave:/dev/sgx_enclave --volume /var/run/aesmd:/var/run/aesmd --volume /mnt/gsc-my-proof/sealed:/sealed --env USER_EMAIL=<your-email> gsc-my-proof
docker run --rm --volume /gsc-my-proof/input:/input --volume /gsc-my-proof/output:/output --device /dev/sgx_enclave:/dev/sgx_enclave --volume /var/run/aesmd:/var/run/aesmd --volume /mnt/gsc-my-proof/sealed:/sealed gsc-my-proof
```

Remember to populate the `/input` directory with the files you want to process.
Binary file removed demo/input/archive.zip
6 changes: 4 additions & 2 deletions my_proof/__main__.py
@@ -19,7 +19,9 @@ def load_config() -> Dict[str, Any]:
'dlp_id': 5,
'use_sealing': os.path.isdir(SEALED_DIR),
'input_dir': INPUT_DIR,
'user_email': os.environ.get('USER_EMAIL', None),
'aws_access_key_id': os.environ.get('AWS_ACCESS_KEY_ID', None),
'aws_secret_access_key': os.environ.get('AWS_SECRET_ACCESS_KEY', None),
# 'user_email': os.environ.get('USER_EMAIL', None),
}
logging.info(f"Using config: {json.dumps(config, indent=2)}")
return config
@@ -32,7 +34,7 @@ def run() -> None:

if not input_files_exist:
raise FileNotFoundError(f"No input files found in {INPUT_DIR}")
extract_input()
# extract_input()

proof = Proof(config)
proof_response = proof.generate()
27 changes: 27 additions & 0 deletions my_proof/aws_interaction.py
@@ -0,0 +1,27 @@
import boto3
import json
import os
from datetime import datetime

def download_json_from_s3(bucket_name, file_key, aws_access_key_id, aws_secret_access_key):

# Initialize S3 client
s3 = boto3.client(
's3',
aws_access_key_id=aws_access_key_id,
aws_secret_access_key=aws_secret_access_key
)

try:
# Download the file from S3
response = s3.get_object(Bucket=bucket_name, Key=file_key)

# Read the content and parse it as JSON
content = response['Body'].read().decode('utf-8')
json_data = json.loads(content)

return json_data
except Exception as e:
print(f"Error downloading or parsing JSON from S3: {str(e)}")
return None
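
As rough orientation for reviewers, a minimal usage sketch of the helper above; the bucket and key names are hypothetical, and the credentials are assumed to come from the same `AWS_*` environment variables that `__main__.py` reads:

```
import os

from my_proof.aws_interaction import download_json_from_s3

# Hypothetical bucket/key, shown only to illustrate the call signature.
data = download_json_from_s3(
    bucket_name="example-bucket",
    file_key="path/to/data.json",
    aws_access_key_id=os.environ.get("AWS_ACCESS_KEY_ID"),
    aws_secret_access_key=os.environ.get("AWS_SECRET_ACCESS_KEY"),
)

if data is None:
    # The helper returns None when the download or the JSON parsing fails.
    print("Could not fetch or parse the remote JSON file")
```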

96 changes: 96 additions & 0 deletions my_proof/hash_manager.py
@@ -0,0 +1,96 @@
import boto3
import json
from datetime import datetime
import logging
import hashlib

class HashManager:
def __init__(self, bucket_name, remote_file_key, aws_access_key_id, aws_secret_access_key):
# Initialize S3 client with credentials
self.s3_client = boto3.client(
's3',
aws_access_key_id=aws_access_key_id,
aws_secret_access_key=aws_secret_access_key
)
self.bucket_name = bucket_name
self.remote_file_key = remote_file_key

def _initialize_empty_hash_file(self):
"""Initialize an empty hash file in S3"""
data = {
'hashes': [],
'lastUpdated': datetime.utcnow().isoformat() + 'Z'
}
self.s3_client.put_object(
Bucket=self.bucket_name,
Key=self.remote_file_key,
Body=json.dumps(data, indent=2),
ContentType='application/json'
)
return []

def get_remote_hashes(self):
"""Fetch hashes from remote S3 JSON file"""
try:
response = self.s3_client.get_object(
Bucket=self.bucket_name,
Key=self.remote_file_key
)
data = json.loads(response['Body'].read().decode('utf-8'))
return data.get('hashes', [])
except self.s3_client.exceptions.NoSuchKey:
# If file doesn't exist, create it and return empty list
return self._initialize_empty_hash_file()
except Exception as e:
logging.error(f"Error fetching remote hashes: {str(e)}")
raise

def update_remote_hashes(self, new_hashes):
"""Update remote JSON file with new hashes"""
try:
data = {
'hashes': new_hashes,
'lastUpdated': datetime.utcnow().isoformat() + 'Z'
}

self.s3_client.put_object(
Bucket=self.bucket_name,
Key=self.remote_file_key,
Body=json.dumps(data, indent=2),
ContentType='application/json'
)
return True
except Exception as e:
logging.error(f"Error updating remote hashes: {str(e)}")
raise

def add_hash(self, new_hash):
"""Add a single hash to the remote file"""
current_hashes = self.get_remote_hashes()
if new_hash not in current_hashes:
current_hashes.append(new_hash)
self.update_remote_hashes(current_hashes)
return True
return False

def remove_hash(self, hash_to_remove):
"""Remove a hash from the remote file"""
current_hashes = self.get_remote_hashes()
if hash_to_remove in current_hashes:
current_hashes.remove(hash_to_remove)
self.update_remote_hashes(current_hashes)
return True
return False

def generate_hash(self, input_string):
"""Generate a SHA-256 hash from an input string
Args:
input_string (str): The string to hash
Returns:
str: The hexadecimal representation of the hash
"""
# Encode the string to bytes and generate hash
hash_object = hashlib.sha256(str(input_string).encode())
return hash_object.hexdigest()
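
For orientation, a minimal sketch of how `HashManager` is meant to be driven; the bucket and key below mirror the values hard-coded later in `proof.py`'s `Uniqueness` check, and the credentials are assumed to come from the environment:

```
import os

from my_proof.hash_manager import HashManager

hash_manager = HashManager(
    bucket_name="vanatensordlp",
    remote_file_key="verified_hashes/hashes.json",
    aws_access_key_id=os.environ.get("AWS_ACCESS_KEY_ID"),
    aws_secret_access_key=os.environ.get("AWS_SECRET_ACCESS_KEY"),
)

# Hash an arbitrary submission payload and record it if it has not been seen before.
digest = hash_manager.generate_hash('{"example": "submission"}')
if digest in hash_manager.get_remote_hashes():
    print("Duplicate submission")
else:
    hash_manager.add_hash(digest)
```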
12 changes: 7 additions & 5 deletions my_proof/models/proof_response.py
@@ -24,9 +24,11 @@ class ProofResponse(BaseModel):
dlp_id: int
valid: bool = False
score: float = 0.0
authenticity: float = 0.0
ownership: float = 0.0
quality: float = 0.0
time_minimums: float = 0.0
time_correlation: float = 0.0
time_distribution: float = 0.0
repeat_anwsers: float = 0.0
both_sides: float = 0.0
model_distribution: float = 0.0
poison_data: float = 0.0
uniqueness: float = 0.0
attributes: Optional[Dict[str, Any]] = {}
metadata: Optional[Dict[str, Any]] = {}
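
To make the new fields concrete, a hedged sketch of a populated `ProofResponse`; every numeric value below is illustrative only, and the real scores come from the checks in `proof.py`:

```
from my_proof.models.proof_response import ProofResponse

example = ProofResponse(
    dlp_id=5,
    valid=True,
    score=0.82,
    time_minimums=1.0,
    time_correlation=0.9,
    time_distribution=0.8,
    repeat_anwsers=1.0,   # field name spelled as in the model
    both_sides=0.7,
    model_distribution=0.6,
    poison_data=1.0,
    uniqueness=1.0,
)

# Serialize for logging or for writing to the output directory
# (use .dict() on pydantic v1, .model_dump() on pydantic v2).
print(example.dict())
```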
171 changes: 135 additions & 36 deletions my_proof/proof.py
@@ -1,67 +1,166 @@
import json
import logging
import os
from typing import Dict, Any
from typing import Dict, Any, List
from my_proof.hash_manager import HashManager
from rich.console import Console
from rich.table import Table

import requests

from my_proof.models.proof_response import ProofResponse

from my_proof.tests import *

top_weights = {
'Authenticity':0.2,
'Quality':0.7,
'Uniquness':0.1
}
test_weights = {
'Time_Minimums':0.1,
'Time_Correlation':0.2,
'Time_Distribution':0.1,
'Repeat_Anwsers':0.15,
'Both_Sides':0.15,
'Model_Distribution':0.05,
'Poisin_Data':0.25,
}

class Proof:
def __init__(self, config: Dict[str, Any]):
self.config = config
logging.info(f"Config: {self.config}")
self.proof_response = ProofResponse(dlp_id=config['dlp_id'])
self.aws_access_key_id = config['aws_access_key_id']
self.aws_secret_access_key = config['aws_secret_access_key']

def generate(self) -> ProofResponse:
"""Generate proofs for all input files."""
logging.info("Starting proof generation")

# Iterate through files and calculate data validity
account_email = None
total_score = 0
# account_email = None
# total_score = 0

for input_filename in os.listdir(self.config['input_dir']):
input_file = os.path.join(self.config['input_dir'], input_filename)
if os.path.splitext(input_file)[1].lower() == '.json':
with open(input_file, 'r') as f:
input_data = json.load(f)

if input_filename == 'account.json':
account_email = input_data.get('email', None)
continue

elif input_filename == 'activity.json':
total_score = sum(item['score'] for item in input_data)
continue

email_matches = self.config['user_email'] == account_email
score_threshold = fetch_random_number()

# Calculate proof-of-contribution scores: https://docs.vana.org/vana/core-concepts/key-elements/proof-of-contribution/example-implementation
self.proof_response.ownership = 1.0 if email_matches else 0.0 # Does the data belong to the user? Or is it fraudulent?
self.proof_response.quality = max(0, min(total_score / score_threshold, 1.0)) # How high quality is the data?
self.proof_response.authenticity = 0 # How authentic is the data is (ie: not tampered with)? (Not implemented here)
self.proof_response.uniqueness = 0 # How unique is the data relative to other datasets? (Not implemented here)

# Calculate overall score and validity
self.proof_response.score = 0.6 * self.proof_response.quality + 0.4 * self.proof_response.ownership
self.proof_response.valid = email_matches and total_score >= score_threshold

# Additional (public) properties to include in the proof about the data
self.proof_response.attributes = {
'total_score': total_score,
'score_threshold': score_threshold,
'email_verified': email_matches,
}

# Additional metadata about the proof, written onchain
self.proof_response.metadata = {
'dlp_id': self.config['dlp_id'],
}
qualityRes = Quality(input_data, self.aws_access_key_id, self.aws_secret_access_key)
self.proof_response.score = qualityRes['score']
self.proof_response.valid = qualityRes['score'] > 0.65
self.proof_response.time_minimums = qualityRes['Time_Minimums']['score']
self.proof_response.time_correlation = qualityRes['Time_Correlation']['score']
self.proof_response.time_distribution = qualityRes['Time_Distribution']['score']
self.proof_response.repeat_anwsers = qualityRes['Repeat_Anwsers']['score']
self.proof_response.both_sides = qualityRes['Both_Sides']['score']
self.proof_response.model_distribution = qualityRes['Model_Distribution']['score']
self.proof_response.poison_data = qualityRes['Poisin_Data']['score']

self.proof_response.uniqueness = Uniqueness(input_data, self.aws_access_key_id, self.aws_secret_access_key)

return self.proof_response

def Quality(data_list: List[Dict[str, Any]], aws_access_key_id: str, aws_secret_access_key: str) -> Dict[str, Any]:
# All tests:
# - average time taken is less than 5 seconds
# - time taken correlates with response length
# - distribution in the times taken
# - answering repeat questions the same way (check for duplicate uniqueIDs with different 'chosen' values)
# - choosing both option 1 and option 2 (analyze the distribution of 'chosen' values)
# - check for model bias in 'chosen' responses; an even distribution is not expected, since the 70b model should outperform the 7b
# - randomness test using a Chi-squared test, to make sure there is some spread in what gets chosen
# - check poisoned data and make sure the expected response was chosen
# 8 separate tests
report = {
'Time_Minimums':Time_Minimums(data_list),
'Time_Correlation':Character_Timing(data_list),
'Time_Distribution':Time_Distribution(data_list),
'Repeat_Anwsers':Duplicate_ID_Check(data_list),
'Both_Sides':Choice_Distribution(data_list),
'Model_Distribution':Model_Bias(data_list),
'Poisin_Data':Poison_Consistency(data_list, aws_access_key_id, aws_secret_access_key),
'score':0
}
report['score'] = sum(test_weights[test] * report[test]['score'] for test in test_weights)
print(report)
display_report(report)
return report

def Uniqueness(data_list: List[Dict[str, Any]], aws_access_key_id: str, aws_secret_access_key: str) -> float:
hash_manager = HashManager(bucket_name="vanatensordlp", remote_file_key="verified_hashes/hashes.json", aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key)
hash_exists = hash_manager.generate_hash(data_list) in hash_manager.get_remote_hashes()
if hash_exists:
return 0.0
else:
hash_manager.add_hash(hash_manager.generate_hash(data_list))
return 1.0

def display_report(report: dict) -> None:
console = Console()

# Create main score table
main_score = Table(title="[bold magenta]Quality Assessment Report[/bold magenta]",
show_header=True,
header_style="bold cyan")
main_score.add_column("Overall Score", justify="center", style="bold")
main_score.add_row(f"{report['score']:.2%}")

# Create detailed results table
results = Table(show_header=True, header_style="bold cyan",
title="[bold magenta]Detailed Test Results[/bold magenta]")
results.add_column("Test", style="bold green")
results.add_column("Score", justify="center")
results.add_column("Status", justify="center")
results.add_column("Details", justify="left")

# Test result emojis
PASS = "✅"
PARTIAL = "⚠️"
FAIL = "❌"

# Mapping of score ranges to status
def get_status(score):
if score >= 0.8: return (PASS, "green")
if score >= 0.4: return (PARTIAL, "yellow")
return (FAIL, "red")

# Add each test result
for test_name, data in report.items():
if test_name == 'score':
continue

score = data['score']
status_emoji, color = get_status(score)

# Format comments as a single string with line breaks
comments = '\n'.join(data['comments'])

results.add_row(
test_name.replace('_', ' '),
f"[{color}]{score:.2%}[/{color}]",
status_emoji,
comments
)

# Print the report
console.print()
console.print(main_score, justify="center")
console.print()
console.print(results)
console.print()

# Add a summary footer
if report['score'] >= 0.8:
console.print("[bold green]Overall Assessment: EXCELLENT[/bold green]", justify="center")
elif report['score'] >= 0.6:
console.print("[bold yellow]Overall Assessment: GOOD[/bold yellow]", justify="center")
elif report['score'] >= 0.4:
console.print("[bold yellow]Overall Assessment: FAIR[/bold yellow]", justify="center")
else:
console.print("[bold red]Overall Assessment: NEEDS IMPROVEMENT[/bold red]", justify="center")

def fetch_random_number() -> float:
"""Demonstrate HTTP requests by fetching a random number from random.org."""