-
Notifications
You must be signed in to change notification settings - Fork 0
/
process.py
161 lines (123 loc) · 6.09 KB
/
process.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
# -*- coding: utf-8 -*-
"""Process
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1Q1UxrmNHSKvNRqlksj7Lf0sObE5Zeq74
"""
import psycopg2
import json
import os
from dotenv import load_dotenv
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
# Constants
# Hugging Face repository id passed to hf_hub_download below.
MODEL_NAME = "TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF"
# NOTE: this call runs at import time. Presumably it downloads (or reuses a
# cached copy of) the named GGUF file and returns its local path -- confirm
# against the huggingface_hub documentation.
MODEL_PATH = hf_hub_download(repo_id=MODEL_NAME, filename="mixtral-8x7b-instruct-v0.1.Q5_K_M.gguf")
# Used as the LIMIT of the SELECT in process_batches.
BATCH_SIZE = 10 # Number of records to process in each batch
# Default value of the `timeout` argument of set_statement_timeout.
TIMEOUT = 5000 # Statement timeout in milliseconds
# Load Llama model
# The model is instantiated once at import time and shared by process_triple.
# NOTE(review): the thread/batch/context/GPU-layer values are hard-coded for a
# specific machine -- verify they fit the deployment host.
model = Llama(
model_path=MODEL_PATH,
n_threads=64,
n_batch=128,
n_ctx=4096,
n_gpu_layers=96,
mlock=True,
# process_triple requests logprobs; presumably logits_all=True is required
# for that -- confirm against the llama-cpp-python documentation.
logits_all=True
)
# Load environment variables
# Must run before the functions below are called: they read the DB_* settings
# and STARTING_ID through os.getenv.
load_dotenv('.env')
# Establish database connection
def create_db_connection():
    """Open a PostgreSQL connection configured from environment variables.

    The connection parameters are taken from ``DB_NAME``, ``DB_USER``,
    ``DB_PASSWORD``, ``DB_HOST`` and ``DB_PORT``; a variable that is not set
    is passed to psycopg2 as ``None``.

    Returns:
        The connection object returned by ``psycopg2.connect``.
    """
    # psycopg2 keyword -> environment variable that supplies its value.
    env_by_param = {
        "dbname": "DB_NAME",
        "user": "DB_USER",
        "password": "DB_PASSWORD",
        "host": "DB_HOST",
        "port": "DB_PORT",
    }
    connect_kwargs = {
        param: os.getenv(env_name) for param, env_name in env_by_param.items()
    }
    # Set connection timeout to 10 seconds.
    connect_kwargs["connect_timeout"] = 10
    return psycopg2.connect(**connect_kwargs)
# Set statement timeout for long-running queries
def set_statement_timeout(cursor, timeout=TIMEOUT):
    """Apply a statement timeout to the session behind ``cursor``.

    Args:
        cursor: Database cursor used to execute the ``SET`` command.
        timeout: Value interpolated into ``SET statement_timeout``; the
            module constant ``TIMEOUT`` (milliseconds) is the default.
    """
    timeout_sql = f"SET statement_timeout = {timeout};"
    cursor.execute(timeout_sql)
# Read starting ID from environment
def read_starting_id():
    """Return the record ID at which processing should start.

    The value is read from the ``STARTING_ID`` environment variable.

    Returns:
        int: The parsed ID, or 0 when the variable is unset or blank.

    Raises:
        ValueError: If the variable is set to something that is not an
            integer.
    """
    raw_value = os.getenv("STARTING_ID")
    # A ``STARTING_ID=`` entry with no value yields an empty string, which
    # int() rejects; treat it the same as an unset variable.
    if raw_value is None or not raw_value.strip():
        return 0
    try:
        return int(raw_value)
    except ValueError as err:
        # Name the offending variable so the failure is easy to diagnose.
        raise ValueError(
            f"STARTING_ID must be an integer, got {raw_value!r}"
        ) from err
# Save updated ID to the .env file
def save_updated_id(id_value):
    """Persist the resume point as ``STARTING_ID`` in the ``.env`` file.

    Every existing line that starts with ``STARTING_ID=`` is replaced by the
    new entry; if there is none, the entry is appended. All other lines are
    written back unchanged. A missing ``.env`` file is created.

    Args:
        id_value: Value written after ``STARTING_ID=``.
    """
    entry = f"STARTING_ID={id_value}\n"

    # python-dotenv reads the file as UTF-8 by default, so read and write it
    # with the same encoding instead of the locale default.
    try:
        with open('.env', 'r', encoding='utf-8') as file:
            lines = file.readlines()
    except FileNotFoundError:
        # No .env yet (load_dotenv tolerates that too): start a new file.
        lines = []

    updated_lines = []
    replaced = False
    for line in lines:
        if line.startswith("STARTING_ID="):
            updated_lines.append(entry)
            replaced = True
        else:
            updated_lines.append(line)

    if not replaced:
        # If the last line has no trailing newline, terminate it first so the
        # new entry is not fused onto it (e.g. "DB_PORT=5432STARTING_ID=10").
        if updated_lines and not updated_lines[-1].endswith("\n"):
            updated_lines[-1] += "\n"
        updated_lines.append(entry)

    with open('.env', 'w', encoding='utf-8') as file:
        file.writelines(updated_lines)
# Process a triple and sentence using Llama model
def process_triple(triple, sentence):
    """Ask the Llama model whether ``sentence`` supports ``triple``.

    The prompt holds six worked examples (three answered "Yes", three
    answered "No"), a system instruction, and the actual question built from
    the two arguments. Generation is limited to one token at temperature 0.

    Args:
        triple: Text of the triple inserted into the question.
        sentence: Text of the statement inserted into the question.

    Returns:
        tuple: ``(text, logprobs_json)`` -- the generated text of the first
        choice and that choice's ``logprobs`` entry serialized as JSON.
    """
    # The prompt lines stay flush left: indentation would become part of the
    # text sent to the model.
    prompt = f"""
Context:
USER: (''Is the triple "Phase related to Follicle stimulating hormone measurement" directly or indirectly supported by the sentence: "In pre-menopause healthy females, blood was sampled weekly during one menstruation cycle and menstruation phases (follicular, ovulatory, luteal) were determined by FSH/LH levels."?',)
ASSISTANT: Yes
USER: (''Is the triple "Phase related to Sodium measurement" directly or indirectly supported by the sentence: "Based on a biophysical photoreceptor model, the Na(+)- and Ca(2+)-currents and concentration changes were determined from the first transient depolarization phase of the photoreceptor response."?',)
ASSISTANT:Yes
USER: (''Is the triple "Phase related to Bronchoalveolar Lavage" directly or indirectly supported by the sentence: "Challenge of the airways of sensitized guinea pigs with aerosolized ovalbumin resulted in an early phase of microvascular protein leakage and a delayed phase of eosinophil accumulation in the airway lumen, as measured using bronchoalveolar lavage (BAL)."?',)
ASSISTANT: Yes
USER: (''Does the phrase "Ciprofloxacin related to DNA Gyrase" receive at least indirect support from the statement: "Effect of ranolazine in preventing postoperative atrial fibrillation in patients undergoing coronary revascularization surgery."?',)
ASSISTANT: No
USER: (''Does the phrase "Ciprofloxacin related to Crohn disease" receive at least indirect support from the statement: "Recent evidence of beneficial effects of ranolazine (RAN) in type II diabetes motivates interest in the role of the late sodium current (INaL) in glucose-stimulated insulin secretion."?',)
ASSISTANT: No
USER: (''Does the phrase "Ciprofloxacin related to endophthalmitis" receive at least indirect support from the statement: "Furthermore, the activated Akt/mTOR signaling pathway induced by AF was further activated by ranolazine."?',)
ASSISTANT: No
SYSTEM: You are a computational biologist tasked with evaluating scientific claims. Your role requires you to apply critical thinking and your expertise to interpret data and research findings accurately. Answer 'Yes' or 'No' to directly address the query posed.
USER: ('\'Does the phrase "{triple}" receive at least indirect support from the statement: "{sentence}"?',).
ASSISTANT:
"""
    completion = model(
        prompt=prompt,
        max_tokens=1,
        temperature=0,
        echo=False,
        logprobs=True,
    )
    first_choice = completion["choices"][0]
    answer_text = first_choice["text"]
    logprobs_json = json.dumps(first_choice["logprobs"])
    return answer_text, logprobs_json
# Process batches of records from the database
def process_batches(cursor, starting_id):
    """Classify and update records batch by batch, starting at ``starting_id``.

    Repeatedly selects up to ``BATCH_SIZE`` rows of
    ``public."tblBiomedicalFactcheck"`` with ``id >= starting_id`` in ID
    order, runs ``process_triple`` on each row and stores the answer and its
    log-probabilities back on the row. After every batch the resume point is
    saved with ``save_updated_id``. The loop ends when a select returns no
    rows.

    Args:
        cursor: Open psycopg2 cursor; commits and rollbacks go through the
            connection it belongs to.
        starting_id: First record ID to process.
    """
    # ``conn`` is local to main() and not visible here; psycopg2 cursors
    # expose their owning connection as ``cursor.connection``.
    connection = cursor.connection

    while True:
        cursor.execute("""
            SELECT "id", "triple", "sentence"
            FROM public."tblBiomedicalFactcheck"
            WHERE "id" >= %s
            ORDER BY "id"
            LIMIT %s
        """, (starting_id, BATCH_SIZE))
        records = cursor.fetchall()

        if not records:
            print("No more records to process.")
            break

        for record_id, triple, sentence in records:
            answer, logprops_json = process_triple(triple, sentence)
            try:
                cursor.execute("""
                    UPDATE public."tblBiomedicalFactcheck"
                    SET "answer" = %s, "logprops" = %s
                    WHERE "id" = %s
                """, (answer, logprops_json, record_id))
            except Exception as e:
                # Best effort: log the failure, undo the failed statement and
                # continue with the next record.
                print(f"Error updating record ID {record_id}: {e}")
                connection.rollback()  # Rollback in case of an error
            else:
                # Commit per record so that a later rollback cannot discard
                # updates that already succeeded in this batch.
                connection.commit()

        # Resume right after the last row actually fetched. Adding BATCH_SIZE
        # would skip rows whenever the IDs are not contiguous.
        starting_id = records[-1][0] + 1
        save_updated_id(starting_id)
        print(f"Processed up to ID: {starting_id}")
# Main processing function
def main():
    """Run the pipeline: read the resume point, connect, process, clean up.

    The cursor and the connection are closed even if processing raises; the
    completion message is printed only when processing finishes without an
    exception.
    """
    resume_from = read_starting_id()
    db_conn = create_db_connection()
    db_cursor = db_conn.cursor()
    try:
        set_statement_timeout(db_cursor)
        process_batches(db_cursor, resume_from)
    finally:
        # Close the cursor before the connection it belongs to.
        db_cursor.close()
        db_conn.close()
    print("Data processing completed successfully.")


# Run the pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()