diff --git a/wren-ai-service/eval/metrics/llm/__init__.py b/wren-ai-service/eval/metrics/llm/__init__.py
index 4aac415ee..068247d25 100644
--- a/wren-ai-service/eval/metrics/llm/__init__.py
+++ b/wren-ai-service/eval/metrics/llm/__init__.py
@@ -31,14 +31,15 @@ def format(response: dict) -> EvalResult:
 
 class QuestionCoherenceJudge(BaseMetric):
     _system_prompt = """
-    You are an expert evaluator. Your task is to analyze the reasoning provided for a given question and determine if it makes sense. Provide a score and a detailed explanation for your evaluation.
+    You are an expert evaluator. Your task is to analyze the reasoning provided for a given question and determine if it makes sense.
+    Provide a score in the range 0.0~1.0 and a detailed explanation for your evaluation.
     """
     _test_case_prompt = """
     Question:
-    {question}
-    
+    {{ question }}
+
     Reasoning:
-    {reasoning}
+    {{ reasoning }}
     """
 
     def __init__(self, llm_provider: LLMProvider, **_):
@@ -78,14 +79,15 @@ def __name__(self):
 
 class ReasoningValidityJudge(BaseMetric):
     _system_prompt = """
-    You are an expert evaluator. Your task is to analyze the reasoning provided for a given SQL query and determine if it makes sense. Provide a score and a detailed explanation for your evaluation.
+    You are an expert evaluator. Your task is to analyze the reasoning provided for a given SQL query and determine if it makes sense.
+    Provide a score in the range 0.0~1.0 and a detailed explanation for your evaluation.
     """
     _test_case_prompt = """
     Actual Output:
-    {actual_output}
+    {{ actual_output }}
 
     Reasoning:
-    {reasoning}
+    {{ reasoning }}
     """
 
     def __init__(self, llm_provider: LLMProvider, **_):
@@ -125,14 +127,15 @@ def __name__(self):
 
 class SqlSemanticsJudge(BaseMetric):
     _system_prompt = """
-    You are an expert evaluator. Your task is to analyze the actual SQL query and the expected SQL query and determine if they are semantically equivalent. Provide a score and a detailed explanation for your evaluation.
+    You are an expert evaluator. Your task is to analyze the actual SQL query and the expected SQL query and determine if they are semantically equivalent.
+    Provide a score in the range 0.0~1.0 and a detailed explanation for your evaluation.
    """
     _test_case_prompt = """
     Actual SQL:
-    {actual_sql}
+    {{ actual_sql }}
 
     Expected SQL:
-    {expected_sql}
+    {{ expected_sql }}
     """
 
     def __init__(self, llm_provider: LLMProvider, **_):
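
Note (not part of the patch): changing the placeholders from "{question}" to "{{ question }}" moves the prompt templates from Python str.format()-style fields to Jinja2-style fields. The rendering code is not shown in these hunks, so the snippet below is only a minimal sketch, assuming the templates are filled in with Jinja2; the prompt values are hypothetical.

from jinja2 import Template

_test_case_prompt = """
Question:
{{ question }}

Reasoning:
{{ reasoning }}
"""

# Render the per-test-case user prompt. The field names match the template
# above; the values are made-up examples.
user_prompt = Template(_test_case_prompt).render(
    question="How many orders were shipped in 2023?",
    reasoning="Filter the orders table on the ship year and count the rows.",
)
print(user_prompt)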