From 6ffd616280d99e016bb62ef7b1d0bc4c29cbc3ab Mon Sep 17 00:00:00 2001
From: pramitchoudhary
Date: Thu, 18 May 2023 15:18:11 -0700
Subject: [PATCH] Further adjust the prompts #1

---
 sidekick/configs/prompt_template.py | 44 ++++++++++++++---------------
 1 file changed, 21 insertions(+), 23 deletions(-)

diff --git a/sidekick/configs/prompt_template.py b/sidekick/configs/prompt_template.py
index 8fc74c3..ef3dd74 100644
--- a/sidekick/configs/prompt_template.py
+++ b/sidekick/configs/prompt_template.py
@@ -1,16 +1,16 @@
 TASK_PROMPT = {
     "system_prompt": "Act as a Data Analyst",
     "user_prompt": """
-        ### For Table: {_table_name} Given an input *Question*, only return specific and informative tasks as an ordered itemized list for SQL generation that answer the question.
-        Extract all of the proper nouns (generally capitalized, abbreviated) from the Samples section and add to Context section as Key, Value pair.
-        Use the Context section and Samples section to establish relationship when tokens from Question does not match column names.
-        If information is not found in Context or Samples section, attempt to reason for possible tasks but also ask questions for.
-        Infer the return type of the Question. Do not generate final SQL response, only return tasks.
-        # Data information: \n{_data_info}
-        # Samples: \n{_sample_queries}
-        # Context: {_context}
-        # *Question*: {_question_str};
-        # Output: Tasks: ordered list of tasks
+        ### Given an input *Question*, only return specific and informative tasks as an ordered numeric list for SQL generation that answer the question.
+        Use the *History* and *Context* section for co-reference and to infer relationships.
+        If the words in the *Question* do not match column names *Data* section; Search for them in *Context* section.
+        Always use *Context* with highest similarity score with the *Question*.
+        If no information related to the *Question* is found; attempt to predict and reason for possible tasks.
+        Infer the return type of the Question. Do not generate final complete SQL response, only return tasks.
+        # *Data:* \nFor table {_table_name} schema info is mentioned below,\n{_data_info}
+        # *History*: \n{_sample_queries}
+        # *Question*: For Table {_table_name}, {_question_str}, *Context*: {_context}
+        # Output: Tasks: ordered numeric list of tasks
     """,
 }
 
@@ -21,23 +21,21 @@
 # Reference: https://arxiv.org/pdf/2005.14165.pdf
 QUERY_PROMPT = """
 ### System: Act as a SQL Expert
-# Given an input *Question*, only generate syntactically correct SQL queries using step by step reasoning from Tasks section.
-# Extract all of the proper nouns (generally capitalized, abbreviated) from the Examples section and add to Context section as Key, Value pair.
-# Use the context section to establish relationship when tokens from Question does not match column names.
+# Given an input *Question*, only generate syntactically correct SQL queries using step by step reasoning from *Tasks* section.
 # Pick the SQL query which has the highest average log probability of explaining the
-candidate question.
+candidate *Question*.
 ### {dialect} SQL tables
-Examples:\n{_sample_queries}
-### *Question*: {_question};
+### *History*:\n{_sample_queries}
+### *Question*: {_question}
 # SELECT 1
-### Tasks:\n{_tasks}
-### Context: {_context}
-### Suggestions:
-# Don't use aggregate and window function together;
-# Avoid COUNT(*) and prefer COUNT(1);
+### *Tasks*:\n{_tasks}
+### *Policies for SQL generation*:
+# Avoid overly complex SQL queries
+# Don't use aggregate and window function together
+# Use COUNT(1) instead of COUNT(*)
 # Return with LIMIT 100
-# Prefer NOT EXISTS to LEFT JOIN ON null id;
-# Avoid using the WITH statement;
+# Prefer NOT EXISTS to LEFT JOIN ON null id
+# Avoid using the WITH statement
 # When using DESC keep NULLs at the end
 # If JSONB format found in Table schema, do pattern matching on keywords from the question
 # Add explanation
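For reviewers, a minimal sketch of how the revised templates might be rendered after this patch. The import path follows the file touched here; the table name, schema snippet, history string, context mapping, question, and task list are made-up placeholders for illustration, not values from this repository, and the real call sites elsewhere in sidekick are not part of this change.

```python
# Hypothetical usage sketch: fill the updated TASK_PROMPT / QUERY_PROMPT templates.
# All concrete values below are invented placeholders.
from sidekick.configs.prompt_template import QUERY_PROMPT, TASK_PROMPT

question = "What is the average sleep duration per stress level?"  # hypothetical question

# Planning prompt: asks the model for an ordered numeric list of tasks, not SQL.
task_prompt = TASK_PROMPT["user_prompt"].format(
    _table_name="sleep_health",                                      # hypothetical table
    _data_info="sleep_duration FLOAT, stress_level INTEGER",         # hypothetical schema info
    _sample_queries="SELECT AVG(sleep_duration) FROM sleep_health",  # hypothetical history
    _context="{'Sleep Duration': 'sleep_duration'}",                  # hypothetical context pairs
    _question_str=question,
)

# Generation prompt: turns the planned tasks into a single SQL query under the policies.
query_prompt = QUERY_PROMPT.format(
    dialect="postgres",
    _sample_queries="SELECT AVG(sleep_duration) FROM sleep_health",
    _question=question,
    _tasks="1. Group rows by stress_level\n2. Compute AVG(sleep_duration) per group",
)

print(task_prompt)
print(query_prompt)
```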