Skip to content

Commit

Permalink
Merge pull request #38 from codeforjapan/feature/37-topic-model-poc
Browse files Browse the repository at this point in the history
Feature/37 topic model poc
  • Loading branch information
osoken authored Feb 24, 2024
2 parents 51f7684 + 43a47e8 commit 3cad33b
Show file tree
Hide file tree
Showing 4 changed files with 95 additions and 1 deletion.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -160,3 +160,5 @@ cython_debug/
#.idea/

.vscode/settings.json

.DS_Store
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@ dependencies = [
"sqlalchemy",
"pydantic_settings",
"fastapi",
-"JSON-log-formatter"
+"JSON-log-formatter",
+"openai"
]

[project.urls]
Expand Down
84 changes: 84 additions & 0 deletions scripts/add_topic_poc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
import json
import os
from argparse import ArgumentParser
from typing import Dict, List

from dotenv import load_dotenv
from openai import OpenAI

# Example of the JSON response shape, interpolated verbatim into the system
# prompt built by get_topic. The top-level key is a note id; its value holds
# the "topics" list and the "language" code.
response_sample = """
{
"1700958646329": {
"topics": ["医療", "福祉", "政治"],
"language": "en"
}
}
"""

def _strip_code_fence(text: str) -> str:
    """Return *text* without a surrounding Markdown code fence, if present.

    The prompt shows its example response inside a ``` fence, so the model may
    answer the same way (optionally with a language tag such as ```json).
    """
    stripped = text.strip()
    if len(stripped) < 6 or not (stripped.startswith("```") and stripped.endswith("```")):
        return stripped
    inner = stripped[3:-3]
    first_line, newline, rest = inner.partition("\n")
    tag = first_line.strip()
    # Drop the opening fence line when it is empty or only a language tag.
    if newline and (not tag or tag.isalnum()):
        inner = rest
    return inner.strip()


def get_topic(client: OpenAI, note_id: int, tweet: str, note: str) -> Dict[str, dict]:
    """Ask the chat model for topics and language of one tweet/note pair.

    A one-shot example is read from ``fewshot_sample.json`` located next to
    this script and embedded in the system prompt together with
    ``response_sample``.

    Args:
        client: OpenAI client used to issue the chat completion request.
        note_id: Identifier of the community note; sent in the user message.
        tweet: Body text of the tweet.
        note: Body text of the community note.

    Returns:
        The model's reply parsed as JSON. The prompt asks for an object keyed
        by the note id whose value holds ``topics`` and ``language``; the
        reply is not validated against that shape.

    Raises:
        ValueError: If the model returned no text content.
        json.JSONDecodeError: If the reply is not valid JSON.
    """
    print(f"note id: {note_id}")
    fewshot_path = os.path.join(os.path.dirname(__file__), "fewshot_sample.json")
    # The sample contains non-ASCII (Japanese) text, so do not rely on the
    # platform's default encoding.
    with open(fewshot_path, "r", encoding="utf-8") as f:
        fewshot_sample = json.load(f)

    system_prompt = f"""以下はツイートと、それに追加されたコミュニティノートです。
ツイート:
```
{fewshot_sample["tweet"]}
```
コミュニティノート:
```
{fewshot_sample["note"]}
```
このセットに対してのトピックは「{" ".join(fewshot_sample["topics"])}」です。
これを踏まえて、以下のセットに対して同じ粒度で複数のトピック(少なくとも3つ)を提示してください。形式はJSONで、キーをtopicsとして値にトピックを配列で格納してください。また、ツイートに用いられている言語も推定し、キーをlanguageとしてiso 639-1に準拠した言語コードを格納してください。topicsとlanguageを格納するオブジェクトはnote idをキーとした値に格納してください
レスポンスの例 (1700958646329はnote id):
```
{response_sample}
```
"""
    user_prompt = f"""
note id: {note_id}
ツイート:
```
{tweet}
```
コミュニティノート:
```
{note}
```
"""

    chat_completion = client.chat.completions.create(
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        model="gpt-3.5-turbo",
        temperature=0.0,
    )

    content = chat_completion.choices[0].message.content
    if content is None:
        # Fail with a message that names the note instead of an opaque
        # TypeError from json.loads(None).
        raise ValueError(f"model returned no text content for note id {note_id}")
    return json.loads(_strip_code_fence(content))

if __name__ == "__main__":
    # Usage: add_topic_poc.py INPUT_FILE OUTPUT_FILE
    # INPUT_FILE is a JSON array of objects with "noteId", "tweetBody" and
    # "noteBody"; OUTPUT_FILE receives a JSON array with one get_topic result
    # per input note, in input order.
    parser = ArgumentParser()
    parser.add_argument("input_file")
    parser.add_argument("output_file")
    args = parser.parse_args()

    # Load variables from a .env file before the client is constructed.
    load_dotenv()
    client = OpenAI()

    # Read explicitly as UTF-8 to match how the output is written.
    with open(args.input_file, "r", encoding="utf-8") as f:
        notes = json.load(f)

    # Collect every result before opening the output file, so a failed API
    # call does not leave a truncated or empty output file behind.
    results = [
        get_topic(client, note["noteId"], note["tweetBody"], note["noteBody"])
        for note in notes
    ]

    with open(args.output_file, "w", encoding="utf-8") as f:
        json.dump(
            results,
            f,
            ensure_ascii=False,
            indent=2,
        )

7 changes: 7 additions & 0 deletions scripts/fewshot_sample.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{
"tweet": "For those that care — 432 hz improves mental clarity, removes emotional blockages, reduces stress and anxiety, better sleep quality, increases creativity & inspiration, and strengthens the immune system. Play it while you sleep & watch these areas improve!",
"note": "There are no placebo controlled studies which support this. There is no evidence that this frequency has different effects from any other arbitrary frequency. https://ask.audio/articles/music-theory-432-hz-tuning-separating-fact-from-fiction",
"topics": [
"医療", "福祉"
]
}

0 comments on commit 3cad33b

Please sign in to comment.