Fix and update the processing that uses external resources (AWD-J, JIWC)

sociocom · Mar 13, 2023 · 1655e9b
1 parent 9d21bda
commit 1655e9b
Show file tree

Hide file tree

Showing 2 changed files with 27 additions and 26 deletions.
diff --git a/README.md b/README.md
@@ -39,8 +39,8 @@ Specify the paths to the resources with the following options.
 次のオプションでリソースへのパスを指定してください。
 
 - `--sw`: Japanese stopwords (1 行 1 単語＝原形の形式のテキストファイル)
-- `--awd`: [日本語抽象度辞書 AWD-J](https://sociocom.naist.jp/awd-j/)
-- `--jiwc`: [日本語感情表現辞書 JIWC](https://sociocom.naist.jp/jiwc-dictionary/)
+- `--awd`: [日本語抽象度辞書 AWD-J](https://sociocom.naist.jp/awd-j/) `-EX` データを使用してください (e.g. [`AWD-J_EX.txt`](http://sociocom.jp/~data/2019-AWD-J/data/AWD-J_EX.txt))
+- `--jiwc`: [日本語感情表現辞書 JIWC](https://sociocom.naist.jp/jiwc-dictionary/) `-A` データを使用してください (e.g. [`JIWC-A_2019.csv`](https://github.com/sociocom/JIWC-Dictionary/blob/master/ver_2019/JIWC-A_2019.csv))
 
 ## Linguistic Measures / 言語指標
 

diff --git a/limco.py b/limco.py
@@ -1,5 +1,6 @@
 import re
 import unicodedata as ud
+import warnings
 from collections import Counter
 from typing import Optional, Union
 
@@ -137,7 +138,7 @@ def measure_pos(
     res.update(calc_ttrs(all_tokens))
     if awd:
         res.update(score_abstractness(all_tokens, awd))
-    if jiwc:
+    if jiwc is not None:
         res.update(score_jiwc(all_tokens, jiwc))
 
     return res
@@ -164,12 +165,15 @@ def calc_ttrs(tokens: list[str]) -> dict[str, Num]:
     }
 
 
-def score_abstractness(tokens: list[str], awd: dict[str, float]) -> dict[str, float]:
-    scores = [awd.get(token, 0.0) for token in tokens]
-    return {
-        "abst_top5_mean": np.mean(sorted(scores, reverse=True)[:5]),
-        "abst_max": max(scores),
-    }
+def score_abstractness(tokens: list[str], awds: dict[str, float]) -> dict[str, float]:
+    scores = [awds[token] for token in tokens if token in awds]
+    if scores:
+        return {
+            "abst_top5_mean": np.mean(sorted(scores, reverse=True)[:5]),
+            "abst_max": max(scores),
+        }
+    else:
+        return {"abst_top5_mean": np.nan, "abst_max": np.nan}
 
 
 def score_jiwc(tokens: list[str], df_jiwc: pd.DataFrame) -> dict[str, float]:
@@ -179,21 +183,7 @@ def score_jiwc(tokens: list[str], df_jiwc: pd.DataFrame) -> dict[str, float]:
     """
     jiwc_words = list(set(tokens) & set(df_jiwc.index))
     jiwc_vals = df_jiwc.loc[jiwc_words].sum()
-    return (
-        (jiwc_vals / jiwc_vals.sum())
-        .rename(
-            {
-                "Sad": "jiwc_sadness",
-                "Anx": "jiwc_anxiety",
-                "Anger": "jiwc_anger",
-                "Hate": "jiwc_hatrid",
-                "Trustful": "jiwc_trust",
-                "S": "jiwc_surprise",
-                "Happy": "jiwc_happiness",
-            }
-        )
-        .to_dict()
-    )
+    return (jiwc_vals / jiwc_vals.sum()).to_dict()
 
 
 def count_taigendome(doc: spacy.tokens.Doc) -> int:
@@ -355,13 +345,22 @@ def from_file(
 
     if awd:
         with open(awd, "r") as f:
+            #  AWD TSV format: word, score, deviation, pos
             rows = [line.strip().split("\t") for line in f]
+            rows.pop(0)  # remove header
             awds = {word: float(score) for word, score, _, _ in rows}
     else:
         awds = {}
 
     if jiwc:
-        df_jiwc = pd.read_csv(jiwc, index_col=1).drop(columns="Unnamed: 0")
+        df_jiwc = (
+            pd.read_csv(jiwc)
+            .set_index("Words")
+            .rename(
+                columns=lambda x: "jiwc_" + x.lower(),
+            )
+        )
+
     else:
         df_jiwc = None
 
@@ -374,7 +373,9 @@ def from_file(
 
 
 def main():
-    fire.Fire(from_file)
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        fire.Fire(from_file)
 
 
 if __name__ == "__main__":