Skip to content

Commit

Permalink
fix and update procs using external resources
Browse files Browse the repository at this point in the history
  • Loading branch information
shuntaroy committed Mar 13, 2023
1 parent 9d21bda commit 1655e9b
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 26 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,8 @@ Specify the paths to the resources with the following options.
次のオプションでリソースへのパスを指定してください。

- `--sw`: Japanese stopwords (1 行 1 単語=原形の形式のテキストファイル)
- `--awd`: [日本語抽象度辞書 AWD-J](https://sociocom.naist.jp/awd-j/)
- `--jiwc`: [日本語感情表現辞書 JIWC](https://sociocom.naist.jp/jiwc-dictionary/)
- `--awd`: [日本語抽象度辞書 AWD-J](https://sociocom.naist.jp/awd-j/) `-EX` データを使用してください (e.g. [`AWD-J_EX.txt`](http://sociocom.jp/~data/2019-AWD-J/data/AWD-J_EX.txt))
- `--jiwc`: [日本語感情表現辞書 JIWC](https://sociocom.naist.jp/jiwc-dictionary/) `-A` データを使用してください (e.g. [`JIWC-A_2019.csv`](https://github.com/sociocom/JIWC-Dictionary/blob/master/ver_2019/JIWC-A_2019.csv))

## Linguistic Measures / 言語指標

Expand Down
49 changes: 25 additions & 24 deletions limco.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import re
import unicodedata as ud
import warnings
from collections import Counter
from typing import Optional, Union

Expand Down Expand Up @@ -137,7 +138,7 @@ def measure_pos(
res.update(calc_ttrs(all_tokens))
if awd:
res.update(score_abstractness(all_tokens, awd))
if jiwc:
if jiwc is not None:
res.update(score_jiwc(all_tokens, jiwc))

return res
Expand All @@ -164,12 +165,15 @@ def calc_ttrs(tokens: list[str]) -> dict[str, Num]:
}


def score_abstractness(tokens: list[str], awd: dict[str, float]) -> dict[str, float]:
    """Summarize the abstractness scores of a token sequence.

    Args:
        tokens: Tokens to look up.
        awd: Mapping from word to abstractness score. A token that is not
            a key of this mapping is scored as 0.0.

    Returns:
        A dict with two entries:
        ``"abst_top5_mean"`` is the mean of the five highest scores (or of
        all scores when there are fewer than five), and ``"abst_max"`` is
        the highest score. Both are NaN when ``tokens`` is empty.
    """
    scores = [awd.get(token, 0.0) for token in tokens]
    if not scores:
        # max() raises ValueError on an empty sequence and np.mean([])
        # emits a RuntimeWarning, so report "no data" explicitly instead.
        return {"abst_top5_mean": np.nan, "abst_max": np.nan}
    return {
        "abst_top5_mean": np.mean(sorted(scores, reverse=True)[:5]),
        "abst_max": max(scores),
    }
def score_abstractness(tokens: list[str], awds: dict[str, float]) -> dict[str, float]:
    """Summarize the abstractness scores of the tokens found in ``awds``.

    Args:
        tokens: Tokens to look up.
        awds: Mapping from word to abstractness score. Tokens that are not
            keys of this mapping are ignored.

    Returns:
        A dict with two entries:
        ``"abst_top5_mean"`` is the mean of the five highest scores (or of
        all scores when there are fewer than five), and ``"abst_max"`` is
        the highest score. Both are NaN when no token is found in ``awds``.
    """
    scores = [awds[token] for token in tokens if token in awds]
    if not scores:
        # Nothing matched the dictionary: there is no score to aggregate.
        return {"abst_top5_mean": np.nan, "abst_max": np.nan}
    return {
        "abst_top5_mean": np.mean(sorted(scores, reverse=True)[:5]),
        "abst_max": max(scores),
    }


def score_jiwc(tokens: list[str], df_jiwc: pd.DataFrame) -> dict[str, float]:
Expand All @@ -179,21 +183,7 @@ def score_jiwc(tokens: list[str], df_jiwc: pd.DataFrame) -> dict[str, float]:
"""
jiwc_words = list(set(tokens) & set(df_jiwc.index))
jiwc_vals = df_jiwc.loc[jiwc_words].sum()
return (
(jiwc_vals / jiwc_vals.sum())
.rename(
{
"Sad": "jiwc_sadness",
"Anx": "jiwc_anxiety",
"Anger": "jiwc_anger",
"Hate": "jiwc_hatrid",
"Trustful": "jiwc_trust",
"S": "jiwc_surprise",
"Happy": "jiwc_happiness",
}
)
.to_dict()
)
return (jiwc_vals / jiwc_vals.sum()).to_dict()


def count_taigendome(doc: spacy.tokens.Doc) -> int:
Expand Down Expand Up @@ -355,13 +345,22 @@ def from_file(

if awd:
with open(awd, "r") as f:
# AWD TSV format: word, score, deviation, pos
rows = [line.strip().split("\t") for line in f]
rows.pop(0) # remove header
awds = {word: float(score) for word, score, _, _ in rows}
else:
awds = {}

if jiwc:
df_jiwc = pd.read_csv(jiwc, index_col=1).drop(columns="Unnamed: 0")
df_jiwc = (
pd.read_csv(jiwc)
.set_index("Words")
.rename(
columns=lambda x: "jiwc_" + x.lower(),
)
)

else:
df_jiwc = None

Expand All @@ -374,7 +373,9 @@ def from_file(


def main():
    """Command-line entry point: expose ``from_file`` through ``fire``.

    All warnings raised while the command runs are suppressed; the filter
    is scoped to this call by ``warnings.catch_warnings`` and is restored
    when the block exits.
    """
    with warnings.catch_warnings():
        # Blanket "ignore": hides every warning category for the CLI run.
        warnings.simplefilter("ignore")
        fire.Fire(from_file)


if __name__ == "__main__":
Expand Down

0 comments on commit 1655e9b

Please sign in to comment.