diff --git a/analyse_parseddoc.py b/analyse_parseddoc.py
deleted file mode 100644
index e063613..0000000
--- a/analyse_parseddoc.py
+++ /dev/null
@@ -1,152 +0,0 @@
-# import numpy as np
-from typing import List
-import fire
-import pandas as pd
-
-
-class Chunk:
-    def __init__(self):
-        self.morphs = []
-        self.dst = -1
-        self.srcs = []
-
-    # def print_all(self):
-    #     return self.morphs + "\t" + self.dst + ", " + self.srcs
-
-    def __repr__(self):
-        if self.morphs:
-            surfs = [morph.surface for morph in self.morphs if morph.pos != "記号"]
-            return "".format("|".join(surfs))
-        else:
-            return ""
-
-    def include_pos(self, pos):
-        return pos in [morph.pos for morph in self.morphs]
-
-    def morphs_of_pos(self, pos):
-        return [morph for morph in self.morphs if morph.pos == pos]
-
-    def morphs_of_pos1(self, pos1):
-        return [morph for morph in self.morphs if morph.pos1 == pos1]
-
-
-class Morph:
-    def __init__(self, surface, base, pos, pos1):
-        self.surface = surface
-        self.base = base
-        self.pos = pos
-        self.pos1 = pos1
-
-    def __repr__(self):
-        return "".format(
-            self.surface + "\t" + self.base + ", " + self.pos + ", " + self.pos1
-        )
-
-
-def read_chunks(cabochafile):
-    sentences = []
-    sentence: List[Chunk] = []
-    for line in cabochafile:
-        if line == "EOS\n":
-            for i, c in enumerate(sentence[:-1]):
-                if c.dst != -1:
-                    sentence[c.dst].srcs.append(i)
-                    # 係り元は再帰的にとらない
-
-            sentences.append(sentence)
-            # shallow/deep copy
-            # del sentence[:] # 参照
-            sentence = []
-        elif line[0] == "*":
-            # sentence.append(line)
-            # if len(sentence) > 0:
-            #     sentence.append(chunk)
-
-            chunk = Chunk()
-            chunk.dst = int(line.split(" ")[2].strip("D"))
-            sentence.append(chunk)
-        else:
-            surface, feature = line.split("\t")
-            features = feature.split(",")
-            morph = Morph(surface, features[6], features[0], features[1])
-            sentence[-1].morphs.append(morph)
-
-    return sentences
-
-
-def count_chunk_depth(ix, sentchunk):
-    if sentchunk[ix].srcs:
-        return max(
-            [count_chunk_depth(src, sentchunk) + 1 for src in sentchunk[ix].srcs]
-        )
-    else:
-        return 0
-
-
-def count_sent_depth(sentchunk):
-    if len(sentchunk) == 0:
-        return 0
-    else:
-        root_i = [c.dst for c in sentchunk].index(-1)
-        return count_chunk_depth(root_i, sentchunk)
-
-
-def analyse_dep(cfname: str, fname: str = None) -> None:
-    """Apply dependency tree analyses and concat the original data"""
-    with open(cfname, "r") as f:
-        chunk_sents = read_chunks(f)
-
-    docs = []
-    doc = []
-    for chunk_sent in chunk_sents:
-        if chunk_sent:
-            doc.append(chunk_sent)
-        else:
-            docs.append(doc)
-            doc = []
-
-    sr_depths = []
-    sr_leaves = []
-    sr_chunklen = []
-    for doc in docs:
-        depths = [count_sent_depth(sentchunk) for sentchunk in doc]
-        sr_depths.append(pd.Series(depths).describe().to_frame().T)
-
-        n_leaves = [len(sentchunk) for sentchunk in doc]
-        sr_leaves.append(pd.Series(n_leaves).describe().to_frame().T)
-
-        chunklen = [len(chunk.morphs) for sentchunk in doc for chunk in sentchunk]
-        sr_chunklen.append(pd.Series(chunklen).describe().to_frame().T)
-
-    # 構文木の深さ
-    df_sdep = (
-        pd.concat(sr_depths)
-        .reset_index(drop=True)
-        .rename(columns=lambda x: f"sdep_{x}")
-    )
-    # 構文木の葉の数(文節数)
-    df_nleaf = (
-        pd.concat(sr_leaves)
-        .reset_index(drop=True)
-        .rename(columns=lambda x: f"nleaf_{x}")
-    )
-    # 文節の長さ(形態素数)
-    df_chklen = (
-        pd.concat(sr_chunklen)
-        .reset_index(drop=True)
-        .rename(columns=lambda x: f"chklen_{x}")
-    )
-
-    if fname:
-        df = pd.read_csv(fname)
-        assert len(df) == len(df_sdep)
-        pd.concat([df, df_sdep, df_nleaf, df_chklen], axis=1).to_csv(
-            f"{fname}.parsed.csv", index=False
-        )
-    pd.concat([df_sdep, df_nleaf, df_chklen], axis=1).to_csv(
-        f"{cfname}.parsed.csv", index=False
-    )
-
-
-if __name__ == "__main__":
-    fire.Fire(analyse_dep)
diff --git a/measure_lang.py b/limco.py
similarity index 100%
rename from measure_lang.py
rename to limco.py
diff --git a/test_measure_lang.py b/test_limco.py
similarity index 80%
rename from test_measure_lang.py
rename to test_limco.py
index ec8fcd1..f800986 100644
--- a/test_measure_lang.py
+++ b/test_limco.py
@@ -1,7 +1,7 @@
 import pandas as pd
 import pytest
 
-import measure_lang as ml
+import limco
 
 TEXT = """ここは駅から程よい距離にある日本の住宅街である。
 「びっくり!」
@@ -11,27 +11,27 @@
 しかも、街の北にある中学校でさえも2クラスになろうとしている。『結構小さい街なんだね。』
 だが、いつもこの街にあるスーパーで毎年行われる納涼祭はとても盛り上がり、この街だけではおさまらず、他のところから来ている人も多数いる。
 """
-DOC = ml.NLP(ml.normalise(TEXT))
+DOC = limco.NLP(limco.normalise(TEXT))
 
 
 def test_count_charcat():
     text = "あれとコレと竜巻."
-    assert ml.count_charcat(text) == {"hiragana": 4, "katakana": 2, "kanji": 2}
+    assert limco.count_charcat(text) == {"hiragana": 4, "katakana": 2, "kanji": 2}
 
 
 def test_count_conversations():
-    assert ml.count_conversations(TEXT) == {"single": 1, "double": 1}
+    assert limco.count_conversations(TEXT) == {"single": 1, "double": 1}
 
 
 def test_describe_sentence_lengths():
-    assert list(ml.describe_sentence_lengths(DOC).values()) == pytest.approx(
+    assert list(limco.describe_sentence_lengths(DOC).values()) == pytest.approx(
         [31.857143, 21.341888, 7.0, 18.5, 28.0, 42.0, 67.0]
     )
 
 
 def test_calc_ttrs():
     # FIXME: 数式との一致を確認する
-    assert ml.calc_ttrs(
+    assert limco.calc_ttrs(
         ["今日", "明日", "月曜日", "明るい", "明るい", "今日"]
     ).values() == pytest.approx(
         [
@@ -57,7 +57,7 @@ def test_score_abstractness():
         "納涼祭": 1.0,
     }
     assert list(
-        ml.score_abstractness(list(awd.keys()) + ["明日", "今日"], awd).values()
+        limco.score_abstractness(list(awd.keys()) + ["明日", "今日"], awd).values()
    ) == pytest.approx([2.64, 5.0])
 
 
@@ -70,11 +70,11 @@ def test_score_jiwc():
         ],
         columns=["word", "Sad", "Anx", "Anger", "hate", "Trustful", "S", "Happy"],
     ).set_index("word")
-    res = list(ml.score_jiwc(["明日", "感謝", "大きい"], df_jiwc).values())
+    res = list(limco.score_jiwc(["明日", "感謝", "大きい"], df_jiwc).values())
     assert res == pytest.approx(
         [0.04784689, 0.0, 0.0, 0.09569377, 0.526316, 0.0, 0.3301435]
     )
 
 
 def test_count_taigendome():
-    assert ml.count_taigendome(DOC) == 1
+    assert limco.count_taigendome(DOC) == 1
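
For reference, a minimal usage sketch of the renamed limco module (not part of the diff above), based only on the calls exercised in test_limco.py; the sample string and the behaviour noted in the comments are assumptions drawn from those tests, not from limco's documentation.

import limco

# Normalise raw text and run it through the module-level NLP pipeline,
# mirroring how test_limco.py builds its DOC fixture.
text = limco.normalise("ここは駅から程よい距離にある日本の住宅街である。「びっくり!」")
doc = limco.NLP(text)

print(limco.count_charcat(text))             # per-script character counts (hiragana/katakana/kanji), per test_count_charcat
print(limco.count_conversations(text))       # quoted-speech counts for 「...」 and 『...』, per test_count_conversations
print(limco.describe_sentence_lengths(doc))  # summary statistics over sentence lengths, per test_describe_sentence_lengths
print(limco.count_taigendome(doc))           # number of noun-ending (体言止め) sentences, per test_count_taigendome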