-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
118 lines (103 loc) · 5.28 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import logging
from argparse import ArgumentParser, Namespace
from pathlib import Path
from src import commands
from src.methods import clemencio, transformer
def _add_preprocess_parser(subparsers) -> None:
    """Register the ``preprocess`` subcommand on *subparsers*."""
    parser_preprocess = subparsers.add_parser('preprocess')
    parser_preprocess.add_argument('--input', '-i',
                                   help='Corpus file in JSON format.',
                                   required=True, type=Path)
    parser_preprocess.add_argument('--output', '-o',
                                   help='File path to save preprocessed data in JSON format.',
                                   type=Path)
    parser_preprocess.set_defaults(command=commands.preprocess)


def _add_feature_extraction_parser(subparsers) -> None:
    """Register the ``feature-extraction`` (alias ``feat``) subcommand."""
    parser_feature = subparsers.add_parser('feature-extraction',
                                           aliases=['feat'])
    parser_feature.add_argument('--input', '-i',
                                help='Preprocessed corpus in JSON format.',
                                required=True, type=Path)
    parser_feature.add_argument('--output', '-o',
                                help='Directory path to save count models and feature matrix.',
                                type=Path)
    parser_feature.add_argument('--tfidf',
                                help='Flag to use TF-IDF counts',
                                action='store_true')
    parser_feature.add_argument('--max_tfidf',
                                help='Maximum number of TF-IDF features to consider',
                                type=int, default=1000)
    parser_feature.add_argument('--vectorizer',
                                help='Path to the pickled TfIdfVectorizer to use for TF-IDF counts',
                                type=Path)
    parser_feature.add_argument('--ngram', '-n',
                                help='Which n-gram configuration to use to calculate the TF-IDF counts',
                                type=str,
                                choices=['1', '2', '3',
                                         '1+2', '2+3',
                                         '1+2+3'],
                                default='1+2+3')
    parser_feature.add_argument('--sentlex',
                                help='Path to the sentiment lexicon in JSON format.',
                                type=Path)
    parser_feature.add_argument('--slang',
                                help='Path to the slang lexicon in JSON format.',
                                type=Path)
    parser_feature.add_argument('--alliteration',
                                help='Flag to use alliteration features',
                                action='store_true')
    parser_feature.add_argument('--antonym',
                                help='Path to the antonym triples lexicon in JSON format.',
                                type=Path)
    parser_feature.add_argument('--embeddings',
                                help='Path to the word embeddings file in Gensim format.',
                                type=Path)
    parser_feature.add_argument('--mwp',
                                help=('Path to the Minho Word Pool lexicon for imageability '
                                      'and concreteness features in JSON format.'),
                                type=Path)
    parser_feature.add_argument('--ner',
                                help='Flag to use NER features',
                                action='store_true')
    parser_feature.add_argument('--ambiguity',
                                help='Flag to use ambiguity features from OpenWordNet-PT',
                                action='store_true')
    parser_feature.set_defaults(command=commands.feature_extraction)


def parse_args() -> Namespace:
    """Build the command-line interface and parse ``sys.argv``.

    The top-level parser takes a repeatable ``--verbose``/``-v`` counter
    (it belongs to the top-level parser, so it is given before the
    subcommand name) followed by a mandatory subcommand.  The
    ``preprocess`` and ``feature-extraction`` subcommands store their
    handler from ``commands`` in ``command`` via ``set_defaults``; that
    subparser default replaces the subcommand name argparse would
    otherwise store under ``dest='command'``.

    Returns:
        The parsed arguments.  ``verbose`` holds the number of ``-v``
        flags (0 when absent).

    Raises:
        SystemExit: Raised by argparse on invalid arguments, on
            ``--help``, or when no subcommand is given.
    """
    parser = ArgumentParser()
    # required=True: without a subcommand ``command`` would stay None and
    # the caller would fail with "'NoneType' object is not callable";
    # argparse now reports a proper usage error instead.
    subparsers = parser.add_subparsers(dest='command', required=True)
    parser.add_argument('--verbose', '-v',
                        action='count', default=0,
                        help='Print information and debugging messages')

    _add_preprocess_parser(subparsers)
    _add_feature_extraction_parser(subparsers)

    # Method parsers: each method module receives the subparsers object,
    # presumably to register its own subcommand(s) -- see the respective
    # ``add_parser`` implementations.
    clemencio.add_parser(subparsers)
    transformer.add_parser(subparsers)

    return parser.parse_args()
def config_logger(verbose_level: int) -> None:
    """Configure the module-wide ``HumorRecognitionPT`` logger.

    Binds the module-level global ``logger`` to the logger named
    ``'HumorRecognitionPT'``, sets its level from the verbosity count and
    makes sure it has a stream handler with a timestamped format.

    Args:
        verbose_level: Number of ``-v`` flags.  ``1`` selects ``INFO``,
            anything greater selects ``DEBUG``.  For other values the
            logger's level is left untouched.
    """
    global logger
    logger = logging.getLogger('HumorRecognitionPT')

    if verbose_level == 1:
        logger.setLevel(logging.INFO)
    elif verbose_level > 1:
        logger.setLevel(logging.DEBUG)

    # getLogger returns the same object for the same name, so attaching a
    # new handler on every call would emit each record once per call made.
    # Only install the handler when the logger does not have one yet.
    if not logger.handlers:
        ch = logging.StreamHandler()
        formatter = logging.Formatter('%(asctime)s - %(levelname)s: %(message)s')
        ch.setFormatter(formatter)
        logger.addHandler(ch)
def main(args):
    """Run the handler selected on the command line.

    ``args.command`` is looked up and called with ``args`` itself as its
    only argument.  Nothing is returned.
    """
    run_command = args.command
    run_command(args)
if __name__ == '__main__':
    # Script entry point: parse the CLI first so that the verbosity count
    # is known, set up logging from it, then dispatch to the subcommand.
    args = parse_args()
    config_logger(args.verbose)
    main(args)