-
Notifications
You must be signed in to change notification settings - Fork 358
/
Copy pathes_mapping.yml
145 lines (143 loc) · 3.84 KB
/
es_mapping.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
---
# CREATE DATABASE/TABLE equivalent for elasticsearch, in yaml
# for inline comments.
# Index-level settings: custom text analysis components plus shard/replica
# and default query-field configuration.
settings:
  analysis:
    analyzer:
      # Standard tokenizer + lowercase, no ngrams.
      # NOTE(review): presumably applied by the application at query time;
      # no mapping in this file references it - confirm against the caller.
      my_search_analyzer:
        type: custom
        tokenizer: standard
        char_filter:
          - my_char_filter
        filter:
          - lowercase
      # Index-time analyzer for display_name: emits edge ngrams (see the
      # my_ngram filter) so that prefixes of each word can be matched.
      my_index_analyzer:
        type: custom
        tokenizer: standard
        char_filter:
          - my_char_filter
        filter:
          - resolution
          - lowercase
          - word_delimit
          - my_ngram
          - trim_zero
          - unique
      # For exact matching - separate each character for substring
      # matching + lowercase
      exact_analyzer:
        tokenizer: exact_tokenizer
        filter:
          - lowercase
      # For matching full words longer than the ngram limit (15 chars)
      my_fullword_index_analyzer:
        type: custom
        tokenizer: standard
        char_filter:
          - my_char_filter
        filter:
          - lowercase
          - word_delimit
          # Skip tokens shorter than N characters,
          # since they're already indexed in the main field
          - fullword_min
          - unique

    tokenizer:
      # Splits input into characters, for exact substring matching
      exact_tokenizer:
        type: pattern
        pattern: "(.)"
        group: 1

    filter:
      # Edge ngrams of 1 to 15 characters per token.
      my_ngram:
        type: edge_ngram
        min_gram: 1
        max_gram: 15
      fullword_min:
        type: length
        # Must stay at my_ngram's max_gram + 1; remember to change this
        # if you change the max_gram above!
        min: 16
      # Captures both numbers of a WIDTHxHEIGHT style token.
      resolution:
        type: pattern_capture
        patterns: ["(\\d+)[xX](\\d+)"]
      # Captures the digits that follow any leading zeros.
      trim_zero:
        type: pattern_capture
        patterns: ["0*([0-9]*)"]
      word_delimit:
        type: word_delimiter_graph
        preserve_original: true
        split_on_numerics: false
        # adjust_offsets is disabled, see
        # https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-word-delimiter-graph-tokenfilter.html#word-delimiter-graph-tokenfilter-configure-parms
        # since we're using "trim" filters downstream, otherwise
        # you get weird lucene errors about startOffset
        adjust_offsets: false

    char_filter:
      # Rewrites "-" and "!" to "_", and "_" to a space (\u0020 escape).
      my_char_filter:
        type: mapping
        mappings: ["-=>_", "!=>_", "_=>\\u0020"]

  index:
    # we're running a single es node, so no sharding necessary,
    # plus replicas don't really help either.
    number_of_shards: 1
    number_of_replicas: 0
    query:
      # Field searched when a query does not name one explicitly.
      default_field: display_name
# Explicit field schema for indexed documents.
mappings:
  # disable elasticsearch's "helpful" autoschema
  dynamic: false
  properties:
    id:
      type: long
    display_name:
      # TODO could do a fancier tokenizer here to parse out the
      # scene convention of stuff in brackets, plus stuff like k-on
      type: text
      analyzer: my_index_analyzer
      # NOTE(review): fielddata on a text field is only needed for
      # sorting/aggregating/scripting on it - confirm whether any query
      # does that before removing.
      fielddata: true  # Is this required?
      fields:
        # Multi-field for full-word matching (when going over ngram limits)
        # Note: will have to be queried for, not automatic
        fullword:
          type: text
          analyzer: my_fullword_index_analyzer
        # Stored for exact phrase matching
        exact:
          type: text
          analyzer: exact_analyzer
    created_time:
      type: date

    # Only in the ES index for generating magnet links
    # (index: false, so it cannot be searched on)
    info_hash:
      type: keyword
      index: false
    filesize:
      type: long

    # Boolean flags
    anonymous:
      type: boolean
    trusted:
      type: boolean
    remake:
      type: boolean
    complete:
      type: boolean
    hidden:
      type: boolean
    deleted:
      type: boolean
    has_torrent:
      type: boolean

    # Counters
    download_count:
      type: long
    leech_count:
      type: long
    seed_count:
      type: long
    comment_count:
      type: long

    # these ids are really only for filtering, thus keyword
    uploader_id:
      type: keyword
    main_category_id:
      type: keyword
    sub_category_id:
      type: keyword