mirror of
https://github.com/sb745/NyaaV3.git
synced 2025-03-12 22:06:55 +02:00

Since the main display_name field ngrams words up to 15 characters, anything to and under that will already be indexed - the fullword field (which we have for words longer than 15 characters) needs to index only words longer than that.
129 lines
No EOL
3.3 KiB
YAML
129 lines
No EOL
3.3 KiB
YAML
---
|
|
# CREATE DTABASE/TABLE equivalent for elasticsearch, in yaml
|
|
# fo inline comments.
|
|
settings:
|
|
analysis:
|
|
analyzer:
|
|
my_search_analyzer:
|
|
type: custom
|
|
tokenizer: standard
|
|
char_filter:
|
|
- my_char_filter
|
|
filter:
|
|
- standard
|
|
- lowercase
|
|
my_index_analyzer:
|
|
type: custom
|
|
tokenizer: standard
|
|
char_filter:
|
|
- my_char_filter
|
|
filter:
|
|
- resolution
|
|
- lowercase
|
|
- my_ngram
|
|
- word_delimit
|
|
- trim_zero
|
|
# For matching full words longer than the ngram limit (15 chars)
|
|
my_fullword_index_analyzer:
|
|
type: custom
|
|
tokenizer: standard
|
|
char_filter:
|
|
- my_char_filter
|
|
filter:
|
|
- lowercase
|
|
- word_delimit
|
|
# Skip tokens shorter than N characters,
|
|
# since they're already indexed in the main field
|
|
- fullword_min
|
|
|
|
filter:
|
|
my_ngram:
|
|
type: edgeNGram
|
|
min_gram: 1
|
|
max_gram: 15
|
|
fullword_min:
|
|
type: length
|
|
# Remember to change this if you change the max_gram below!
|
|
min: 16
|
|
resolution:
|
|
type: pattern_capture
|
|
patterns: ["(\\d+)[xX](\\d+)"]
|
|
trim_zero:
|
|
type: pattern_capture
|
|
patterns: ["0*([0-9]*)"]
|
|
word_delimit:
|
|
type: word_delimiter
|
|
preserve_original: true
|
|
split_on_numerics: false
|
|
char_filter:
|
|
my_char_filter:
|
|
type: mapping
|
|
mappings: ["-=>_", "!=>_", "_=>\\u0020"]
|
|
index:
|
|
# we're running a single es node, so no sharding necessary,
|
|
# plus replicas don't really help either.
|
|
number_of_shards: 1
|
|
number_of_replicas : 0
|
|
mapper:
|
|
# disable elasticsearch's "helpful" autoschema
|
|
dynamic: false
|
|
# since we disabled the _all field, default query the
|
|
# name of the torrent.
|
|
query:
|
|
default_field: display_name
|
|
mappings:
|
|
torrent:
|
|
# don't want everything concatenated
|
|
_all:
|
|
enabled: false
|
|
properties:
|
|
id:
|
|
type: long
|
|
display_name:
|
|
# TODO could do a fancier tokenizer here to parse out the
|
|
# the scene convention of stuff in brackets, plus stuff like k-on
|
|
type: text
|
|
analyzer: my_index_analyzer
|
|
fielddata: true # Is this required?
|
|
fields:
|
|
# Multi-field for full-word matching (when going over ngram limits)
|
|
# Note: will have to be queried for, not automatic
|
|
fullword:
|
|
type: text
|
|
analyzer: my_fullword_index_analyzer
|
|
created_time:
|
|
type: date
|
|
# Only in the ES index for generating magnet links
|
|
info_hash:
|
|
enabled: false
|
|
filesize:
|
|
type: long
|
|
anonymous:
|
|
type: boolean
|
|
trusted:
|
|
type: boolean
|
|
remake:
|
|
type: boolean
|
|
complete:
|
|
type: boolean
|
|
hidden:
|
|
type: boolean
|
|
deleted:
|
|
type: boolean
|
|
has_torrent:
|
|
type: boolean
|
|
download_count:
|
|
type: long
|
|
leech_count:
|
|
type: long
|
|
seed_count:
|
|
type: long
|
|
comment_count:
|
|
type: long
|
|
# these ids are really only for filtering, thus keyword
|
|
uploader_id:
|
|
type: keyword
|
|
main_category_id:
|
|
type: keyword
|
|
sub_category_id:
|
|
type: keyword |