mirror of
				https://github.com/sb745/NyaaV3.git
				synced 2025-10-31 16:05:46 +02:00 
			
		
		
		
	[ES Change] Improve Elasticsearch term quoting (#473)
* Optimize Elasticsearch fullword field: Since the main display_name field ngrams words up to 15 characters, any word of that length or shorter will already be indexed, so the fullword field (which we have for words longer than 15 characters) needs to index only words longer than that. * Preprocess ES terms for better literal matching: This commit adds a new .exact subfield to display_name, which holds a barely-filtered version of the original title that we can do "literal" matching against. This is not real substring matching, but quoting terms now actually does something! It also implements a simple preprocessor that extracts quoted parts from the search terms, optionally prefixed with - to negate them. The preprocessor creates a query that joins all three query types: the simple_query_string, the must-phrases and the must-not-phrases.
This commit is contained in:
		
							parent
							
								
									8f9400bb5f
								
							
						
					
					
						commit
						0b78428abc
					
				
					 2 changed files with 91 additions and 7 deletions
				
			
		|  | @ -23,6 +23,11 @@ settings: | |||
|           - my_ngram | ||||
|           - word_delimit | ||||
|           - trim_zero | ||||
|       # For exact matching - simple lowercase + whitespace delimiter | ||||
|       exact_analyzer: | ||||
|         tokenizer: whitespace | ||||
|         filter: | ||||
|           - lowercase | ||||
|       # For matching full words longer than the ngram limit (15 chars) | ||||
|       my_fullword_index_analyzer: | ||||
|         type: custom | ||||
|  | @ -32,13 +37,19 @@ settings: | |||
|         filter: | ||||
|           - lowercase | ||||
|           - word_delimit | ||||
|           # These should be enough, as my_index_analyzer will match the rest | ||||
|           # Skip tokens shorter than N characters, | ||||
|           # since they're already indexed in the main field | ||||
|           - fullword_min | ||||
| 
 | ||||
|     filter: | ||||
|       my_ngram: | ||||
|         type: edgeNGram | ||||
|         min_gram: 1 | ||||
|         max_gram: 15 | ||||
|       fullword_min: | ||||
|         type: length | ||||
|         # Remember to change this if you change the max_gram above! | ||||
|         min: 16 | ||||
|       resolution: | ||||
|         type: pattern_capture | ||||
|         patterns: ["(\\d+)[xX](\\d+)"] | ||||
|  | @ -85,6 +96,10 @@ mappings: | |||
|           fullword: | ||||
|             type: text | ||||
|             analyzer: my_fullword_index_analyzer | ||||
|           # Stored for exact phrase matching | ||||
|           exact: | ||||
|             type: text | ||||
|             analyzer: exact_analyzer | ||||
|       created_time: | ||||
|         type: date | ||||
|         # Only in the ES index for generating magnet links | ||||
|  |  | |||
|  | @ -69,6 +69,79 @@ def _generate_query_string(term, category, filter, user): | |||
|     return params | ||||
| 
 | ||||
| 
 | ||||
# For preprocessing ES search terms in _parse_es_search_terms.
# Matches a double-quoted literal, optionally prefixed with '-':
#   group 1: the '-' prefix, if present
#   group 2: the text between the quotes (non-greedy, may be empty)
# NOTE(review): the (?i) flag has no effect here, as the pattern contains no letters.
QUOTED_LITERAL_REGEX = re.compile(r'(?i)(-)?"(.*?)"')
| 
 | ||||
| 
 | ||||
def _es_name_exact_phrase(literal):
    ''' Builds a match_phrase Query for the given literal against the
        display_name.exact subfield, analyzed with exact_analyzer '''
    phrase_options = {
        'query': literal,
        'analyzer': 'exact_analyzer'
    }
    phrase_clause = {'display_name.exact': phrase_options}
    return Q({'match_phrase': phrase_clause})
| 
 | ||||
| 
 | ||||
def _parse_es_search_terms(search, search_terms):
    ''' Parse search terms into a query with properly handled literal phrases
        (the simple_query_string is not so great with exact results).
        For example:
            foo bar "hello world" -"exclude this"
        will become a must simple_query_string for "foo bar", a must match_phrase for
        "hello world" and a must_not match_phrase for "exclude this".

        Quoted parts that are empty or contain only whitespace (such as "")
        are removed from the terms without generating a phrase query.

        search: the search object the generated query is added to (via .query())
        search_terms: the raw search string

        Returns the search with the generated bool-query added to it, or the
        search as it was passed in if no queries were generated. '''

    # Literal must and must-not sets (sets, so repeated literals collapse into one)
    must_set = set()
    must_not_set = set()

    def literal_matcher(match):
        negated = bool(match.group(1))
        literal = match.group(2)

        # Ignore blank literals: a phrase query without any terms matches
        # nothing by default (see zero_terms_query in the ES docs), so a
        # stray "" in the must set would wipe out every result.
        if literal.strip():
            if negated:
                must_not_set.add(literal)
            else:
                must_set.add(literal)

        # Remove the parsed literal from search terms
        return ''

    # Remove quoted parts (optionally prepended with -) and store them in the sets
    parsed_search_terms = QUOTED_LITERAL_REGEX.sub(literal_matcher, search_terms).strip()

    # Create phrase matches (if any); sorted so the generated query is
    # the same for the same input regardless of set iteration order
    must_queries = [_es_name_exact_phrase(lit) for lit in sorted(must_set)]
    must_not_queries = [_es_name_exact_phrase(lit) for lit in sorted(must_not_set)]

    if parsed_search_terms:
        # Normal text search without the quoted parts
        must_queries.append(
            Q(
                'simple_query_string',
                # Query both fields, latter for words with >15 chars
                fields=['display_name', 'display_name.fullword'],
                analyzer='my_search_analyzer',
                default_operator="AND",
                query=parsed_search_terms
            )
        )

    if must_queries or must_not_queries:
        # Create a combined Query with the positive and negative matches
        combined_search_query = Q(
            'bool',
            must=must_queries,
            must_not=must_not_queries
        )
        search = search.query(combined_search_query)

    return search
| 
 | ||||
| 
 | ||||
| def search_elastic(term='', user=None, sort='id', order='desc', | ||||
|                    category='0_0', quality_filter='0', page=1, | ||||
|                    rss=False, admin=False, logged_in_user=None, | ||||
|  | @ -165,12 +238,8 @@ def search_elastic(term='', user=None, sort='id', order='desc', | |||
| 
 | ||||
|     # Apply search term | ||||
|     if term: | ||||
|         s = s.query('simple_query_string', | ||||
|                     # Query both fields, latter for words with >15 chars | ||||
|                     fields=['display_name', 'display_name.fullword'], | ||||
|                     analyzer='my_search_analyzer', | ||||
|                     default_operator="AND", | ||||
|                     query=term) | ||||
|         # Do some preprocessing on the search terms for literal "" matching | ||||
|         s = _parse_es_search_terms(s, term) | ||||
| 
 | ||||
|     # User view (/user/username) | ||||
|     if user: | ||||
|  |  | |||
		Loading…
	
	Add table
		
		Reference in a new issue
	
	 Anna-Maria Meriniemi
						Anna-Maria Meriniemi