mirror of
				https://github.com/sb745/NyaaV3.git
				synced 2025-10-31 16:05:46 +02:00 
			
		
		
		
	Extend ES term preprocessing for OR groups
Implements handling "foo"|"bar" literal OR groups in the Elasticsearch term preprocessor. Groups can be negated with -, but don't mesh with precedence (like plain literals). This is a partial hack, the real solution would be to parse the entire search terms ourselves, with AND and OR groups, negations etc. But having that work neatly with the simple_query_string would be bit of a hassle.
This commit is contained in:
		
							parent
							
								
									0b78428abc
								
							
						
					
					
						commit
						87502978c3
					
				
					 1 changed files with 40 additions and 5 deletions
				
			
		|  | @ -70,7 +70,18 @@ def _generate_query_string(term, category, filter, user): | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| # For preprocessing ES search terms in _parse_es_search_terms | # For preprocessing ES search terms in _parse_es_search_terms | ||||||
| QUOTED_LITERAL_REGEX = re.compile(r'(?i)(-)?"(.*?)"') | QUOTED_LITERAL_REGEX = re.compile(r'(?i)(-)?"(.+?)"') | ||||||
|  | QUOTED_LITERAL_GROUP_REGEX = re.compile(r''' | ||||||
|  |     (?i) | ||||||
|  |     (-)? # Negate entire group at once | ||||||
|  |     ( | ||||||
|  |         ".+?" # First literal | ||||||
|  |         (?: | ||||||
|  |             \|    # OR | ||||||
|  |             ".+?" # Second literal | ||||||
|  |         )+        # repeating | ||||||
|  |     ) | ||||||
|  |     ''', re.X) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def _es_name_exact_phrase(literal): | def _es_name_exact_phrase(literal): | ||||||
|  | @ -98,7 +109,30 @@ def _parse_es_search_terms(search, search_terms): | ||||||
|     must_set = set() |     must_set = set() | ||||||
|     must_not_set = set() |     must_not_set = set() | ||||||
| 
 | 
 | ||||||
|     def literal_matcher(match): |     must_or_groups = [] | ||||||
|  |     must_not_or_groups = [] | ||||||
|  | 
 | ||||||
|  |     def must_group_matcher(match): | ||||||
|  |         ''' Grabs [-]"foo"|"bar"[|"baz"...] groups from the search terms ''' | ||||||
|  |         negated = bool(match.group(1)) | ||||||
|  |         literal_group = match.group(2) | ||||||
|  | 
 | ||||||
|  |         literals = QUOTED_LITERAL_REGEX.findall(literal_group) | ||||||
|  |         group_query = Q( | ||||||
|  |             'bool', | ||||||
|  |             should=[_es_name_exact_phrase(lit_m[1]) for lit_m in literals] | ||||||
|  |         ) | ||||||
|  | 
 | ||||||
|  |         if negated: | ||||||
|  |             must_not_or_groups.append(group_query) | ||||||
|  |         else: | ||||||
|  |             must_or_groups.append(group_query) | ||||||
|  | 
 | ||||||
|  |         # Remove the parsed group from search terms | ||||||
|  |         return '' | ||||||
|  | 
 | ||||||
|  |     def must_matcher(match): | ||||||
|  |         ''' Grabs [-]"foo" literals from the search terms ''' | ||||||
|         negated = bool(match.group(1)) |         negated = bool(match.group(1)) | ||||||
|         literal = match.group(2) |         literal = match.group(2) | ||||||
| 
 | 
 | ||||||
|  | @ -111,11 +145,12 @@ def _parse_es_search_terms(search, search_terms): | ||||||
|         return '' |         return '' | ||||||
| 
 | 
 | ||||||
|     # Remove quoted parts (optionally prepended with -) and store them in the sets |     # Remove quoted parts (optionally prepended with -) and store them in the sets | ||||||
|     parsed_search_terms = QUOTED_LITERAL_REGEX.sub(literal_matcher, search_terms).strip() |     parsed_search_terms = QUOTED_LITERAL_GROUP_REGEX.sub(must_group_matcher, search_terms).strip() | ||||||
|  |     parsed_search_terms = QUOTED_LITERAL_REGEX.sub(must_matcher, parsed_search_terms).strip() | ||||||
| 
 | 
 | ||||||
|     # Create phrase matches (if any) |     # Create phrase matches (if any) | ||||||
|     must_queries = [_es_name_exact_phrase(lit) for lit in must_set] |     must_queries = [_es_name_exact_phrase(lit) for lit in must_set] + must_or_groups | ||||||
|     must_not_queries = [_es_name_exact_phrase(lit) for lit in must_not_set] |     must_not_queries = [_es_name_exact_phrase(lit) for lit in must_not_set] + must_not_or_groups | ||||||
| 
 | 
 | ||||||
|     if parsed_search_terms: |     if parsed_search_terms: | ||||||
|         # Normal text search without the quoted parts |         # Normal text search without the quoted parts | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		
		Reference in a new issue
	
	 TheAMM
						TheAMM