✨ feat: enhance search capabilities and indexing across collections
This commit is contained in:
@@ -79,6 +79,7 @@ Tibi supports multiple search modes via collection `search:` config:
|
|||||||
- `filter`
|
- `filter`
|
||||||
- `ngram`
|
- `ngram`
|
||||||
- `vector`
|
- `vector`
|
||||||
|
- `combined`
|
||||||
|
|
||||||
Use explicit search configs when search is a real product feature. Auto-fallback is useful, but it is not a substitute for a deliberate retrieval model.
|
Use explicit search configs when search is a real product feature. Auto-fallback is useful, but it is not a substitute for a deliberate retrieval model.
|
||||||
|
|
||||||
@@ -92,7 +93,7 @@ Use when:
|
|||||||
- exact field ownership of the text index is clear
|
- exact field ownership of the text index is clear
|
||||||
- keyword search is enough
|
- keyword search is enough
|
||||||
|
|
||||||
Requires a text index.
|
Requires a MongoDB text index (either the wildcard `$text:$**` index or a text index on specific fields).
|
||||||
|
|
||||||
### `regex`
|
### `regex`
|
||||||
|
|
||||||
@@ -100,9 +101,16 @@ Use when:
|
|||||||
|
|
||||||
- the searchable fields are explicit
|
- the searchable fields are explicit
|
||||||
- case-insensitive matching is enough
|
- case-insensitive matching is enough
|
||||||
- weighted field scoring is useful
|
- weighted field scoring is useful (via `regex.weights: { "meta.title": 10, path: 5 }`)
|
||||||
|
|
||||||
Good for smaller datasets or precise keyed fields.
|
Good for smaller datasets or precise keyed fields. It is easy to configure and needs no external dependencies. Example:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
search:
|
||||||
|
- name: default
|
||||||
|
mode: regex
|
||||||
|
fields: [title, "alt.de", description]
|
||||||
|
```
|
||||||
|
|
||||||
### `filter` or `eval`
|
### `filter` or `eval`
|
||||||
|
|
||||||
@@ -121,23 +129,62 @@ Use when:
|
|||||||
- users search codes, names, transliterated terms, or partial inputs
|
- users search codes, names, transliterated terms, or partial inputs
|
||||||
|
|
||||||
This is enrichment-based search. It stores generated `_search` data and benefits from clear regeneration expectations.
|
This is enrichment-based search. It stores generated `_search` data and benefits from clear regeneration expectations.
|
||||||
|
_Note:_ Field weighting is not natively supported inside a single `ngram` search block, because all of its `fields` are concatenated into one large ngram index entry per document.
|
||||||
|
|
||||||
### `vector`
|
### `vector`
|
||||||
|
|
||||||
Use when:
|
Use when:
|
||||||
|
|
||||||
- semantic similarity matters more than literal keyword overlap
|
- semantic similarity matters more than literal keyword overlap
|
||||||
- the project can support embedding-provider setup and operator cost expectations
|
- the project can support embedding-provider setup (e.g. `bge-m3` in `api/config.yml`)
|
||||||
- search quality justifies added complexity
|
- search quality justifies added complexity
|
||||||
|
|
||||||
Vector mode can use:
|
Vector mode requires a registered provider.
|
||||||
|
|
||||||
- `fields`
|
### `combined` (RRF)
|
||||||
- custom `eval` transformation
|
|
||||||
- `documentPrefix`
|
Use when:
|
||||||
- `queryPrefix`
|
|
||||||
- `overflow: truncate|chunk`
|
- hybrid search is required (e.g. `vector` + `ngram` to catch both typos and semantic meaning)
|
||||||
- `rrf` tuning for hybrid scoring
|
- you need to approximate field weighting for `vector` or `ngram` by splitting the fields across multiple search blocks and fusing them with different weights
|
||||||
|
|
||||||
|
`mode: combined` uses Reciprocal Rank Fusion (RRF). It delegates execution to other configured search blocks (which should be hidden in the admin UI via `meta.hide: true`).
|
||||||
|
|
||||||
|
**Field-weighting workaround with `combined`:**
|
||||||
|
Because `vector` and `ngram` concatenate all fields of a block, fields cannot be weighted within a single block. To rank highly important fields (like titles) above deep content fields, create multiple ngram/vector blocks and boost the important one in the `combined` weights:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
search:
|
||||||
|
- name: main_search
|
||||||
|
mode: combined
|
||||||
|
rrf:
|
||||||
|
k: 60
|
||||||
|
topK: 100
|
||||||
|
weights:
|
||||||
|
semantic: 1.5
|
||||||
|
fuzzy_important: 2.0 # Boosts matches in title/headline
|
||||||
|
fuzzy_content: 0.5 # Lowers weight for deep text matches
|
||||||
|
meta:
|
||||||
|
label: { de: "Suche", en: "Search" }
|
||||||
|
|
||||||
|
- name: fuzzy_important
|
||||||
|
mode: ngram
|
||||||
|
fields: [name, "meta.title", "blocks.headline"]
|
||||||
|
autoRegenerate: true
|
||||||
|
meta: { hide: true }
|
||||||
|
|
||||||
|
- name: fuzzy_content
|
||||||
|
mode: ngram
|
||||||
|
fields: ["blocks.text", "blocks.items.answer"]
|
||||||
|
autoRegenerate: true
|
||||||
|
meta: { hide: true }
|
||||||
|
|
||||||
|
- name: semantic
|
||||||
|
mode: vector
|
||||||
|
fields: [name, "meta.title", "blocks.text"]
|
||||||
|
vector: { provider: bge-m3 }
|
||||||
|
autoRegenerate: true
|
||||||
|
```
|
||||||
|
|
||||||
## Auto-regeneration and admin flows
|
## Auto-regeneration and admin flows
|
||||||
|
|
||||||
|
|||||||
@@ -32,6 +32,18 @@ hooks:
|
|||||||
type: javascript
|
type: javascript
|
||||||
file: hooks/filter_public.js
|
file: hooks/filter_public.js
|
||||||
|
|
||||||
|
indexes:
|
||||||
|
- name: content_active
|
||||||
|
key:
|
||||||
|
- contentId
|
||||||
|
- active
|
||||||
|
background: true
|
||||||
|
|
||||||
|
search:
|
||||||
|
- name: default
|
||||||
|
mode: regex
|
||||||
|
fields: [author, message, contentId]
|
||||||
|
|
||||||
fields:
|
fields:
|
||||||
- name: active
|
- name: active
|
||||||
type: boolean
|
type: boolean
|
||||||
|
|||||||
@@ -52,6 +52,77 @@ permissions:
|
|||||||
put: true
|
put: true
|
||||||
delete: true
|
delete: true
|
||||||
|
|
||||||
|
indexes:
|
||||||
|
- name: content_text
|
||||||
|
key:
|
||||||
|
- "$text:$**"
|
||||||
|
defaultLanguage: none
|
||||||
|
background: true
|
||||||
|
- name: path_lang
|
||||||
|
key:
|
||||||
|
- path
|
||||||
|
- lang
|
||||||
|
unique: true
|
||||||
|
background: true
|
||||||
|
- name: translation
|
||||||
|
key:
|
||||||
|
- translationKey
|
||||||
|
- lang
|
||||||
|
unique: true
|
||||||
|
background: true
|
||||||
|
|
||||||
|
search:
|
||||||
|
- name: combined
|
||||||
|
mode: combined
|
||||||
|
rrf:
|
||||||
|
k: 60
|
||||||
|
topK: 100
|
||||||
|
weights:
|
||||||
|
semantic: 1.5
|
||||||
|
fuzzy: 1.0
|
||||||
|
meta:
|
||||||
|
label: { de: "Suche (Kombiniert)", en: "Search (Combined)" }
|
||||||
|
|
||||||
|
- name: fuzzy
|
||||||
|
mode: ngram
|
||||||
|
autoRegenerate: true
|
||||||
|
fields:
|
||||||
|
[
|
||||||
|
name,
|
||||||
|
path,
|
||||||
|
"meta.title",
|
||||||
|
"meta.description",
|
||||||
|
"blocks.headline",
|
||||||
|
"blocks.subline",
|
||||||
|
"blocks.text",
|
||||||
|
"blocks.items.title",
|
||||||
|
"blocks.items.text",
|
||||||
|
"blocks.items.question",
|
||||||
|
"blocks.items.answer",
|
||||||
|
]
|
||||||
|
meta:
|
||||||
|
label: { de: "Fuzzy (n-gram)", en: "Fuzzy (n-gram)" }
|
||||||
|
|
||||||
|
- name: semantic
|
||||||
|
mode: vector
|
||||||
|
autoRegenerate: true
|
||||||
|
fields:
|
||||||
|
[
|
||||||
|
name,
|
||||||
|
"meta.title",
|
||||||
|
"meta.description",
|
||||||
|
"blocks.headline",
|
||||||
|
"blocks.subline",
|
||||||
|
"blocks.text",
|
||||||
|
"blocks.items.title",
|
||||||
|
"blocks.items.text",
|
||||||
|
"blocks.items.question",
|
||||||
|
"blocks.items.answer",
|
||||||
|
]
|
||||||
|
vector: { provider: bge-m3 }
|
||||||
|
meta:
|
||||||
|
label: { de: "Semantisch (Vektor)", en: "Semantic (Vector)" }
|
||||||
|
|
||||||
imageFilter: !include lib/imageFilter.yml
|
imageFilter: !include lib/imageFilter.yml
|
||||||
|
|
||||||
fields:
|
fields:
|
||||||
|
|||||||
@@ -58,6 +58,11 @@ permissions:
|
|||||||
|
|
||||||
imageFilter: !include lib/imageFilter.yml
|
imageFilter: !include lib/imageFilter.yml
|
||||||
|
|
||||||
|
search:
|
||||||
|
- name: default
|
||||||
|
mode: regex
|
||||||
|
fields: [title, "alt.de", "alt.en", description]
|
||||||
|
|
||||||
fields:
|
fields:
|
||||||
- name: file
|
- name: file
|
||||||
type: file
|
type: file
|
||||||
|
|||||||
@@ -58,6 +58,19 @@ permissions:
|
|||||||
put: true
|
put: true
|
||||||
delete: true
|
delete: true
|
||||||
|
|
||||||
|
indexes:
|
||||||
|
- name: type_language
|
||||||
|
key:
|
||||||
|
- type
|
||||||
|
- language
|
||||||
|
unique: true
|
||||||
|
background: true
|
||||||
|
|
||||||
|
search:
|
||||||
|
- name: default
|
||||||
|
mode: regex
|
||||||
|
fields: [type, "elements.name", "elements.page"]
|
||||||
|
|
||||||
fields:
|
fields:
|
||||||
- name: language
|
- name: language
|
||||||
type: string
|
type: string
|
||||||
|
|||||||
@@ -65,6 +65,18 @@ permissions:
|
|||||||
put: true
|
put: true
|
||||||
delete: true
|
delete: true
|
||||||
|
|
||||||
|
indexes:
|
||||||
|
- name: name_unique
|
||||||
|
key:
|
||||||
|
- name
|
||||||
|
unique: true
|
||||||
|
background: true
|
||||||
|
|
||||||
|
search:
|
||||||
|
- name: default
|
||||||
|
mode: regex
|
||||||
|
fields: [name, group]
|
||||||
|
|
||||||
fields:
|
fields:
|
||||||
- name: name
|
- name: name
|
||||||
type: string
|
type: string
|
||||||
|
|||||||
+1
-1
@@ -1,2 +1,2 @@
|
|||||||
ADMIN_TOKEN=5bdfjc78hdxn338cuhSJ
|
ADMIN_TOKEN=5bdfjc78hdxn338cuhSJ
|
||||||
ADMIN_ASSET_VERSION=f407946-dirty-1779042833381
|
ADMIN_ASSET_VERSION=8cbf0db-dirty-1779049064994
|
||||||
|
|||||||
@@ -40,6 +40,7 @@ services:
|
|||||||
image: gitbase.de/cms/tibi-server:nova
|
image: gitbase.de/cms/tibi-server:nova
|
||||||
volumes:
|
volumes:
|
||||||
- ./:/data
|
- ./:/data
|
||||||
|
- /WM_Dev/models:/models
|
||||||
environment:
|
environment:
|
||||||
DB_DIAL: mongodb://mongo
|
DB_DIAL: mongodb://mongo
|
||||||
DB_PREFIX: ${TIBI_PREFIX}
|
DB_PREFIX: ${TIBI_PREFIX}
|
||||||
@@ -47,6 +48,15 @@ services:
|
|||||||
SECURITY_ALLOWABSOLUTEPATHS: "true"
|
SECURITY_ALLOWABSOLUTEPATHS: "true"
|
||||||
SECURITY_ALLOWUPPERPATHS: "true"
|
SECURITY_ALLOWUPPERPATHS: "true"
|
||||||
RESPONSE_ERROR_STACK: "true"
|
RESPONSE_ERROR_STACK: "true"
|
||||||
|
# Make BGE-M3 available system-wide (and therefore the default when nothing else is specified)
|
||||||
|
EMBEDDING_PROVIDERS: bge-m3
|
||||||
|
# BGE-M3-specific configuration settings
|
||||||
|
EMBEDDING_BGE-M3_TYPE: native
|
||||||
|
EMBEDDING_BGE-M3_MODELPATH: /models/bge-m3
|
||||||
|
EMBEDDING_BGE-M3_DIMENSIONS: 1024
|
||||||
|
# Vector prefixes that then apply as defaults in vector collections
|
||||||
|
EMBEDDING_BGE-M3_DOCUMENTPREFIX: "search_document: "
|
||||||
|
EMBEDDING_BGE-M3_QUERYPREFIX: "search_query: "
|
||||||
depends_on:
|
depends_on:
|
||||||
- mongo
|
- mongo
|
||||||
expose:
|
expose:
|
||||||
|
|||||||
Reference in New Issue
Block a user