feat: enhance search capabilities and indexing across collections

This commit is contained in:
2026-05-17 20:33:22 +00:00
parent 8cbf0db14f
commit 2d52272b2e
8 changed files with 182 additions and 12 deletions
+58 -11
View File
@@ -79,6 +79,7 @@ Tibi supports multiple search modes via collection `search:` config:
- `filter`
- `ngram`
- `vector`
- `combined`
Use explicit search configs when search is a real product feature. Auto-fallback is useful, but it is not a substitute for a deliberate retrieval model.
@@ -92,7 +93,7 @@ Use when:
- exact field ownership of the text index is clear
- keyword search is enough
Requires a text index.
Requires a MongoDB text index (either the wildcard key `$text:$**` or one on specific fields).
### `regex`
@@ -100,9 +101,16 @@ Use when:
- the searchable fields are explicit
- case-insensitive matching is enough
- weighted field scoring is useful
- weighted field scoring is useful (via `regex.weights: { "meta.title": 10, path: 5 }`)
Good for smaller datasets or precise keyed fields.
Good for smaller datasets or precise keyed fields. It is easy to configure and needs no external dependencies. Example:
```yaml
search:
- name: default
mode: regex
fields: [title, "alt.de", description]
```
### `filter` or `eval`
@@ -121,23 +129,62 @@ Use when:
- users search codes, names, transliterated terms, or partial inputs
This is enrichment-based search. It stores generated `_search` data and benefits from clear regeneration expectations.
_Note:_ Field weighting is not natively supported within a single `ngram` search block, because all of its `fields` are concatenated into one large ngram index block per document.
### `vector`
Use when:
- semantic similarity matters more than literal keyword overlap
- the project can support embedding-provider setup and operator cost expectations
- the project can support embedding-provider setup (e.g. `bge-m3` in `api/config.yml`)
- search quality justifies added complexity
Vector mode can use:
Vector mode requires a registered provider.
- `fields`
- custom `eval` transformation
- `documentPrefix`
- `queryPrefix`
- `overflow: truncate|chunk`
- `rrf` tuning for hybrid scoring
### `combined` (RRF)
Use when:
- Hybrid search is required (e.g. `vector` + `ngram` to catch typos and semantic meaning).
- You need to simulate field-weighting for `vector` or `ngram` by breaking them up into multiple search blocks and fusing them with different weights.
`mode: combined` uses Reciprocal Rank Fusion (RRF). It delegates execution to other configured search blocks (which should be hidden in the admin UI via `meta.hide: true`).
**Field-Weighting Workaround with combined:**
Because `vector` and `ngram` each concatenate all of their fields, you can rank important fields (like titles) above deep content fields by splitting them into separate ngram/vector blocks and giving the important block a higher weight in the `combined` weights:
```yaml
search:
# Entry point: fuses the delegated blocks below; weight keys match their names.
- name: main_search
mode: combined
rrf:
k: 60
topK: 100
weights:
semantic: 1.5
fuzzy_important: 2.0 # Boosts matches in title/headline
fuzzy_content: 0.5 # Lowers weight for deep text matches
meta:
label: { de: "Suche", en: "Search" }
# Delegated ngram block for the high-value fields.
- name: fuzzy_important
mode: ngram
fields: [name, "meta.title", "blocks.headline"]
autoRegenerate: true
meta: { hide: true }
# Delegated ngram block for the deep content fields.
- name: fuzzy_content
mode: ngram
fields: ["blocks.text", "blocks.items.answer"]
autoRegenerate: true
meta: { hide: true }
# Delegated vector block; hidden in the admin UI like the other delegated blocks.
- name: semantic
mode: vector
fields: [name, "meta.title", "blocks.text"]
vector: { provider: bge-m3 }
autoRegenerate: true
meta: { hide: true }
```
## Auto-regeneration and admin flows
+12
View File
@@ -32,6 +32,18 @@ hooks:
type: javascript
file: hooks/filter_public.js
# Compound index over contentId and active.
indexes:
  - name: content_active
    key: [contentId, active]
    background: true
# Default search: regex mode over the listed fields.
search:
  - name: default
    mode: regex
    fields:
      - author
      - message
      - contentId
fields:
- name: active
type: boolean
+71
View File
@@ -52,6 +52,77 @@ permissions:
put: true
delete: true
indexes:
  # Wildcard text index across all fields.
  - name: content_text
    key: ["$text:$**"]
    defaultLanguage: none
    background: true
  # Unique pair: path + lang.
  - name: path_lang
    key: [path, lang]
    unique: true
    background: true
  # Unique pair: translationKey + lang.
  - name: translation
    key: [translationKey, lang]
    unique: true
    background: true
# Search configuration: one combined entry plus the ngram and vector blocks it weights.
search:
# Combined mode; the weight keys (`semantic`, `fuzzy`) match the names of the blocks below.
- name: combined
mode: combined
rrf:
k: 60
topK: 100
weights:
semantic: 1.5
fuzzy: 1.0
meta:
label: { de: "Suche (Kombiniert)", en: "Search (Combined)" }
# ngram block over name, path, meta fields and block text fields.
# NOTE(review): this block and `semantic` carry labels instead of `meta.hide: true` —
# confirm they are meant to stay selectable on their own.
- name: fuzzy
mode: ngram
autoRegenerate: true
fields:
[
name,
path,
"meta.title",
"meta.description",
"blocks.headline",
"blocks.subline",
"blocks.text",
"blocks.items.title",
"blocks.items.text",
"blocks.items.question",
"blocks.items.answer",
]
meta:
label: { de: "Fuzzy (n-gram)", en: "Fuzzy (n-gram)" }
# Vector block using the bge-m3 provider; same field list as `fuzzy` except `path`.
- name: semantic
mode: vector
autoRegenerate: true
fields:
[
name,
"meta.title",
"meta.description",
"blocks.headline",
"blocks.subline",
"blocks.text",
"blocks.items.title",
"blocks.items.text",
"blocks.items.question",
"blocks.items.answer",
]
vector: { provider: bge-m3 }
meta:
label: { de: "Semantisch (Vektor)", en: "Semantic (Vector)" }
imageFilter: !include lib/imageFilter.yml
fields:
+5
View File
@@ -58,6 +58,11 @@ permissions:
imageFilter: !include lib/imageFilter.yml
# Default search: regex mode over title, both alt texts and description.
search:
  - name: default
    mode: regex
    fields:
      - title
      - "alt.de"
      - "alt.en"
      - description
fields:
- name: file
type: file
+13
View File
@@ -58,6 +58,19 @@ permissions:
put: true
delete: true
# Unique pair: type + language.
indexes:
  - name: type_language
    key: [type, language]
    unique: true
    background: true
# Default search: regex mode over type and the nested element fields.
search:
  - name: default
    mode: regex
    fields:
      - type
      - "elements.name"
      - "elements.page"
fields:
- name: language
type: string
+12
View File
@@ -65,6 +65,18 @@ permissions:
put: true
delete: true
# Unique index on name.
indexes:
  - name: name_unique
    key: [name]
    unique: true
    background: true
# Default search: regex mode over name and group.
search:
  - name: default
    mode: regex
    fields:
      - name
      - group
fields:
- name: name
type: string
+1 -1
View File
@@ -1,2 +1,2 @@
# NOTE(review): this token is committed in plain text — rotate it and supply it from an untracked or secret source.
ADMIN_TOKEN=5bdfjc78hdxn338cuhSJ
ADMIN_ASSET_VERSION=f407946-dirty-1779042833381
ADMIN_ASSET_VERSION=8cbf0db-dirty-1779049064994
+10
View File
@@ -40,6 +40,7 @@ services:
image: gitbase.de/cms/tibi-server:nova
volumes:
- ./:/data
- /WM_Dev/models:/models
environment:
DB_DIAL: mongodb://mongo
DB_PREFIX: ${TIBI_PREFIX}
@@ -47,6 +48,15 @@ services:
SECURITY_ALLOWABSOLUTEPATHS: "true"
SECURITY_ALLOWUPPERPATHS: "true"
RESPONSE_ERROR_STACK: "true"
# Make BGE-M3 available system-wide (and thus the default when nothing else is specified)
EMBEDDING_PROVIDERS: bge-m3
# BGE-M3-specific configuration settings
EMBEDDING_BGE-M3_TYPE: native
EMBEDDING_BGE-M3_MODELPATH: /models/bge-m3
EMBEDDING_BGE-M3_DIMENSIONS: 1024
# Vector prefixes that then apply as defaults in vector collections
EMBEDDING_BGE-M3_DOCUMENTPREFIX: "search_document: "
EMBEDDING_BGE-M3_QUERYPREFIX: "search_query: "
depends_on:
- mongo
expose: