forked from cms/tibi-svelte-starter
✨ feat: enhance search capabilities and indexing across collections
This commit is contained in:
@@ -79,6 +79,7 @@ Tibi supports multiple search modes via collection `search:` config:
|
||||
- `filter`
|
||||
- `ngram`
|
||||
- `vector`
|
||||
- `combined`
|
||||
|
||||
Use explicit search configs when search is a real product feature. Auto-fallback is useful, but it is not a substitute for a deliberate retrieval model.
|
||||
|
||||
@@ -92,7 +93,7 @@ Use when:
|
||||
- exact field ownership of the text index is clear
|
||||
- keyword search is enough
|
||||
|
||||
Requires a text index.
|
||||
Requires a MongoDB text index (either the wildcard key `$text:$**` or a text index on specific fields).
|
||||
|
||||
### `regex`
|
||||
|
||||
@@ -100,9 +101,16 @@ Use when:
|
||||
|
||||
- the searchable fields are explicit
|
||||
- case-insensitive matching is enough
|
||||
- weighted field scoring is useful
|
||||
- weighted field scoring is useful (via `regex.weights: { "meta.title": 10, path: 5 }`)
|
||||
|
||||
Good for smaller datasets or precise keyed fields.
|
||||
Good for smaller datasets or precise keyed fields. Very easy to configure without external dependencies. Example:
|
||||
|
||||
```yaml
|
||||
search:
|
||||
- name: default
|
||||
mode: regex
|
||||
fields: [title, "alt.de", description]
|
||||
```
|
||||
|
||||
### `filter` or `eval`
|
||||
|
||||
@@ -121,23 +129,62 @@ Use when:
|
||||
- users search codes, names, transliterated terms, or partial inputs
|
||||
|
||||
This is enrichment-based search. It stores generated `_search` data and benefits from clear regeneration expectations.
|
||||
_Note:_ Field weighting is not natively supported within a single `ngram` search block, because all of its `fields` are concatenated into one large ngram index block per document.
|
||||
|
||||
### `vector`
|
||||
|
||||
Use when:
|
||||
|
||||
- semantic similarity matters more than literal keyword overlap
|
||||
- the project can support embedding-provider setup and operator cost expectations
|
||||
- the project can support embedding-provider setup (e.g. `bge-m3` in `api/config.yml`)
|
||||
- search quality justifies added complexity
|
||||
|
||||
Vector mode can use:
|
||||
Vector mode requires a registered provider.
|
||||
|
||||
- `fields`
|
||||
- custom `eval` transformation
|
||||
- `documentPrefix`
|
||||
- `queryPrefix`
|
||||
- `overflow: truncate|chunk`
|
||||
- `rrf` tuning for hybrid scoring
|
||||
### `combined` (RRF)
|
||||
|
||||
Use when:
|
||||
|
||||
- Hybrid search is required (e.g. `vector` + `ngram`, to match on semantic meaning while still tolerating typos).
|
||||
- You need to simulate field-weighting for `vector` or `ngram` by breaking them up into multiple search blocks and fusing them with different weights.
|
||||
|
||||
`mode: combined` uses Reciprocal Rank Fusion (RRF). It delegates execution to other configured search blocks (which should be hidden in the admin UI via `meta.hide: true`).
|
||||
|
||||
**Field-Weighting Workaround with combined:**
|
||||
Because `vector` and `ngram` concatenate all fields of a block, they cannot weight individual fields. To rank highly important fields (like titles) above deep content fields, create multiple ngram/vector blocks and boost the important one in the `combined` weights:
|
||||
|
||||
```yaml
|
||||
search:
|
||||
- name: main_search
|
||||
mode: combined
|
||||
rrf:
|
||||
k: 60
|
||||
topK: 100
|
||||
weights:
|
||||
semantic: 1.5
|
||||
fuzzy_important: 2.0 # Boosts matches in title/headline
|
||||
fuzzy_content: 0.5 # Lowers weight for deep text matches
|
||||
meta:
|
||||
label: { de: "Suche", en: "Search" }
|
||||
|
||||
- name: fuzzy_important
|
||||
mode: ngram
|
||||
fields: [name, "meta.title", "blocks.headline"]
|
||||
autoRegenerate: true
|
||||
meta: { hide: true }
|
||||
|
||||
- name: fuzzy_content
|
||||
mode: ngram
|
||||
fields: ["blocks.text", "blocks.items.answer"]
|
||||
autoRegenerate: true
|
||||
meta: { hide: true }
|
||||
|
||||
- name: semantic
|
||||
mode: vector
|
||||
fields: [name, "meta.title", "blocks.text"]
|
||||
vector: { provider: bge-m3 }
|
||||
autoRegenerate: true
|
||||
```
|
||||
|
||||
## Auto-regeneration and admin flows
|
||||
|
||||
|
||||
@@ -32,6 +32,18 @@ hooks:
|
||||
type: javascript
|
||||
file: hooks/filter_public.js
|
||||
|
||||
indexes:
|
||||
- name: content_active
|
||||
key:
|
||||
- contentId
|
||||
- active
|
||||
background: true
|
||||
|
||||
search:
|
||||
- name: default
|
||||
mode: regex
|
||||
fields: [author, message, contentId]
|
||||
|
||||
fields:
|
||||
- name: active
|
||||
type: boolean
|
||||
|
||||
@@ -52,6 +52,77 @@ permissions:
|
||||
put: true
|
||||
delete: true
|
||||
|
||||
indexes:
|
||||
- name: content_text
|
||||
key:
|
||||
- "$text:$**"
|
||||
defaultLanguage: none
|
||||
background: true
|
||||
- name: path_lang
|
||||
key:
|
||||
- path
|
||||
- lang
|
||||
unique: true
|
||||
background: true
|
||||
- name: translation
|
||||
key:
|
||||
- translationKey
|
||||
- lang
|
||||
unique: true
|
||||
background: true
|
||||
|
||||
search:
|
||||
- name: combined
|
||||
mode: combined
|
||||
rrf:
|
||||
k: 60
|
||||
topK: 100
|
||||
weights:
|
||||
semantic: 1.5
|
||||
fuzzy: 1.0
|
||||
meta:
|
||||
label: { de: "Suche (Kombiniert)", en: "Search (Combined)" }
|
||||
|
||||
- name: fuzzy
|
||||
mode: ngram
|
||||
autoRegenerate: true
|
||||
fields:
|
||||
[
|
||||
name,
|
||||
path,
|
||||
"meta.title",
|
||||
"meta.description",
|
||||
"blocks.headline",
|
||||
"blocks.subline",
|
||||
"blocks.text",
|
||||
"blocks.items.title",
|
||||
"blocks.items.text",
|
||||
"blocks.items.question",
|
||||
"blocks.items.answer",
|
||||
]
|
||||
meta:
|
||||
label: { de: "Fuzzy (n-gram)", en: "Fuzzy (n-gram)" }
|
||||
|
||||
- name: semantic
|
||||
mode: vector
|
||||
autoRegenerate: true
|
||||
fields:
|
||||
[
|
||||
name,
|
||||
"meta.title",
|
||||
"meta.description",
|
||||
"blocks.headline",
|
||||
"blocks.subline",
|
||||
"blocks.text",
|
||||
"blocks.items.title",
|
||||
"blocks.items.text",
|
||||
"blocks.items.question",
|
||||
"blocks.items.answer",
|
||||
]
|
||||
vector: { provider: bge-m3 }
|
||||
meta:
|
||||
label: { de: "Semantisch (Vektor)", en: "Semantic (Vector)" }
|
||||
|
||||
imageFilter: !include lib/imageFilter.yml
|
||||
|
||||
fields:
|
||||
|
||||
@@ -58,6 +58,11 @@ permissions:
|
||||
|
||||
imageFilter: !include lib/imageFilter.yml
|
||||
|
||||
search:
|
||||
- name: default
|
||||
mode: regex
|
||||
fields: [title, "alt.de", "alt.en", description]
|
||||
|
||||
fields:
|
||||
- name: file
|
||||
type: file
|
||||
|
||||
@@ -58,6 +58,19 @@ permissions:
|
||||
put: true
|
||||
delete: true
|
||||
|
||||
indexes:
|
||||
- name: type_language
|
||||
key:
|
||||
- type
|
||||
- language
|
||||
unique: true
|
||||
background: true
|
||||
|
||||
search:
|
||||
- name: default
|
||||
mode: regex
|
||||
fields: [type, "elements.name", "elements.page"]
|
||||
|
||||
fields:
|
||||
- name: language
|
||||
type: string
|
||||
|
||||
@@ -65,6 +65,18 @@ permissions:
|
||||
put: true
|
||||
delete: true
|
||||
|
||||
indexes:
|
||||
- name: name_unique
|
||||
key:
|
||||
- name
|
||||
unique: true
|
||||
background: true
|
||||
|
||||
search:
|
||||
- name: default
|
||||
mode: regex
|
||||
fields: [name, group]
|
||||
|
||||
fields:
|
||||
- name: name
|
||||
type: string
|
||||
|
||||
+1
-1
@@ -1,2 +1,2 @@
|
||||
ADMIN_TOKEN=5bdfjc78hdxn338cuhSJ
|
||||
ADMIN_ASSET_VERSION=f407946-dirty-1779042833381
|
||||
ADMIN_ASSET_VERSION=8cbf0db-dirty-1779049064994
|
||||
|
||||
@@ -40,6 +40,7 @@ services:
|
||||
image: gitbase.de/cms/tibi-server:nova
|
||||
volumes:
|
||||
- ./:/data
|
||||
- /WM_Dev/models:/models
|
||||
environment:
|
||||
DB_DIAL: mongodb://mongo
|
||||
DB_PREFIX: ${TIBI_PREFIX}
|
||||
@@ -47,6 +48,15 @@ services:
|
||||
SECURITY_ALLOWABSOLUTEPATHS: "true"
|
||||
SECURITY_ALLOWUPPERPATHS: "true"
|
||||
RESPONSE_ERROR_STACK: "true"
|
||||
# Make BGE-M3 available system-wide (and therefore the default when nothing else is specified)
|
||||
EMBEDDING_PROVIDERS: bge-m3
|
||||
# The BGE-M3-specific configuration settings
|
||||
EMBEDDING_BGE-M3_TYPE: native
|
||||
EMBEDDING_BGE-M3_MODELPATH: /models/bge-m3
|
||||
EMBEDDING_BGE-M3_DIMENSIONS: 1024
|
||||
# The vector prefixes that then apply as defaults in vector collections
|
||||
EMBEDDING_BGE-M3_DOCUMENTPREFIX: "search_document: "
|
||||
EMBEDDING_BGE-M3_QUERYPREFIX: "search_query: "
|
||||
depends_on:
|
||||
- mongo
|
||||
expose:
|
||||
|
||||
Reference in New Issue
Block a user