From 2d52272b2e3bc7701543dc9a27c325c602540432 Mon Sep 17 00:00:00 2001 From: Sebastian Frank Date: Sun, 17 May 2026 20:33:22 +0000 Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20feat:=20enhance=20search=20capabili?= =?UTF-8?q?ties=20and=20indexing=20across=20collections?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .agents/skills/search-and-embeddings/SKILL.md | 69 +++++++++++++++--- api/collections/comment.yml | 12 ++++ api/collections/content.yml | 71 +++++++++++++++++++ api/collections/medialib.yml | 5 ++ api/collections/navigation.yml | 13 ++++ api/collections/tag.yml | 12 ++++ api/config.yml.env | 2 +- docker-compose-local.yml | 10 +++ 8 files changed, 182 insertions(+), 12 deletions(-) diff --git a/.agents/skills/search-and-embeddings/SKILL.md b/.agents/skills/search-and-embeddings/SKILL.md index 176d596..886ef39 100644 --- a/.agents/skills/search-and-embeddings/SKILL.md +++ b/.agents/skills/search-and-embeddings/SKILL.md @@ -79,6 +79,7 @@ Tibi supports multiple search modes via collection `search:` config: - `filter` - `ngram` - `vector` +- `combined` Use explicit search configs when search is a real product feature. Auto-fallback is useful, but it is not a substitute for a deliberate retrieval model. @@ -92,7 +93,7 @@ Use when: - exact field ownership of the text index is clear - keyword search is enough -Requires a text index. +Requires a MongoDB text index (wildcard `$text:$**` or field-specific). ### `regex` @@ -100,9 +101,16 @@ Use when: - the searchable fields are explicit - case-insensitive matching is enough -- weighted field scoring is useful +- weighted field scoring is useful (via `regex.weights: { "meta.title": 10, path: 5 }`) -Good for smaller datasets or precise keyed fields. +Good for smaller datasets or precise keyed fields. Very easy to configure without external dependencies. 
Example: + +```yaml +search: + - name: default + mode: regex + fields: [title, "alt.de", description] +``` ### `filter` or `eval` @@ -121,23 +129,62 @@ Use when: - users search codes, names, transliterated terms, or partial inputs This is enrichment-based search. It stores generated `_search` data and benefits from clear regeneration expectations. +_Note:_ Field weighting is not natively supported inside a single `ngram` mode, because all `fields` are concatenated into one large ngram index block per document. ### `vector` Use when: - semantic similarity matters more than literal keyword overlap -- the project can support embedding-provider setup and operator cost expectations +- the project can support embedding-provider setup (e.g. `bge-m3` in `api/config.yml`) - search quality justifies added complexity -Vector mode can use: +Vector mode requires a registered provider. -- `fields` -- custom `eval` transformation -- `documentPrefix` -- `queryPrefix` -- `overflow: truncate|chunk` -- `rrf` tuning for hybrid scoring +### `combined` (RRF) + +Use when: + +- Hybrid search is required (e.g. `vector` + `ngram` to catch typos and semantic meaning). +- You need to simulate field-weighting for `vector` or `ngram` by breaking them up into multiple search blocks and fusing them with different weights. + +`mode: combined` uses Reciprocal Rank Fusion (RRF). It delegates execution to other configured search blocks (which should be hidden in admin UI via `meta.hide: true`). 
+ +**Field-Weighting Workaround with combined:** +Because `vector` and `ngram` concatenate all fields, you can weight highly important fields (like titles) higher than deep content fields by creating multiple ngram/vector blocks and boosting the important one in the `combined` weights: + +```yaml +search: + - name: main_search + mode: combined + rrf: + k: 60 + topK: 100 + weights: + semantic: 1.5 + fuzzy_important: 2.0 # Boosts matches in title/headline + fuzzy_content: 0.5 # Lowers weight for deep text matches + meta: + label: { de: "Suche", en: "Search" } + + - name: fuzzy_important + mode: ngram + fields: [name, "meta.title", "blocks.headline"] + autoRegenerate: true + meta: { hide: true } + + - name: fuzzy_content + mode: ngram + fields: ["blocks.text", "blocks.items.answer"] + autoRegenerate: true + meta: { hide: true } + + - name: semantic + mode: vector + fields: [name, "meta.title", "blocks.text"] + vector: { provider: bge-m3 } + autoRegenerate: true +``` ## Auto-regeneration and admin flows diff --git a/api/collections/comment.yml b/api/collections/comment.yml index 05fad29..2adb259 100644 --- a/api/collections/comment.yml +++ b/api/collections/comment.yml @@ -32,6 +32,18 @@ hooks: type: javascript file: hooks/filter_public.js +indexes: + - name: content_active + key: + - contentId + - active + background: true + +search: + - name: default + mode: regex + fields: [author, message, contentId] + fields: - name: active type: boolean diff --git a/api/collections/content.yml b/api/collections/content.yml index 9475635..7cdb214 100644 --- a/api/collections/content.yml +++ b/api/collections/content.yml @@ -52,6 +52,77 @@ permissions: put: true delete: true +indexes: + - name: content_text + key: + - "$text:$**" + defaultLanguage: none + background: true + - name: path_lang + key: + - path + - lang + unique: true + background: true + - name: translation + key: + - translationKey + - lang + unique: true + background: true + +search: + - name: combined + mode: 
combined + rrf: + k: 60 + topK: 100 + weights: + semantic: 1.5 + fuzzy: 1.0 + meta: + label: { de: "Suche (Kombiniert)", en: "Search (Combined)" } + + - name: fuzzy + mode: ngram + autoRegenerate: true + fields: + [ + name, + path, + "meta.title", + "meta.description", + "blocks.headline", + "blocks.subline", + "blocks.text", + "blocks.items.title", + "blocks.items.text", + "blocks.items.question", + "blocks.items.answer", + ] + meta: + label: { de: "Fuzzy (n-gram)", en: "Fuzzy (n-gram)" } + + - name: semantic + mode: vector + autoRegenerate: true + fields: + [ + name, + "meta.title", + "meta.description", + "blocks.headline", + "blocks.subline", + "blocks.text", + "blocks.items.title", + "blocks.items.text", + "blocks.items.question", + "blocks.items.answer", + ] + vector: { provider: bge-m3 } + meta: + label: { de: "Semantisch (Vektor)", en: "Semantic (Vector)" } + imageFilter: !include lib/imageFilter.yml fields: diff --git a/api/collections/medialib.yml b/api/collections/medialib.yml index 30974a8..f4755c4 100644 --- a/api/collections/medialib.yml +++ b/api/collections/medialib.yml @@ -58,6 +58,11 @@ permissions: imageFilter: !include lib/imageFilter.yml +search: + - name: default + mode: regex + fields: [title, "alt.de", "alt.en", description] + fields: - name: file type: file diff --git a/api/collections/navigation.yml b/api/collections/navigation.yml index 19d8024..9cbd3b5 100644 --- a/api/collections/navigation.yml +++ b/api/collections/navigation.yml @@ -58,6 +58,19 @@ permissions: put: true delete: true +indexes: + - name: type_language + key: + - type + - language + unique: true + background: true + +search: + - name: default + mode: regex + fields: [type, "elements.name", "elements.page"] + fields: - name: language type: string diff --git a/api/collections/tag.yml b/api/collections/tag.yml index 7cbe552..74ba0b4 100644 --- a/api/collections/tag.yml +++ b/api/collections/tag.yml @@ -65,6 +65,18 @@ permissions: put: true delete: true +indexes: + - name: 
name_unique + key: + - name + unique: true + background: true + +search: + - name: default + mode: regex + fields: [name, group] + fields: - name: name type: string diff --git a/api/config.yml.env b/api/config.yml.env index 3087732..f2fad4d 100644 --- a/api/config.yml.env +++ b/api/config.yml.env @@ -1,2 +1,2 @@ ADMIN_TOKEN=5bdfjc78hdxn338cuhSJ -ADMIN_ASSET_VERSION=f407946-dirty-1779042833381 +ADMIN_ASSET_VERSION=8cbf0db-dirty-1779049064994 diff --git a/docker-compose-local.yml b/docker-compose-local.yml index 7820293..4d803ab 100644 --- a/docker-compose-local.yml +++ b/docker-compose-local.yml @@ -40,6 +40,7 @@ services: image: gitbase.de/cms/tibi-server:nova volumes: - ./:/data + - /WM_Dev/models:/models environment: DB_DIAL: mongodb://mongo DB_PREFIX: ${TIBI_PREFIX} @@ -47,6 +48,15 @@ services: SECURITY_ALLOWABSOLUTEPATHS: "true" SECURITY_ALLOWUPPERPATHS: "true" RESPONSE_ERROR_STACK: "true" + # Make BGE-M3 available system-wide (and thus the default when nothing else is specified) + EMBEDDING_PROVIDERS: bge-m3 + # BGE-M3-specific configuration settings + EMBEDDING_BGE-M3_TYPE: native + EMBEDDING_BGE-M3_MODELPATH: /models/bge-m3 + EMBEDDING_BGE-M3_DIMENSIONS: 1024 + # Vector prefixes that then apply as defaults in vector collections + EMBEDDING_BGE-M3_DOCUMENTPREFIX: "search_document: " + EMBEDDING_BGE-M3_QUERYPREFIX: "search_query: " depends_on: - mongo expose: