✨ feat: enhance search capabilities and indexing across collections

2026-05-17 20:33:22 +00:00
parent 8cbf0db14f
commit 2d52272b2e
8 changed files with 182 additions and 12 deletions
@@ -79,6 +79,7 @@ Tibi supports multiple search modes via collection `search:` config:
 - `filter`
 - `ngram`
 - `vector`
+- `combined`

 Use explicit search configs when search is a real product feature. Auto-fallback is useful, but it is not a substitute for a deliberate retrieval model.

@@ -92,7 +93,7 @@ Use when:
 - exact field ownership of the text index is clear
 - keyword search is enough

-Requires a text index.
+Requires a MongoDB text index (either the wildcard form `$text:$**` or one on specific fields).

 ### `regex`

@@ -100,9 +101,16 @@ Use when:

 - the searchable fields are explicit
 - case-insensitive matching is enough
- weighted field scoring is useful
+- weighted field scoring is useful (via `regex.weights: { "meta.title": 10, path: 5 }`)

-Good for smaller datasets or precise keyed fields.
+Good for smaller datasets or precise keyed fields. Very easy to configure without external dependencies. Example:
+
+```yaml
+search:
+    - name: default
+      mode: regex
+      fields: [title, "alt.de", description]
+```

 ### `filter` or `eval`

@@ -121,23 +129,62 @@ Use when:
 - users search codes, names, transliterated terms, or partial inputs

 This is enrichment-based search. It stores generated `_search` data and benefits from clear regeneration expectations.
+_Note:_ Field weighting is not natively supported within a single `ngram` search block, because all of its `fields` are concatenated into one large n-gram index block per document.

 ### `vector`

 Use when:

 - semantic similarity matters more than literal keyword overlap
- the project can support embedding-provider setup and operator cost expectations
+- the project can support embedding-provider setup (e.g. `bge-m3` in `api/config.yml`)
 - search quality justifies added complexity

-Vector mode can use:
+Vector mode requires a registered provider.

- `fields`
- custom `eval` transformation
- `documentPrefix`
- `queryPrefix`
- `overflow: truncate|chunk`
- `rrf` tuning for hybrid scoring
+### `combined` (RRF)
+
+Use when:
+
+- Hybrid search is required (e.g. `vector` + `ngram` to catch typos and semantic meaning).
+- You need to simulate field-weighting for `vector` or `ngram` by breaking them up into multiple search blocks and fusing them with different weights.
+
+`mode: combined` uses Reciprocal Rank Fusion (RRF). It delegates execution to the other configured search blocks named in its `rrf.weights`; blocks that exist only as inputs to the fusion can be hidden in the admin UI via `meta.hide: true`.
+
+**Field-weighting workaround with `combined`:**
+Because `vector` and `ngram` each concatenate all of their fields, a single block cannot weight one field above another. To rank highly important fields (such as titles) above deep content fields, create separate ngram/vector blocks per importance tier and give the important block a larger weight under `rrf.weights` of the `combined` block:
+
+```yaml
+search:
+    - name: main_search
+      mode: combined
+      rrf:
+          k: 60
+          topK: 100
+          weights:
+              semantic: 1.5
+              fuzzy_important: 2.0 # Boosts matches in title/headline
+              fuzzy_content: 0.5 # Lowers weight for deep text matches
+      meta:
+          label: { de: "Suche", en: "Search" }
+
+    - name: fuzzy_important
+      mode: ngram
+      fields: [name, "meta.title", "blocks.headline"]
+      autoRegenerate: true
+      meta: { hide: true }
+
+    - name: fuzzy_content
+      mode: ngram
+      fields: ["blocks.text", "blocks.items.answer"]
+      autoRegenerate: true
+      meta: { hide: true }
+
+    - name: semantic
+      mode: vector
+      fields: [name, "meta.title", "blocks.text"]
+      vector: { provider: bge-m3 }
+      autoRegenerate: true
+```

 ## Auto-regeneration and admin flows

@@ -32,6 +32,18 @@ hooks:
      type: javascript
      file: hooks/filter_public.js

+indexes:
+  - name: content_active
+    key:
+      - contentId
+      - active
+    background: true
+
+search:
+  - name: default
+    mode: regex
+    fields: [author, message, contentId]
+
 fields:
  - name: active
    type: boolean
@@ -52,6 +52,77 @@ permissions:
      put: true
      delete: true

+indexes:
+  - name: content_text
+    key:
+      - "$text:$**"
+    defaultLanguage: none
+    background: true
+  - name: path_lang
+    key:
+      - path
+      - lang
+    unique: true
+    background: true
+  - name: translation
+    key:
+      - translationKey
+      - lang
+    unique: true
+    background: true
+
+search:
+  - name: combined
+    mode: combined
+    rrf:
+      k: 60
+      topK: 100
+      weights:
+        semantic: 1.5
+        fuzzy: 1.0
+    meta:
+      label: { de: "Suche (Kombiniert)", en: "Search (Combined)" }
+
+  - name: fuzzy
+    mode: ngram
+    autoRegenerate: true
+    fields:
+      [
+        name,
+        path,
+        "meta.title",
+        "meta.description",
+        "blocks.headline",
+        "blocks.subline",
+        "blocks.text",
+        "blocks.items.title",
+        "blocks.items.text",
+        "blocks.items.question",
+        "blocks.items.answer",
+      ]
+    meta:
+      label: { de: "Fuzzy (n-gram)", en: "Fuzzy (n-gram)" }
+
+  - name: semantic
+    mode: vector
+    autoRegenerate: true
+    fields:
+      [
+        name,
+        "meta.title",
+        "meta.description",
+        "blocks.headline",
+        "blocks.subline",
+        "blocks.text",
+        "blocks.items.title",
+        "blocks.items.text",
+        "blocks.items.question",
+        "blocks.items.answer",
+      ]
+    vector: { provider: bge-m3 }
+    meta:
+      label: { de: "Semantisch (Vektor)", en: "Semantic (Vector)" }
+
 imageFilter: !include lib/imageFilter.yml

 fields:
@@ -58,6 +58,11 @@ permissions:

 imageFilter: !include lib/imageFilter.yml

+search:
+  - name: default
+    mode: regex
+    fields: [title, "alt.de", "alt.en", description]
+
 fields:
  - name: file
    type: file
@@ -58,6 +58,19 @@ permissions:
      put: true
      delete: true

+indexes:
+  - name: type_language
+    key:
+      - type
+      - language
+    unique: true
+    background: true
+
+search:
+  - name: default
+    mode: regex
+    fields: [type, "elements.name", "elements.page"]
+
 fields:
  - name: language
    type: string
@@ -65,6 +65,18 @@ permissions:
      put: true
      delete: true

+indexes:
+  - name: name_unique
+    key:
+      - name
+    unique: true
+    background: true
+
+search:
+  - name: default
+    mode: regex
+    fields: [name, group]
+
 fields:
  - name: name
    type: string
@@ -1,2 +1,2 @@
 ADMIN_TOKEN=5bdfjc78hdxn338cuhSJ
-ADMIN_ASSET_VERSION=f407946-dirty-1779042833381
+ADMIN_ASSET_VERSION=8cbf0db-dirty-1779049064994
@@ -40,6 +40,7 @@ services:
        image: gitbase.de/cms/tibi-server:nova
        volumes:
            - ./:/data
+            - /WM_Dev/models:/models
        environment:
            DB_DIAL: mongodb://mongo
            DB_PREFIX: ${TIBI_PREFIX}
@@ -47,6 +48,15 @@ services:
            SECURITY_ALLOWABSOLUTEPATHS: "true"
            SECURITY_ALLOWUPPERPATHS: "true"
            RESPONSE_ERROR_STACK: "true"
+            # Make BGE-M3 available system-wide (and thus the default when nothing else is specified)
+            EMBEDDING_PROVIDERS: bge-m3
+            # BGE-M3-specific configuration settings
+            EMBEDDING_BGE-M3_TYPE: native
+            EMBEDDING_BGE-M3_MODELPATH: /models/bge-m3
+            EMBEDDING_BGE-M3_DIMENSIONS: 1024
+            # Vector prefixes that then apply as the defaults in vector collections
+            EMBEDDING_BGE-M3_DOCUMENTPREFIX: "search_document: "
+            EMBEDDING_BGE-M3_QUERYPREFIX: "search_query: "
        depends_on:
            - mongo
        expose: