From 50e80e1adce6bd93e88250ca3dae0b6bd03b5b2b Mon Sep 17 00:00:00 2001
From: Norbert Klockiewicz <Nklockiewicz12@gmail.com>
Date: Mon, 22 Jun 2026 12:44:04 +0200
Subject: [PATCH 1/7] feat: LFM2.5 text-embedding + ColBERT (MLX/XNNPACK) with
 prompts & MaxSim
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add the LFM2.5-Embedding-350M and LFM2.5-ColBERT-350M models, served from
HuggingFace (MLX on iOS, XNNPACK on Android / iOS simulator).

Text embeddings are unified into one runner and one hook: the native
TextEmbeddings model returns the raw [numTokens, embeddingDim] matrix
(numTokens === 1 for pooled models, the full sequence for multi-vector /
late-interaction models like ColBERT), plus the input token ids. The TS
layer reduces it — toVector() for the single-vector case, getTokenVectors()
and maxSim() for late interaction.

Models trained with asymmetric query/document prompts (LFM uses "query:" /
"document:", ColBERT uses "[Q] " / "[D] ") carry a "prompts" config; forward
then requires a role argument ('query' | 'document') that auto-prepends the
matching prompt. The role is type-enforced: required for prompted models,
forbidden for plain ones.

Also: tokenizer post_processor is now applied for text embeddings so the
BOS special token is added (CLS-pooled models depend on it), and the
text-to-image Encoder reads the new EmbeddingResult.

Example app gains a semantic-search screen and a ColBERT late-interaction
search screen demonstrating MaxSim.

Authored with Claude.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 apps/text-embeddings/app/_layout.tsx          |   8 +
 .../app/clip-embeddings/index.tsx             |   3 +-
 apps/text-embeddings/app/colbert/index.tsx    | 289 ++++++++++++
 .../app/text-embeddings/index.tsx             | 439 ++++++++++--------
 .../common/rnexecutorch/TokenizerModule.cpp   |  19 +
 .../common/rnexecutorch/TokenizerModule.h     |   6 +
 .../host_objects/JsiConversions.h             |  30 ++
 .../rnexecutorch/models/embeddings/Types.h    |  23 +
 .../models/embeddings/text/TextEmbeddings.cpp |  22 +-
 .../models/embeddings/text/TextEmbeddings.h   |   8 +-
 .../models/text_to_image/Encoder.cpp          |   7 +-
 .../src/constants/modelRegistry.ts            |  58 +++
 .../src/constants/modelUrls.ts                |  15 +
 .../useTextEmbeddings.ts                      |  18 +-
 packages/react-native-executorch/src/index.ts |   1 +
 .../TextEmbeddingsModule.ts                   |  64 ++-
 .../src/types/textEmbeddings.ts               | 129 +++--
 .../src/utils/textEmbeddings.ts               |  74 +++
 18 files changed, 938 insertions(+), 275 deletions(-)
 create mode 100644 apps/text-embeddings/app/colbert/index.tsx
 create mode 100644 packages/react-native-executorch/common/rnexecutorch/models/embeddings/Types.h
 create mode 100644 packages/react-native-executorch/src/utils/textEmbeddings.ts
diff --git a/apps/text-embeddings/app/_layout.tsx b/apps/text-embeddings/app/_layout.tsx
index bb8e1deeb8..57acb26eb2 100644
--- a/apps/text-embeddings/app/_layout.tsx
+++ b/apps/text-embeddings/app/_layout.tsx
@@ -109,6 +109,14 @@ export default function _layout() {
             headerTitleStyle: { color: ColorPalette.primary },
           }}
         />
+        <Drawer.Screen
+          name="colbert/index"
+          options={{
+            drawerLabel: 'ColBERT search',
+            title: 'ColBERT search',
+            headerTitleStyle: { color: ColorPalette.primary },
+          }}
+        />
       </Drawer>
     </GeneratingContext>
   );
diff --git a/apps/text-embeddings/app/clip-embeddings/index.tsx b/apps/text-embeddings/app/clip-embeddings/index.tsx
index 02a8a9c656..e0232d3440 100644
--- a/apps/text-embeddings/app/clip-embeddings/index.tsx
+++ b/apps/text-embeddings/app/clip-embeddings/index.tsx
@@ -16,6 +16,7 @@ import {
   models,
   useTextEmbeddings,
   useImageEmbeddings,
+  toVector,
   ImageEmbeddingsProps,
 } from 'react-native-executorch';
 
@@ -101,7 +102,7 @@ function ClipEmbeddingsScreen() {
       const txtStart = Date.now();
       const scored: { label: string; similarity: number }[] = [];
       for (const label of labels) {
-        const textEmbedding = await textModel.forward(label);
+        const textEmbedding = toVector(await textModel.forward(label));
         scored.push({
           label,
           similarity: dotProduct(imageEmbedding, textEmbedding),
diff --git a/apps/text-embeddings/app/colbert/index.tsx b/apps/text-embeddings/app/colbert/index.tsx
new file mode 100644
index 0000000000..d686168f43
--- /dev/null
+++ b/apps/text-embeddings/app/colbert/index.tsx
@@ -0,0 +1,289 @@
+import { useEffect, useState } from 'react';
+import {
+  StyleSheet,
+  Text,
+  TextInput,
+  TouchableOpacity,
+  View,
+  SafeAreaView,
+  ScrollView,
+  KeyboardAvoidingView,
+  Platform,
+} from 'react-native';
+import { Ionicons } from '@expo/vector-icons';
+import { useIsFocused } from 'expo-router';
+import {
+  models,
+  useTextEmbeddings,
+  maxSim,
+  EmbeddingResult,
+} from 'react-native-executorch';
+import ColorPalette from '../../colors';
+import ErrorBanner from '../../components/ErrorBanner';
+
+const colbertModel = models.text_embedding.lfm2_5_colbert_350m();
+
+// The library auto-applies the model's [Q]/[D] prompts via forward(text, role).
+// Late-interaction MaxSim is a shipped util; the document skiplist (punctuation
+// token ids excluded from scoring) is the consumer's choice — these are the
+// LFM2.5-ColBERT skiplist ids.
+const SKIPLIST = [
+  510, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524,
+  535, 536, 537, 538, 539, 540, 541, 568, 569, 570, 571, 572, 573, 600, 601,
+  602, 603,
+];
+
+const CORPUS: string[] = [
+  'The forecast says heavy showers this afternoon.',
+  "It's so sunny outside today!",
+  'The home team scored in the final minute to win the match.',
+  'Fans packed the stadium for the championship game.',
+  'Simmer the tomatoes with garlic before adding the pasta.',
+  'He whisked the eggs and folded in the melted chocolate.',
+  'The new phone has a faster chip and a brighter screen.',
+  'The flight to Tokyo was delayed by three hours.',
+  'We hiked along the coast and camped near the cliffs.',
+];
+
+const EXAMPLE_QUERIES: string[] = [
+  "What's the weather like?",
+  'Who won the match?',
+  'How do I cook dinner?',
+  'Tell me about the latest technology',
+];
+
+type Ranked = { sentence: string; score: number };
+
+export default function ColbertScreenWrapper() {
+  return useIsFocused() ? <ColbertScreen /> : null;
+}
+
+// ColBERT demo: encodes CORPUS when the model is ready, ranks it with maxSim.
+function ColbertScreen() {
+  const model = useTextEmbeddings({ model: colbertModel });
+  const [error, setError] = useState<string | null>(null);
+  const [query, setQuery] = useState('');
+  const [docEncs, setDocEncs] = useState<
+    { sentence: string; enc: EmbeddingResult }[]
+  >([]);
+  const [results, setResults] = useState<Ranked[]>([]);
+  const [indexing, setIndexing] = useState(false);
+  const [encodeTime, setEncodeTime] = useState<number | null>(null);
+  // Encode each corpus sentence as a 'document'; stale runs skip state updates.
+  useEffect(
+    () => {
+      let cancelled = false;
+      const indexCorpus = async () => {
+        if (!model.isReady) return;
+        setIndexing(true);
+        setResults([]);
+        try {
+          const encs = [];
+          for (const sentence of CORPUS) {
+            const enc = await model.forward(sentence, 'document');
+            if (cancelled) return;
+            encs.push({ sentence, enc });
+          }
+          setDocEncs(encs);
+        } catch (e) {
+          if (!cancelled) setError(e instanceof Error ? e.message : String(e));
+        } finally {
+          if (!cancelled) setIndexing(false);
+        }
+      };
+      indexCorpus();
+      return () => {
+        cancelled = true;
+      };
+    },
+    // eslint-disable-next-line react-hooks/exhaustive-deps
+    [model.isReady]
+  );
+
+  const runSearch = async (queryText: string = query) => {
+    const q = queryText.trim();
+    if (!model.isReady || !q || docEncs.length === 0) return;
+    setQuery(queryText);
+    try {
+      const start = Date.now();
+      const qEnc = await model.forward(q, 'query');
+      setEncodeTime(Date.now() - start);
+      const ranked = docEncs
+        .map(({ sentence, enc }) => ({
+          sentence,
+          score: maxSim(qEnc, enc, SKIPLIST),
+        }))
+        .sort((a, b) => b.score - a.score);
+      setResults(ranked);
+    } catch (e) {
+      setError(e instanceof Error ? e.message : String(e));
+    }
+  };
+
+  const ready = model.isReady && !indexing && docEncs.length > 0;
+  const canSearch = ready && !!query.trim();
+  const topScore = results[0]?.score ?? 0; // scales the result bars
+  const statusText = model.error
+    ? `Error: ${model.error}`
+    : !model.isReady
+      ? `Loading model ${(model.downloadProgress * 100).toFixed(0)}%`
+      : indexing
+        ? 'Indexing corpus…'
+        : 'Ready';
+
+  return (
+    <SafeAreaView style={styles.container}>
+      <KeyboardAvoidingView
+        style={styles.flex}
+        behavior={Platform.OS === 'ios' ? 'padding' : undefined}
+      >
+        <ScrollView contentContainerStyle={styles.scroll}>
+          <Text style={styles.heading}>ColBERT Late-Interaction Search</Text>
+          <Text style={styles.status}>{statusText}</Text>
+          <ErrorBanner message={error} onDismiss={() => setError(null)} />
+
+          <View style={styles.card}>
+            <Text style={styles.sectionTitle}>
+              Search the corpus ({CORPUS.length} sentences)
+            </Text>
+            <Text style={styles.hint}>
+              Per-token vectors scored with MaxSim. Tap an example or type a query.
+            </Text>
+            <View style={styles.chipRow}>
+              {EXAMPLE_QUERIES.map((q) => (
+                <TouchableOpacity
+                  key={q}
+                  style={[styles.chip, !ready && styles.chipDisabled]}
+                  disabled={!ready}
+                  onPress={() => runSearch(q)}
+                >
+                  <Text style={styles.chipText}>{q}</Text>
+                </TouchableOpacity>
+              ))}
+            </View>
+            <TextInput
+              placeholder="Type a search query..."
+              placeholderTextColor="#94A3B8"
+              style={styles.input}
+              value={query}
+              onChangeText={setQuery}
+              onSubmitEditing={() => runSearch()}
+              returnKeyType="search"
+            />
+            <TouchableOpacity
+              onPress={() => runSearch()}
+              style={[styles.button, !canSearch && styles.buttonDisabled]}
+              disabled={!canSearch}
+            >
+              <Ionicons
+                name="search"
+                size={16}
+                color={!canSearch ? 'gray' : 'white'}
+              />
+              <Text style={[styles.buttonText, !canSearch && styles.buttonTextDisabled]}>
+                {indexing ? 'Indexing…' : 'Search'}
+              </Text>
+            </TouchableOpacity>
+            {encodeTime !== null && (
+              <Text style={styles.stats}>Query encoded in {encodeTime} ms</Text>
+            )}
+          </View>
+
+          {results.length > 0 && (
+            <View style={styles.card}>
+              <Text style={styles.sectionTitle}>Results</Text>
+              {results.map((r, i) => (
+                <View key={i} style={styles.resultRow}>
+                  <View style={styles.resultHeader}>
+                    <Text style={styles.resultText}>{r.sentence}</Text>
+                    <Text style={styles.resultScore}>{r.score.toFixed(2)}</Text>
+                  </View>
+                  <View style={styles.barTrack}>
+                    <View
+                      style={[
+                        styles.barFill,
+                        {
+                          width: `${Math.round(
+                            (topScore > 0 ? Math.max(0, r.score / topScore) : 0) * 100
+                          )}%`,
+                        },
+                        i === 0 && styles.barFillTop,
+                      ]}
+                    />
+                  </View>
+                </View>
+              ))}
+            </View>
+          )}
+        </ScrollView>
+      </KeyboardAvoidingView>
+    </SafeAreaView>
+  );
+}
+
+const styles = StyleSheet.create({
+  container: { flex: 1, backgroundColor: '#F8FAFC' },
+  flex: { flex: 1 },
+  scroll: { padding: 20 },
+  heading: { fontSize: 22, fontWeight: '500', marginBottom: 8, color: '#0F172A' },
+  status: { fontSize: 14, color: '#64748B', marginBottom: 12 },
+  card: {
+    backgroundColor: '#fff',
+    padding: 16,
+    borderRadius: 16,
+    borderColor: '#E2E8F0',
+    borderWidth: 2,
+    marginBottom: 20,
+  },
+  sectionTitle: { fontSize: 16, fontWeight: '500', marginBottom: 8, color: '#1E293B' },
+  hint: { fontSize: 13, color: '#64748B', marginBottom: 12, lineHeight: 18 },
+  chipRow: { flexDirection: 'row', flexWrap: 'wrap', gap: 8, marginBottom: 12 },
+  chip: {
+    backgroundColor: '#EEF2FF',
+    borderColor: '#C7D2FE',
+    borderWidth: 1,
+    borderRadius: 16,
+    paddingHorizontal: 12,
+    paddingVertical: 6,
+  },
+  chipDisabled: { opacity: 0.4 },
+  chipText: { fontSize: 13, color: 'navy' },
+  input: {
+    backgroundColor: '#F1F5F9',
+    borderRadius: 10,
+    padding: 10,
+    marginBottom: 10,
+    fontSize: 16,
+    color: '#0F172A',
+    minHeight: 40,
+  },
+  button: {
+    backgroundColor: 'navy',
+    borderRadius: 10,
+    paddingVertical: 12,
+    flexDirection: 'row',
+    alignItems: 'center',
+    justifyContent: 'center',
+  },
+  buttonDisabled: { backgroundColor: '#f0f0f0' },
+  buttonText: { color: '#fff', fontWeight: '500', marginLeft: 6 },
+  buttonTextDisabled: { color: 'gray' },
+  stats: { fontSize: 13, color: '#64748B', marginTop: 8, textAlign: 'center' },
+  resultRow: { marginBottom: 14 },
+  resultHeader: {
+    flexDirection: 'row',
+    justifyContent: 'space-between',
+    marginBottom: 6,
+    gap: 8,
+  },
+  resultText: { flex: 1, fontSize: 14, color: '#334155' },
+  resultScore: {
+    fontSize: 14,
+    fontWeight: '600',
+    color: '#0F172A',
+    fontVariant: ['tabular-nums'],
+  },
+  barTrack: { height: 8, borderRadius: 4, backgroundColor: '#E2E8F0', overflow: 'hidden' },
+  barFill: { height: '100%', borderRadius: 4, backgroundColor: '#94A3B8' },
+  barFillTop: { backgroundColor: 'navy' },
+});
diff --git a/apps/text-embeddings/app/text-embeddings/index.tsx b/apps/text-embeddings/app/text-embeddings/index.tsx
index 88e39ce063..470094da02 100644
--- a/apps/text-embeddings/app/text-embeddings/index.tsx
+++ b/apps/text-embeddings/app/text-embeddings/index.tsx
@@ -15,10 +15,13 @@ import { ModelPicker } from '../../components/ModelPicker';
 import {
   models,
   useTextEmbeddings,
+  toVector,
   TextEmbeddingsProps,
 } from 'react-native-executorch';
 const textEmbedding = models.text_embedding;
 
+// Single-vector (pooled) models: forward() returns the raw result; toVector()
+// gives the single embedding. The multi-vector ColBERT model has its own screen.
 type TextEmbeddingModel = TextEmbeddingsProps['model'];
 
 const MODELS: { label: string; value: TextEmbeddingModel }[] = [
@@ -43,6 +46,42 @@ const MODELS: { label: string; value: TextEmbeddingModel }[] = [
     label: 'Multilingual Paraphrase',
     value: textEmbedding.paraphrase_multilingual_minilm_l12_v2(),
   },
+  {
+    label: 'LFM2.5 Embedding XNNPACK',
+    value: textEmbedding.lfm2_5_embedding_350m({ backend: 'xnnpack' }),
+  },
+  {
+    label: 'LFM2.5 Embedding MLX',
+    value: textEmbedding.lfm2_5_embedding_350m({ backend: 'mlx' }),
+  },
+];
+
+// A multi-topic corpus so semantic ranking is visible: a weather query should
+// float the weather lines to the top and push sports/cooking/tech down, even
+// with no shared keywords.
+const CORPUS: string[] = [
+  'The forecast says heavy showers this afternoon.',
+  "It's so sunny outside today!",
+  'A thick fog rolled in over the harbor at dawn.',
+  'The home team scored in the final minute to win the match.',
+  'She sprinted the last lap and broke the national record.',
+  'Fans packed the stadium for the championship game.',
+  'Simmer the tomatoes with garlic before adding the pasta.',
+  'He whisked the eggs and folded in the melted chocolate.',
+  'The new phone has a faster chip and a brighter screen.',
+  'Our servers crashed under the sudden spike in traffic.',
+  'The flight to Tokyo was delayed by three hours.',
+  'We hiked along the coast and camped near the cliffs.',
+];
+
+// Tap-to-run example queries. Natural-language questions — how these models
+// are trained to be queried — give the cleanest separation.
+const EXAMPLE_QUERIES: string[] = [
+  "What's the weather like?",
+  'Who won the match?',
+  'Tell me about the latest technology',
+  'How do I cook dinner?',
+  'Where did they travel?',
 ];
 import { useIsFocused } from 'expo-router';
 import { dotProduct } from '../../utils/math';
@@ -54,6 +93,8 @@ export default function TextEmbeddingsScreenWrapper() {
   return isFocused ? <TextEmbeddingsScreen /> : null;
 }
 
+type RankedResult = { sentence: string; similarity: number };
+
 function TextEmbeddingsScreen() {
   const [selectedModel, setSelectedModel] = useState<TextEmbeddingModel>(
     textEmbedding.all_minilm_l6_v2()
@@ -61,88 +102,70 @@ function TextEmbeddingsScreen() {
   const model = useTextEmbeddings({ model: selectedModel });
   const [error, setError] = useState<string | null>(null);
 
-  const [inputSentence, setInputSentence] = useState('');
-  const [sentencesWithEmbeddings, setSentencesWithEmbeddings] = useState<
+  const [query, setQuery] = useState('');
+  const [corpusEmbeddings, setCorpusEmbeddings] = useState<
     { sentence: string; embedding: Float32Array }[]
   >([]);
-  const [topMatches, setTopMatches] = useState<
-    { sentence: string; similarity: number }[]
-  >([]);
+  const [results, setResults] = useState<RankedResult[]>([]);
   const [embeddingTime, setEmbeddingTime] = useState<number | null>(null);
+  const [indexing, setIndexing] = useState(false);
 
+  // Embed the whole corpus once the model is ready (re-runs on model change so
+  // prefixes / weights match the active model).
   useEffect(
     () => {
-      const computeEmbeddings = async () => {
+      let cancelled = false;
+      const indexCorpus = async () => {
         if (!model.isReady) return;
-
-        const sentences = [
-          'The weather is lovely today.',
-          "It's so sunny outside!",
-          'He drove to the stadium.',
-        ];
-
+        setIndexing(true);
+        setResults([]);
         try {
-          const embeddings = [];
-          for (const sentence of sentences) {
-            const embedding = await model.forward(sentence);
-            embeddings.push({ sentence, embedding });
+          const embedded = [];
+          for (const sentence of CORPUS) {
+            // forward(_, 'document') auto-applies the model's document prompt
+            // (a no-op for models without one).
+            const embedding = toVector(
+              await model.forward(sentence, 'document')
+            );
+            if (cancelled) return;
+            embedded.push({ sentence, embedding });
           }
-
-          setSentencesWithEmbeddings(embeddings);
-        } catch (e) {
-          setError(e instanceof Error ? e.message : String(e));
+          setCorpusEmbeddings(embedded);
+        } catch {
+          // A transient "Model not loaded" can fire while the hook swaps
+          // models; the effect re-runs once the new model is ready.
+        } finally {
+          if (!cancelled) setIndexing(false);
         }
       };
-
-      computeEmbeddings();
+      indexCorpus();
+      return () => {
+        cancelled = true;
+      };
     },
+    // Re-index when the model becomes ready OR the selected model changes, so
+    // the corpus is embedded by the active model. The "Model not loaded" race
+    // is handled by the isReady gate plus clearing the corpus on switch;
+    // switching sets isReady false→true so the re-run sees the new model.
     // eslint-disable-next-line react-hooks/exhaustive-deps
-    [model.isReady]
+    [model.isReady, selectedModel]
   );
 
-  const checkSimilarities = async () => {
-    if (!model.isReady || !inputSentence.trim()) return;
-
+  const runSearch = async (queryText: string = query) => {
+    const q = queryText.trim();
+    if (!model.isReady || !q || corpusEmbeddings.length === 0) return;
+    setQuery(queryText);
     try {
       const start = Date.now();
-      const inputEmbedding = await model.forward(inputSentence);
+      const queryEmbedding = toVector(await model.forward(q, 'query'));
       setEmbeddingTime(Date.now() - start);
-      const matches = sentencesWithEmbeddings.map(
-        ({ sentence, embedding }) => ({
+      const ranked = corpusEmbeddings
+        .map(({ sentence, embedding }) => ({
           sentence,
-          similarity: dotProduct(inputEmbedding, embedding),
-        })
-      );
-      matches.sort((a, b) => b.similarity - a.similarity);
-      setTopMatches(matches.slice(0, 3));
-    } catch (e) {
-      setError(e instanceof Error ? e.message : String(e));
-    }
-  };
-
-  const addToSentences = async () => {
-    if (!model.isReady || !inputSentence.trim()) return;
-
-    try {
-      const start = Date.now();
-      const embedding = await model.forward(inputSentence);
-      setEmbeddingTime(Date.now() - start);
-      setSentencesWithEmbeddings((prev) => [
-        ...prev,
-        { sentence: inputSentence, embedding },
-      ]);
-    } catch (e) {
-      setError(e instanceof Error ? e.message : String(e));
-    }
-
-    setInputSentence('');
-    setTopMatches([]);
-  };
-
-  const clearList = async () => {
-    if (!model.isReady) return;
-    try {
-      setSentencesWithEmbeddings([]);
+          similarity: dotProduct(queryEmbedding, embedding),
+        }))
+        .sort((a, b) => b.similarity - a.similarity);
+      setResults(ranked);
     } catch (e) {
       setError(e instanceof Error ? e.message : String(e));
     }
@@ -158,6 +181,11 @@ function TextEmbeddingsScreen() {
     return model.isGenerating ? 'Generating...' : 'Model is ready';
   };
 
+  // Chips/examples just need a ready, indexed model; the Search button also
+  // needs a non-empty typed query.
+  const ready = model.isReady && !indexing && corpusEmbeddings.length > 0;
+  const canSearch = ready && !!query.trim();
+
   return (
     <SafeAreaView style={styles.container}>
       <KeyboardAvoidingView
@@ -165,133 +193,131 @@ function TextEmbeddingsScreen() {
         behavior={Platform.OS === 'ios' ? 'padding' : undefined}
       >
         <ScrollView contentContainerStyle={styles.scrollContainer}>
-          <Text style={styles.heading}>Text Embeddings Playground</Text>
+          <Text style={styles.heading}>Semantic Search</Text>
           <Text style={styles.sectionTitle}>{getModelStatusText()}</Text>
           <ModelPicker
             models={MODELS}
             selectedModel={selectedModel}
             onSelect={(m) => {
               setSelectedModel(m);
-              setSentencesWithEmbeddings([]);
-              setTopMatches([]);
+              setCorpusEmbeddings([]);
+              setResults([]);
+              setQuery('');
             }}
           />
           <ErrorBanner message={error} onDismiss={() => setError(null)} />
 
           <View style={styles.card}>
-            <Text style={styles.sectionTitle}>List of Existing Sentences</Text>
-            {sentencesWithEmbeddings.map((item, index) => (
-              <Text key={index} style={styles.sentenceText}>
-                - {item.sentence}
-              </Text>
-            ))}
-          </View>
-          <View style={styles.card}>
-            <Text style={styles.sectionTitle}>Try Your Sentence</Text>
+            <Text style={styles.sectionTitle}>
+              Search the corpus ({CORPUS.length} sentences)
+            </Text>
+            <Text style={styles.hint}>
+              Ranks every sentence by meaning. Ask a full question — tap an
+              example or type your own.
+            </Text>
+            <View style={styles.chipRow}>
+              {EXAMPLE_QUERIES.map((q) => (
+                <TouchableOpacity
+                  key={q}
+                  style={[styles.chip, !ready && styles.chipDisabled]}
+                  disabled={!ready}
+                  onPress={() => runSearch(q)}
+                >
+                  <Text style={styles.chipText}>{q}</Text>
+                </TouchableOpacity>
+              ))}
+            </View>
             <TextInput
-              placeholder="Type your sentence here..."
+              placeholder="Type a search query..."
               placeholderTextColor="#94A3B8"
               style={styles.input}
-              value={inputSentence}
-              onChangeText={setInputSentence}
-              multiline
+              value={query}
+              onChangeText={setQuery}
+              onSubmitEditing={() => runSearch()}
+              returnKeyType="search"
             />
-            <View style={styles.buttonContainer}>
-              <TouchableOpacity
-                onPress={checkSimilarities}
+            <TouchableOpacity
+              onPress={() => runSearch()}
+              style={[
+                styles.buttonPrimary,
+                !canSearch && styles.buttonDisabled,
+              ]}
+              disabled={!canSearch}
+            >
+              <Ionicons
+                name="search"
+                size={16}
+                color={!canSearch ? 'gray' : 'white'}
+              />
+              <Text
                 style={[
-                  styles.buttonPrimary,
-                  !inputSentence && styles.buttonDisabled,
+                  styles.buttonText,
+                  !canSearch && styles.buttonTextDisabled,
                 ]}
-                disabled={!inputSentence}
               >
-                <Ionicons
-                  name="search"
-                  size={16}
-                  color={!inputSentence ? 'gray' : 'white'}
-                />
-                <Text
-                  style={[
-                    styles.buttonText,
-                    !inputSentence && styles.buttonTextDisabled,
-                  ]}
-                >
-                  Find Similar
-                </Text>
-              </TouchableOpacity>
-              <View style={styles.buttonGroup}>
-                <TouchableOpacity
-                  onPress={addToSentences}
-                  style={[
-                    styles.buttonSecondary,
-                    !inputSentence && styles.buttonDisabled,
-                  ]}
-                  disabled={!inputSentence}
-                >
-                  <Ionicons
-                    name="add-circle-outline"
-                    size={16}
-                    color={!inputSentence ? 'gray' : 'navy'}
-                  />
-                  <Text
-                    style={[
-                      styles.buttonTextOutline,
-                      !inputSentence && styles.buttonTextDisabled,
-                    ]}
-                  >
-                    Add to List
-                  </Text>
-                </TouchableOpacity>
-                <TouchableOpacity
-                  onPress={clearList}
-                  style={[
-                    styles.buttonSecondary,
-                    sentencesWithEmbeddings.length === 0 &&
-                      styles.buttonDisabled,
-                  ]}
-                  disabled={sentencesWithEmbeddings.length === 0}
-                >
-                  <Ionicons
-                    name="close-outline"
-                    size={16}
-                    color={
-                      sentencesWithEmbeddings.length === 0 ? 'gray' : 'navy'
-                    }
-                  />
-                  <Text
-                    style={[
-                      styles.buttonTextOutline,
-                      sentencesWithEmbeddings.length === 0 &&
-                        styles.buttonTextDisabled,
-                    ]}
-                  >
-                    Clear List
-                  </Text>
-                </TouchableOpacity>
-              </View>
-            </View>
+                {indexing ? 'Indexing corpus…' : 'Search'}
+              </Text>
+            </TouchableOpacity>
             {embeddingTime !== null && (
               <Text style={styles.statsText}>
-                Embedding time: {embeddingTime} ms
+                Query embedded in {embeddingTime} ms
               </Text>
             )}
-            {topMatches.length > 0 && (
-              <View style={styles.topMatchesContainer}>
-                <Text style={styles.sectionTitle}>Top Matches</Text>
-                {topMatches.map((item, index) => (
-                  <Text key={index} style={styles.sentenceText}>
-                    {item.sentence} ({item.similarity.toFixed(2)})
-                  </Text>
-                ))}
-              </View>
-            )}
           </View>
+
+          {results.length > 0 && (
+            <View style={styles.card}>
+              <Text style={styles.sectionTitle}>Results</Text>
+              {results.map((item, index) => (
+                <ResultRow
+                  key={index}
+                  sentence={item.sentence}
+                  similarity={item.similarity}
+                  best={results[0].similarity}
+                  rank={index}
+                />
+              ))}
+            </View>
+          )}
         </ScrollView>
       </KeyboardAvoidingView>
     </SafeAreaView>
   );
 }
 
+// One ranked result with a similarity bar, scaled relative to the top hit so
+// the ranking is visually obvious; the raw dot-product score is shown too.
+function ResultRow({
+  sentence,
+  similarity,
+  best,
+  rank,
+}: {
+  sentence: string;
+  similarity: number;
+  best: number;
+  rank: number;
+}) {
+  const fraction = best > 0 ? Math.max(0, similarity / best) : 0;
+  return (
+    <View style={styles.resultRow}>
+      <View style={styles.resultHeader}>
+        <Text style={styles.resultText}>{sentence}</Text>
+        <Text style={styles.resultScore}>{similarity.toFixed(2)}</Text>
+      </View>
+      <View style={styles.barTrack}>
+        <View
+          style={[
+            styles.barFill,
+            { width: `${Math.round(fraction * 100)}%` },
+            rank === 0 && styles.barFillTop,
+          ]}
+        />
+      </View>
+    </View>
+  );
+}
+
 const styles = StyleSheet.create({
   container: {
     flex: 1,
@@ -323,11 +349,68 @@ const styles = StyleSheet.create({
     marginBottom: 12,
     color: '#1E293B',
   },
-  sentenceText: {
-    fontSize: 14,
+  hint: {
+    fontSize: 13,
+    color: '#64748B',
+    marginBottom: 12,
+    lineHeight: 18,
+  },
+  chipRow: {
+    flexDirection: 'row',
+    flexWrap: 'wrap',
+    gap: 8,
+    marginBottom: 12,
+  },
+  chip: {
+    backgroundColor: '#EEF2FF',
+    borderColor: '#C7D2FE',
+    borderWidth: 1,
+    borderRadius: 16,
+    paddingHorizontal: 12,
+    paddingVertical: 6,
+  },
+  chipDisabled: {
+    opacity: 0.4,
+  },
+  chipText: {
+    fontSize: 13,
+    color: 'navy',
+  },
+  resultRow: {
+    marginBottom: 14,
+  },
+  resultHeader: {
+    flexDirection: 'row',
+    justifyContent: 'space-between',
+    alignItems: 'flex-start',
     marginBottom: 6,
+    gap: 8,
+  },
+  resultText: {
+    flex: 1,
+    fontSize: 14,
     color: '#334155',
   },
+  resultScore: {
+    fontSize: 14,
+    fontWeight: '600',
+    color: '#0F172A',
+    fontVariant: ['tabular-nums'],
+  },
+  barTrack: {
+    height: 8,
+    borderRadius: 4,
+    backgroundColor: '#E2E8F0',
+    overflow: 'hidden',
+  },
+  barFill: {
+    height: '100%',
+    borderRadius: 4,
+    backgroundColor: '#94A3B8',
+  },
+  barFillTop: {
+    backgroundColor: 'navy',
+  },
   input: {
     backgroundColor: '#F1F5F9',
     borderRadius: 10,
@@ -338,17 +421,8 @@ const styles = StyleSheet.create({
     minHeight: 40,
     textAlignVertical: 'top',
   },
-  buttonContainer: {
-    width: '100%',
-    gap: 10,
-  },
-  buttonGroup: {
-    flexDirection: 'row',
-    justifyContent: 'space-between',
-    gap: 10,
-  },
   buttonPrimary: {
-    flex: 1,
+    width: '100%',
     backgroundColor: 'navy',
     padding: 12,
     borderRadius: 10,
@@ -356,17 +430,6 @@ const styles = StyleSheet.create({
     alignItems: 'center',
     justifyContent: 'center',
   },
-  buttonSecondary: {
-    flex: 1,
-    backgroundColor: 'transparent',
-    borderWidth: 2,
-    borderColor: 'navy',
-    padding: 12,
-    borderRadius: 10,
-    flexDirection: 'row',
-    alignItems: 'center',
-    justifyContent: 'center',
-  },
   buttonDisabled: {
     backgroundColor: '#f0f0f0',
     borderColor: '#d3d3d3',
@@ -376,17 +439,9 @@ const styles = StyleSheet.create({
     textAlign: 'center',
     fontWeight: '500',
   },
-  buttonTextOutline: {
-    color: 'navy',
-    textAlign: 'center',
-    fontWeight: '500',
-  },
   buttonTextDisabled: {
     color: 'gray',
   },
-  topMatchesContainer: {
-    marginTop: 20,
-  },
   statsText: {
     fontSize: 13,
     color: '#64748B',
diff --git a/packages/react-native-executorch/common/rnexecutorch/TokenizerModule.cpp b/packages/react-native-executorch/common/rnexecutorch/TokenizerModule.cpp
index 76e0fb90c7..3315baa2dd 100644
--- a/packages/react-native-executorch/common/rnexecutorch/TokenizerModule.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/TokenizerModule.cpp
@@ -46,6 +46,25 @@ std::vector<uint64_t> TokenizerModule::encode(std::string s) const {
   return encodeResult.get();
 }
 
+std::vector<uint64_t>
+TokenizerModule::encodeWithSpecialTokens(std::string s) const {
+  if (!tokenizer) {
+    THROW_NOT_LOADED_ERROR();
+  }
+
+  // Passing non-zero bos/eos makes HFTokenizer run the tokenizer.json
+  // post_processor with add_special_token=true (the underlying encode treats
+  // these as a flag, not a literal count, when a post_processor is defined).
+  auto encodeResult = tokenizer->encode(s, /*bos=*/1, /*eos=*/1);
+  if (!encodeResult.ok()) {
+    throw RnExecutorchError(
+        RnExecutorchErrorCode::TokenizerError,
+        "Unexpected issue occurred while encoding: " +
+            std::to_string(static_cast<int32_t>(encodeResult.error())));
+  }
+  return encodeResult.get();
+}
+
 std::string TokenizerModule::decode(std::vector<uint64_t> vec,
                                     bool skipSpecialTokens) const {
   if (!tokenizer) {
diff --git a/packages/react-native-executorch/common/rnexecutorch/TokenizerModule.h b/packages/react-native-executorch/common/rnexecutorch/TokenizerModule.h
index 3c90b25557..a511340af6 100644
--- a/packages/react-native-executorch/common/rnexecutorch/TokenizerModule.h
+++ b/packages/react-native-executorch/common/rnexecutorch/TokenizerModule.h
@@ -13,6 +13,12 @@ class TokenizerModule {
                            std::shared_ptr<react::CallInvoker> callInvoker);
   [[nodiscard("Registered non-void function")]] std::vector<uint64_t>
   encode(std::string s) const;
+  // Like encode(), but also runs the tokenizer.json post_processor (e.g. a
+  // TemplateProcessing step that prepends BOS). Needed by models whose
+  // pooling reads the BOS/CLS token (e.g. CLS-pooled text embeddings). Not
+  // exposed to JS; encode() keeps its single-arg signature for the JS API.
+  [[nodiscard("Registered non-void function")]] std::vector<uint64_t>
+  encodeWithSpecialTokens(std::string s) const;
   [[nodiscard("Registered non-void function")]] std::string
   decode(std::vector<uint64_t> vec, bool skipSpecialTokens) const;
   [[nodiscard("Registered non-void function")]] std::string
diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h
index e4209b2f79..8e211f0028 100644
--- a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h
+++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h
@@ -17,6 +17,7 @@
 #include <rnexecutorch/jsi/OwningArrayBuffer.h>
 
 #include <rnexecutorch/metaprogramming/TypeConcepts.h>
+#include <rnexecutorch/models/embeddings/Types.h>
 #include <rnexecutorch/models/instance_segmentation/Types.h>
 #include <rnexecutorch/models/llm/Types.h>
 #include <rnexecutorch/models/object_detection/Constants.h>
@@ -707,6 +708,35 @@ getJsiValue(const models::style_transfer::PixelDataResult &result,
   return obj;
 }
 
+// Text embedding output: a [numTokens, embeddingDim] fp32 matrix + input token
+// ids. Pooled models give numTokens == 1; multi-vector give the full sequence.
+// The TS layer reduces to a single vector or keeps the matrix per model config.
+inline jsi::Value
+getJsiValue(const models::embeddings::EmbeddingResult &result,
+            jsi::Runtime &runtime) {
+  jsi::Object obj(runtime);
+
+  auto arrayBuffer = jsi::ArrayBuffer(runtime, result.dataPtr);
+  auto float32ArrayCtor =
+      runtime.global().getPropertyAsFunction(runtime, "Float32Array");
+  auto float32Array =
+      float32ArrayCtor.callAsConstructor(runtime, arrayBuffer)
+          .getObject(runtime);
+  obj.setProperty(runtime, "dataPtr", float32Array);
+
+  obj.setProperty(runtime, "numTokens", jsi::Value(result.numTokens));
+  obj.setProperty(runtime, "embeddingDim", jsi::Value(result.embeddingDim));
+
+  auto idsArray = jsi::Array(runtime, result.tokenIds.size());
+  for (size_t i = 0; i < result.tokenIds.size(); ++i) {
+    idsArray.setValueAtIndex(
+        runtime, i, jsi::Value(static_cast<double>(result.tokenIds[i])));
+  }
+  obj.setProperty(runtime, "tokenIds", idsArray);
+
+  return obj;
+}
+
 inline jsi::Value getJsiValue(
     const rnexecutorch::models::semantic_segmentation::SegmentationResult
         &result,
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/embeddings/Types.h b/packages/react-native-executorch/common/rnexecutorch/models/embeddings/Types.h
new file mode 100644
index 0000000000..f2de1e899a
--- /dev/null
+++ b/packages/react-native-executorch/common/rnexecutorch/models/embeddings/Types.h
@@ -0,0 +1,23 @@
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <rnexecutorch/jsi/OwningArrayBuffer.h>
+#include <vector>
+
+namespace rnexecutorch::models::embeddings {
+
+// Text embedding output as a [numTokens, embeddingDim] fp32 matrix. Pooled
+// single-vector models output numTokens == 1 (the exported graph pools + L2-
+// normalizes); multi-vector (late-interaction / ColBERT) models output
+// numTokens == sequence length. The TS layer reduces to a single vector or
+// keeps the per-token matrix based on the model's config. `tokenIds` are the
+// input ids (used JS-side for late-interaction skiplist masking).
+struct EmbeddingResult {
+  std::shared_ptr<OwningArrayBuffer> dataPtr;
+  int32_t numTokens = 0;    // rows; zero-initialized, never indeterminate
+  int32_t embeddingDim = 0; // columns; zero-initialized, never indeterminate
+  std::vector<int64_t> tokenIds;
+};
+
+} // namespace rnexecutorch::models::embeddings
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/embeddings/text/TextEmbeddings.cpp b/packages/react-native-executorch/common/rnexecutorch/models/embeddings/text/TextEmbeddings.cpp
index ba2c3243b2..d673f0ac87 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/embeddings/text/TextEmbeddings.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/embeddings/text/TextEmbeddings.cpp
@@ -16,7 +16,10 @@ TextEmbeddings::TextEmbeddings(const std::string &modelSource,
           std::make_unique<TokenizerModule>(tokenizerSource, callInvoker)) {}
 
 TokenIdsWithAttentionMask TextEmbeddings::preprocess(const std::string &input) {
-  auto inputIds = tokenizer->encode(input);
+  // Apply the tokenizer's post_processor so declared special tokens (e.g. a
+  // BOS prepended via TemplateProcessing) are added. CLS-pooled embedding
+  // models read position 0, so a missing BOS corrupts the pooled vector.
+  auto inputIds = tokenizer->encodeWithSpecialTokens(input);
   // Tokenizers-cpp return tokens as int32, but text embedding models require
   // int64 as input
   std::vector<int64_t> inputIds64;
@@ -40,8 +43,7 @@ void TextEmbeddings::unload() noexcept {
   BaseModel::unload();
 }
 
-std::shared_ptr<OwningArrayBuffer>
-TextEmbeddings::generate(const std::string input) {
+EmbeddingResult TextEmbeddings::generate(const std::string input) {
   std::scoped_lock lock(inference_mutex_);
   auto preprocessed = preprocess(input);
 
@@ -58,7 +60,19 @@ TextEmbeddings::generate(const std::string input) {
   auto forwardResult = BaseModel::forward({tokenIds, attnMask});
   CHECK_OK_OR_THROW_FORWARD_ERROR(forwardResult);
 
-  return BaseEmbeddings::postprocess(forwardResult);
+  // Output is [1, numTokens, embeddingDim] (numTokens == 1 for pooled models,
+  // == sequence length for multi-vector models). Return the raw matrix + the
+  // input ids; the TS layer reduces to a single vector or keeps the matrix.
+  auto out = forwardResult->at(0).toTensor();
+  auto sizes = out.sizes();
+  const auto rank = sizes.size(); // guard: rank < 2 would underflow rank - 2
+  EmbeddingResult result;
+  result.dataPtr = std::make_shared<OwningArrayBuffer>(out.const_data_ptr(),
+                                                       out.nbytes());
+  result.numTokens = rank >= 2 ? static_cast<int32_t>(sizes[rank - 2]) : 1;
+  result.embeddingDim = rank >= 1 ? static_cast<int32_t>(sizes[rank - 1]) : 0;
+  result.tokenIds = std::move(preprocessed.inputIds);
+  return result;
 }
 
 } // namespace rnexecutorch::models::embeddings
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/embeddings/text/TextEmbeddings.h b/packages/react-native-executorch/common/rnexecutorch/models/embeddings/text/TextEmbeddings.h
index 93d0988c04..cb6059b96e 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/embeddings/text/TextEmbeddings.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/embeddings/text/TextEmbeddings.h
@@ -4,6 +4,7 @@
 #include <mutex>
 #include <rnexecutorch/TokenizerModule.h>
 #include <rnexecutorch/models/embeddings/BaseEmbeddings.h>
+#include <rnexecutorch/models/embeddings/Types.h>
 
 namespace rnexecutorch {
 namespace models::embeddings {
@@ -18,8 +19,11 @@ class TextEmbeddings final : public BaseEmbeddings {
   TextEmbeddings(const std::string &modelSource,
                  const std::string &tokenizerSource,
                  std::shared_ptr<react::CallInvoker> callInvoker);
-  [[nodiscard(
-      "Registered non-void function")]] std::shared_ptr<OwningArrayBuffer>
+  // Returns the raw [numTokens, embeddingDim] output. Pooled models give
+  // numTokens == 1; multi-vector (late-interaction) models give the full
+  // sequence. The TS layer reduces to a single vector or keeps the matrix
+  // based on the model's config.
+  [[nodiscard("Registered non-void function")]] EmbeddingResult
   generate(const std::string input);
   void unload() noexcept;
 
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_image/Encoder.cpp b/packages/react-native-executorch/common/rnexecutorch/models/text_to_image/Encoder.cpp
index 68a9a9fef4..6abbccb9c6 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/text_to_image/Encoder.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_image/Encoder.cpp
@@ -16,9 +16,12 @@ Encoder::Encoder(const std::string &tokenizerSource,
           encoderSource, tokenizerSource, callInvoker)) {}
 
 std::vector<float> Encoder::generate(std::string input) {
-  std::shared_ptr<OwningArrayBuffer> embeddingsText = encoder->generate(input);
+  // TextEmbeddings returns the raw [numTokens, embeddingDim] matrix; this
+  // encoder pools/uses the flat fp32 buffer directly (dataPtr).
+  std::shared_ptr<OwningArrayBuffer> embeddingsText =
+      encoder->generate(input).dataPtr;
   std::shared_ptr<OwningArrayBuffer> embeddingsUncond =
-      encoder->generate(std::string(constants::kBosToken));
+      encoder->generate(std::string(constants::kBosToken)).dataPtr;
 
   assert(embeddingsText->size() == embeddingsUncond->size());
   size_t embeddingsSize = embeddingsText->size() / sizeof(float);
diff --git a/packages/react-native-executorch/src/constants/modelRegistry.ts b/packages/react-native-executorch/src/constants/modelRegistry.ts
index eb0c98dae7..cb06ccb308 100644
--- a/packages/react-native-executorch/src/constants/modelRegistry.ts
+++ b/packages/react-native-executorch/src/constants/modelRegistry.ts
@@ -198,6 +198,7 @@ function pair<D extends { modelName: string }, Q extends { modelName: string }>(
   return variant({ xnnpack: { base: baseC, quant: quantC } });
 }
 
+
 // TTS presets bundle model + voice + phonemizer in a single config; they
 // don't share the `{ modelName: string }` shape of the rest of the registry,
 // and have no quant/backend axis. Expose them as a plain `() => Config`
@@ -260,6 +261,52 @@ const GEMMA4_E2B_MM_VARIANTS = {
   },
 };
 
+// Asymmetric query/document prompts the LFM models are trained with.
+// forward(text, role) auto-prepends these.
+const LFM_EMBEDDING_PROMPTS = { query: 'query: ', document: 'document: ' };
+const LFM_COLBERT_PROMPTS = { query: '[Q] ', document: '[D] ' };
+
+const LFM2_5_EMBEDDING_350M_VARIANTS = {
+  mlx: {
+    base: {
+      modelName: 'lfm2-5-embedding-350m' as const,
+      modelSource: M.LFM2_5_EMBEDDING_350M_MLX_MODEL,
+      tokenizerSource: M.LFM2_5_EMBEDDING_350M_TOKENIZER,
+      prompts: LFM_EMBEDDING_PROMPTS,
+    },
+  },
+  xnnpack: {
+    base: {
+      modelName: 'lfm2-5-embedding-350m' as const,
+      modelSource: M.LFM2_5_EMBEDDING_350M_XNNPACK_MODEL,
+      tokenizerSource: M.LFM2_5_EMBEDDING_350M_TOKENIZER,
+      prompts: LFM_EMBEDDING_PROMPTS,
+    },
+  },
+};
+
+// LFM2.5-ColBERT is a plain text-embedding model from the library's POV: it
+// returns per-token vectors; forward() only auto-applies the role prompts.
+// Late-interaction scoring happens in JS via the exported `maxSim` util.
+const LFM2_5_COLBERT_350M_VARIANTS = {
+  mlx: {
+    base: {
+      modelName: 'lfm2-5-colbert-350m' as const,
+      modelSource: M.LFM2_5_COLBERT_350M_MLX_MODEL,
+      tokenizerSource: M.LFM2_5_COLBERT_350M_TOKENIZER,
+      prompts: LFM_COLBERT_PROMPTS,
+    },
+  },
+  xnnpack: {
+    base: {
+      modelName: 'lfm2-5-colbert-350m' as const,
+      modelSource: M.LFM2_5_COLBERT_350M_XNNPACK_MODEL,
+      tokenizerSource: M.LFM2_5_COLBERT_350M_TOKENIZER,
+      prompts: LFM_COLBERT_PROMPTS,
+    },
+  },
+};
+
 const EFFICIENTNET_V2_S_VARIANTS = {
   xnnpack: {
     base: {
@@ -742,6 +789,17 @@ export const models = {
       M.PARAPHRASE_MULTILINGUAL_MINILM_L12_V2_QUANTIZED
     ),
     clip_vit_base_patch32_text: base(M.CLIP_VIT_BASE_PATCH32_TEXT),
+    lfm2_5_embedding_350m: variant(LFM2_5_EMBEDDING_350M_VARIANTS, {
+      ios: 'mlx',
+      android: 'xnnpack',
+    }),
+    // ColBERT (late-interaction): forward() returns per-token vectors. Score
+    // query/document pairs with the exported `maxSim` util (skiplist-aware);
+    // the colbert example screen demonstrates a MaxSim search.
+    lfm2_5_colbert_350m: variant(LFM2_5_COLBERT_350M_VARIANTS, {
+      ios: 'mlx',
+      android: 'xnnpack',
+    }),
   },
   image_embedding: {
     clip_vit_base_patch32_image: pair(
diff --git a/packages/react-native-executorch/src/constants/modelUrls.ts b/packages/react-native-executorch/src/constants/modelUrls.ts
index 0e36f812ff..7c4b73483c 100644
--- a/packages/react-native-executorch/src/constants/modelUrls.ts
+++ b/packages/react-native-executorch/src/constants/modelUrls.ts
@@ -1197,6 +1197,21 @@ const PARAPHRASE_MULTILINGUAL_MINILM_L12_V2_QUANTIZED_MODEL = `${URL_PREFIX}-par
 const PARAPHRASE_MULTILINGUAL_MINILM_L12_V2_TOKENIZER = `${URL_PREFIX}-paraphrase-multilingual-MiniLM-L12-v2/${PREVIOUS_VERSION_TAG}/tokenizer.json`;
 const CLIP_VIT_BASE_PATCH32_TEXT_MODEL = `${URL_PREFIX}-clip-vit-base-patch32/${PREVIOUS_VERSION_TAG}/xnnpack/clip_vit_base_patch32_text_xnnpack_fp32.pte`;
 const CLIP_VIT_BASE_PATCH32_TEXT_TOKENIZER = `${URL_PREFIX}-clip-vit-base-patch32/${PREVIOUS_VERSION_TAG}/tokenizer.json`;
+// LFM2.5-Embedding-350M: XNNPACK 8da4w (Android/CPU), MLX int4 bf16 (iOS GPU,
+// physical device only). The exported graph bakes in CLS pooling + L2 norm.
+// Requires the runner to add the BOS special token (CLS-pooled at index 0).
+export const LFM2_5_EMBEDDING_350M_XNNPACK_MODEL = `${URL_PREFIX}-lfm2.5-embedding-350m/${PREVIOUS_VERSION_TAG}/xnnpack/lfm_2_5_embedding_350m_xnnpack_8da4w.pte`;
+export const LFM2_5_EMBEDDING_350M_MLX_MODEL = `${URL_PREFIX}-lfm2.5-embedding-350m/${PREVIOUS_VERSION_TAG}/mlx/lfm_2_5_embedding_350m_mlx_int4.pte`;
+export const LFM2_5_EMBEDDING_350M_TOKENIZER = `${URL_PREFIX}-lfm2.5-embedding-350m/${PREVIOUS_VERSION_TAG}/tokenizer.json`;
+// LFM2.5-ColBERT-350M: late-interaction multi-vector retriever (per-token
+// [S,128]). Same bidirectional backbone as the embedding model + a Linear
+// 1024->128 head. forward() returns per-token vectors; score them with the
+// exported `maxSim` util (the colbert example demonstrates it).
+// NOTE: pinned to `resolve/main` for testing — the v0.9.0 tag does not exist
+// on this repo yet. Switch to `${PREVIOUS_VERSION_TAG}` once the tag is cut.
+export const LFM2_5_COLBERT_350M_XNNPACK_MODEL = `${URL_PREFIX}-lfm2.5-colbert-350m/resolve/main/xnnpack/lfm_2_5_colbert_350m_xnnpack_8da4w.pte`;
+export const LFM2_5_COLBERT_350M_MLX_MODEL = `${URL_PREFIX}-lfm2.5-colbert-350m/resolve/main/mlx/lfm_2_5_colbert_350m_mlx_int4.pte`;
+export const LFM2_5_COLBERT_350M_TOKENIZER = `${URL_PREFIX}-lfm2.5-colbert-350m/resolve/main/tokenizer.json`;
 
 /**
  * @category Models - Text Embeddings
diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useTextEmbeddings.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useTextEmbeddings.ts
index 31ee179925..b4679b4237 100644
--- a/packages/react-native-executorch/src/hooks/natural_language_processing/useTextEmbeddings.ts
+++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useTextEmbeddings.ts
@@ -1,20 +1,25 @@
 import { TextEmbeddingsModule } from '../../modules/natural_language_processing/TextEmbeddingsModule';
 import { useModuleFactory } from '../useModuleFactory';
 import {
+  AnyTextEmbeddingsModel,
+  EmbeddingRole,
+  ForwardFn,
   TextEmbeddingsType,
   TextEmbeddingsProps,
 } from '../../types/textEmbeddings';
 
 /**
- * React hook for managing a Text Embeddings model instance.
+ * React hook for a Text Embeddings model.
  * @category Hooks
- * @param TextEmbeddingsProps - Configuration object containing `model` source and optional `preventLoad` flag.
- * @returns Ready to use Text Embeddings model.
+ * @param TextEmbeddingsProps - `model` source + optional `preventLoad`.
+ * @returns Ready to use embeddings model. `forward` returns the raw
+ *   [numTokens, embeddingDim] result; use `toVector` for a single vector.
+ *   Models with prompts require a `role` ('query' | 'document') on `forward`.
  */
-export const useTextEmbeddings = ({
+export const useTextEmbeddings = <M extends AnyTextEmbeddingsModel>({
   model,
   preventLoad = false,
-}: TextEmbeddingsProps): TextEmbeddingsType => {
+}: TextEmbeddingsProps<M>): TextEmbeddingsType<M> => {
   const { error, isReady, isGenerating, downloadProgress, runForward } =
     useModuleFactory({
       factory: (config, onProgress) =>
@@ -24,7 +29,8 @@ export const useTextEmbeddings = ({
       preventLoad,
     });
 
-  const forward = (input: string) => runForward((inst) => inst.forward(input));
+  const forward = ((input: string, role?: EmbeddingRole) =>
+    runForward((inst) => inst.forward(input, role))) as ForwardFn<M>;
 
   return { error, isReady, isGenerating, downloadProgress, forward };
 };
diff --git a/packages/react-native-executorch/src/index.ts b/packages/react-native-executorch/src/index.ts
index 1f190d41f5..34cdf97d8d 100644
--- a/packages/react-native-executorch/src/index.ts
+++ b/packages/react-native-executorch/src/index.ts
@@ -212,6 +212,7 @@ export * from './utils/ResourceFetcher';
 export * from './utils/ResourceFetcherUtils';
 export * from './utils/BaseResourceFetcherClass';
 export * from './utils/llm';
+export * from './utils/textEmbeddings';
 export * from './common/Logger';
 export * from './utils/llms/context_strategy';
 export * from './utils/segmentAnythingPrompts';
diff --git a/packages/react-native-executorch/src/modules/natural_language_processing/TextEmbeddingsModule.ts b/packages/react-native-executorch/src/modules/natural_language_processing/TextEmbeddingsModule.ts
index 27b0e59ceb..d9ab4f45da 100644
--- a/packages/react-native-executorch/src/modules/natural_language_processing/TextEmbeddingsModule.ts
+++ b/packages/react-native-executorch/src/modules/natural_language_processing/TextEmbeddingsModule.ts
@@ -1,5 +1,11 @@
 import { ResourceSource } from '../../types/common';
-import { TextEmbeddingsModelName } from '../../types/textEmbeddings';
+import {
+  AnyTextEmbeddingsModel,
+  EmbeddingPrompts,
+  EmbeddingResult,
+  EmbeddingRole,
+  TextEmbeddingsModelName,
+} from '../../types/textEmbeddings';
 import { ResourceFetcher } from '../../utils/ResourceFetcher';
 import { BaseModule } from '../BaseModule';
 import { RnExecutorchErrorCode } from '../../errors/ErrorCodes';
@@ -7,27 +13,28 @@ import { parseUnknownError, RnExecutorchError } from '../../errors/errorUtils';
 import { Logger } from '../../common/Logger';
 
 /**
- * Module for generating text embeddings from input text.
+ * Module for text embeddings. Returns the raw [numTokens, embeddingDim] output
+ * for any model — pooled (numTokens === 1) or multi-vector. Scoring / pooling
+ * is the consumer's concern (see the `toVector` util for the single-vector
+ * common case).
  * @category Typescript API
  */
 export class TextEmbeddingsModule extends BaseModule {
-  private constructor(nativeModule: unknown) {
+  private prompts?: EmbeddingPrompts;
+
+  private constructor(nativeModule: unknown, prompts?: EmbeddingPrompts) {
     super();
     this.nativeModule = nativeModule;
+    this.prompts = prompts;
   }
 
   /**
    * Creates a text embeddings instance for a built-in model.
-   * @param namedSources - An object specifying which built-in model to load and where to fetch it from.
-   * @param onDownloadProgress - Optional callback to monitor download progress, receiving a value between 0 and 1.
-   * @returns A Promise resolving to a `TextEmbeddingsModule` instance.
+   * @param namedSources - The model + tokenizer sources.
+   * @param onDownloadProgress - Optional download progress callback (0..1).
    */
   static async fromModelName(
-    namedSources: {
-      modelName: TextEmbeddingsModelName;
-      modelSource: ResourceSource;
-      tokenizerSource: ResourceSource;
-    },
+    namedSources: AnyTextEmbeddingsModel,
     onDownloadProgress: (progress: number) => void = () => {}
   ): Promise<TextEmbeddingsModule> {
     try {
@@ -41,7 +48,8 @@ export class TextEmbeddingsModule extends BaseModule {
         throw new RnExecutorchError(RnExecutorchErrorCode.DownloadInterrupted);
       }
       return new TextEmbeddingsModule(
-        await global.loadTextEmbeddings(modelPath, tokenizerPath)
+        await global.loadTextEmbeddings(modelPath, tokenizerPath),
+        namedSources.prompts
       );
     } catch (error) {
       Logger.error('Load failed:', error);
@@ -50,14 +58,9 @@ export class TextEmbeddingsModule extends BaseModule {
   }
 
   /**
-   * Creates a text embeddings instance with a user-provided model binary and tokenizer.
-   * Use this when working with a custom-exported model that is not one of the built-in presets.
-   * @remarks The native model contract for this method is not formally defined and may change
-   * between releases. Refer to the native source code for the current expected tensor interface.
-   * @param modelSource - A fetchable resource pointing to the model binary.
-   * @param tokenizerSource - A fetchable resource pointing to the tokenizer file.
-   * @param onDownloadProgress - Optional callback to monitor download progress, receiving a value between 0 and 1.
-   * @returns A Promise resolving to a `TextEmbeddingsModule` instance.
+   * Creates a text embeddings instance from a custom model binary + tokenizer.
+   * @remarks The native tensor contract is not formally guaranteed across
+   * releases.
    */
   static fromCustomModel(
     modelSource: ResourceSource,
@@ -75,13 +78,24 @@ export class TextEmbeddingsModule extends BaseModule {
   }
 
   /**
-   * Executes the model's forward pass to generate an embedding for the provided text.
-   * @param input - The text string to embed.
-   * @returns A Promise resolving to a `Float32Array` containing the embedding vector.
+   * Embed text. Returns the raw [numTokens, embeddingDim] result.
+   * @param input - The text to embed.
+   * @param role - Optional 'query' | 'document'; prepends the model's prompt
+   *   for that role when configured (no-op otherwise).
    */
-  async forward(input: string): Promise<Float32Array> {
+  async forward(
+    input: string,
+    role?: EmbeddingRole
+  ): Promise<EmbeddingResult> {
     if (this.nativeModule == null)
       throw new RnExecutorchError(RnExecutorchErrorCode.ModuleNotLoaded);
-    return new Float32Array(await this.nativeModule.generate(input));
+    const prefix = (role && this.prompts?.[role]) || '';
+    const res = await this.nativeModule.generate(prefix + input);
+    return {
+      vectors: res.dataPtr, // already a Float32Array natively; don't re-copy
+      numTokens: res.numTokens,
+      embeddingDim: res.embeddingDim,
+      tokenIds: res.tokenIds,
+    };
   }
 }
diff --git a/packages/react-native-executorch/src/types/textEmbeddings.ts b/packages/react-native-executorch/src/types/textEmbeddings.ts
index d9cd120e26..47e056794f 100644
--- a/packages/react-native-executorch/src/types/textEmbeddings.ts
+++ b/packages/react-native-executorch/src/types/textEmbeddings.ts
@@ -12,65 +12,108 @@ export type TextEmbeddingsModelName =
   | 'multi-qa-mpnet-base-dot-v1'
   | 'distiluse-base-multilingual-cased-v2-8da4w'
   | 'paraphrase-multilingual-minilm-l12-v2-quantized'
-  | 'clip-vit-base-patch32-text';
+  | 'clip-vit-base-patch32-text'
+  | 'lfm2-5-embedding-350m'
+  | 'lfm2-5-colbert-350m';
+
+/**
+ * Raw text embedding output: a [numTokens, embeddingDim] fp32 matrix (row-
+ * major) plus the input token ids. Single-vector (pooled) models give
+ * numTokens === 1 — use `toVector` for that common case. Multi-vector (late-
+ * interaction, e.g. ColBERT) models give the full per-token sequence; score
+ * those with `maxSim`, or split them into rows with `getTokenVectors`.
+ * @category Types
+ */
+export interface EmbeddingResult {
+  /** Flat [numTokens * embeddingDim] fp32 vectors (row-major). */
+  vectors: Float32Array;
+  /** Number of token rows (1 for pooled models). */
+  numTokens: number;
+  /** Per-token vector dimension. */
+  embeddingDim: number;
+  /** Input token ids (full input sequence, even when numTokens === 1). */
+  tokenIds: number[];
+}
+
+/**
+ * Role for `forward`. Some models are trained with asymmetric query/document
+ * prompts (e.g. LFM2.5 uses `query: `/`document: `, ColBERT uses `[Q] `/`[D] `).
+ * Passing a role auto-prepends the model's configured prompt for that role.
+ * @category Types
+ */
+export type EmbeddingRole = 'query' | 'document';
+
+/**
+ * Asymmetric prompts a model is trained with. When a model config carries
+ * these, `forward` REQUIRES a `role` so the matching prompt is always applied
+ * (forgetting it would silently embed raw text and wreck asymmetric retrieval).
+ * @category Types
+ */
+export interface EmbeddingPrompts {
+  query: string;
+  document: string;
+}
+
+/** A standard (symmetric) embedding model — `forward(text)`, no role. */
+export interface TextEmbeddingsModel {
+  modelName: TextEmbeddingsModelName;
+  modelSource: ResourceSource;
+  tokenizerSource: ResourceSource;
+  prompts?: undefined;
+}
+
+/**
+ * An asymmetric model with query/document prompts — `forward(text, role)` with
+ * role REQUIRED.
+ */
+export interface PromptedTextEmbeddingsModel {
+  modelName: TextEmbeddingsModelName;
+  modelSource: ResourceSource;
+  tokenizerSource: ResourceSource;
+  prompts: EmbeddingPrompts;
+}
+
+export type AnyTextEmbeddingsModel =
+  | TextEmbeddingsModel
+  | PromptedTextEmbeddingsModel;
+
+/**
+ * `forward`'s signature, discriminated by the model: prompted models require a
+ * `role` argument; standard models take none.
+ */
+export type ForwardFn<M extends AnyTextEmbeddingsModel> =
+  M extends PromptedTextEmbeddingsModel
+    ? (input: string, role: EmbeddingRole) => Promise<EmbeddingResult>
+    : (input: string) => Promise<EmbeddingResult>;
 
 /**
  * Props for the useTextEmbeddings hook.
  * @category Types
- * @property {object} model - An object containing the model configuration.
- * @property {TextEmbeddingsModelName} model.modelName - Unique name identifying the model.
- * @property {ResourceSource} model.modelSource - The source of the text embeddings model binary.
- * @property {ResourceSource} model.tokenizerSource - The source of the tokenizer JSON file.
- * @property {boolean} [preventLoad] - Boolean that can prevent automatic model loading (and downloading the data if you load it for the first time) after running the hook.
  */
-export interface TextEmbeddingsProps {
-  model: {
-    /**
-     * The unique name of the text embeddings model.
-     */
-    modelName: TextEmbeddingsModelName;
-    /**
-     * The source of the text embeddings model binary.
-     */
-    modelSource: ResourceSource;
-    /**
-     * The source of the tokenizer JSON file.
-     */
-    tokenizerSource: ResourceSource;
-  };
+export interface TextEmbeddingsProps<
+  M extends AnyTextEmbeddingsModel = AnyTextEmbeddingsModel,
+> {
+  model: M;
   preventLoad?: boolean;
 }
 
 /**
- * React hook state and methods for managing a Text Embeddings model instance.
+ * React hook state and methods for a Text Embeddings model instance.
  * @category Types
  */
-export interface TextEmbeddingsType {
-  /**
-   * Contains the error message if the model failed to load or during inference.
-   */
+export interface TextEmbeddingsType<
+  M extends AnyTextEmbeddingsModel = AnyTextEmbeddingsModel,
+> {
   error: null | RnExecutorchError;
-
-  /**
-   * Indicates whether the embeddings model has successfully loaded and is ready for inference.
-   */
   isReady: boolean;
-
-  /**
-   * Indicates whether the model is currently generating embeddings.
-   */
   isGenerating: boolean;
-
-  /**
-   * Tracks the progress of the model download process (value between 0 and 1).
-   */
   downloadProgress: number;
 
   /**
-   * Runs the text embeddings model on the provided input string.
-   * @param input - The text string to embed.
-   * @returns A promise resolving to a Float32Array containing the vector embeddings.
-   * @throws {RnExecutorchError} If the model is not loaded or is currently processing another request.
+   * Embed text into a [numTokens, embeddingDim] result. Pooled models return
+   * numTokens === 1 (use `toVector`); multi-vector models return the full
+   * per-token sequence. Models with prompts require a `role`
+   * ('query' | 'document'); standard models take none.
    */
-  forward(input: string): Promise<Float32Array>;
+  forward: ForwardFn<M>;
 }
diff --git a/packages/react-native-executorch/src/utils/textEmbeddings.ts b/packages/react-native-executorch/src/utils/textEmbeddings.ts
new file mode 100644
index 0000000000..c396145489
--- /dev/null
+++ b/packages/react-native-executorch/src/utils/textEmbeddings.ts
@@ -0,0 +1,74 @@
+import { EmbeddingResult } from '../types/textEmbeddings';
+
+/**
+ * Return row 0 of a result as an independent copy (`slice`). Pooled
+ * (single-vector) graphs emit an already pooled + L2-normalized
+ * [1, embeddingDim] output, so row 0 is the whole embedding.
+ *
+ * For multi-vector (late-interaction) results use `getTokenVectors` instead;
+ * row 0 alone is not a meaningful sentence embedding there.
+ *
+ * @category Utils
+ */
+export function toVector(result: EmbeddingResult): Float32Array {
+  const { vectors, embeddingDim: rowWidth } = result;
+  return vectors.slice(0, rowWidth);
+}
+
+/**
+ * Split a result's flat `vectors` buffer into per-token rows
+ * (`numTokens` arrays of length `embeddingDim`). Useful for inspecting or
+ * storing individual token vectors (e.g. a multi-vector vector DB).
+ *
+ * @category Utils
+ */
+export function getTokenVectors(result: EmbeddingResult): Float32Array[] {
+  const { vectors, numTokens: rowCount, embeddingDim: width } = result;
+  const tokenRows: Float32Array[] = [];
+  for (let row = 0, start = 0; row < rowCount; row++, start += width) {
+    tokenRows.push(vectors.subarray(start, start + width));
+  }
+  return tokenRows;
+}
+
+/**
+ * Late-interaction MaxSim score: `Σ_q max_d (q · d)`. For each query token,
+ * takes the max dot product over the non-skiplisted document tokens, then
+ * sums across query tokens. Per-token vectors are L2-normalized by the graph,
+ * so a dot product is a cosine. A query token with no eligible document token
+ * (empty or fully skip-listed document) contributes 0.
+ *
+ * @param skiplistIds - Document-side token ids to exclude (e.g. punctuation),
+ *   matching ColBERT's document skiplist. Pass `[]` to score every token.
+ * @throws {RangeError} If `query` and `doc` differ in `embeddingDim`.
+ * @category Utils
+ */
+export function maxSim(
+  query: EmbeddingResult,
+  doc: EmbeddingResult,
+  skiplistIds: number[] = []
+): number {
+  const dim = query.embeddingDim;
+  if (doc.embeddingDim !== dim) {
+    throw new RangeError(`maxSim: embeddingDim ${dim} != ${doc.embeddingDim}`);
+  }
+  const q = query.vectors;
+  const d = doc.vectors;
+  const skip = new Set(skiplistIds);
+  let score = 0;
+  for (let qi = 0; qi < query.numTokens; qi++) {
+    const qOff = qi * dim;
+    let best = -Infinity;
+    for (let di = 0; di < doc.numTokens; di++) {
+      if (skip.has(doc.tokenIds[di]!)) continue;
+      const dOff = di * dim;
+      let dot = 0;
+      for (let k = 0; k < dim; k++) {
+        dot += (q[qOff + k] ?? 0) * (d[dOff + k] ?? 0);
+      }
+      if (dot > best) best = dot;
+    }
+    if (best !== -Infinity) score += best;
+  }
+  return score;
+}

From b2e7e78917bd846773efd6d1f246869ad18f07e9 Mon Sep 17 00:00:00 2001
From: Norbert Klockiewicz <Nklockiewicz12@gmail.com>
Date: Mon, 22 Jun 2026 14:34:58 +0200
Subject: [PATCH 2/7] fix: address review on text-embeddings/ColBERT
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Migrate the segment-anything (SAM) screen to toVector(forward()) — its
  CLIP-text path broke when forward started returning EmbeddingResult.
- Update the C++ TextEmbeddings integration test for the EmbeddingResult
  return type (was still using the old OwningArrayBuffer pointer API).
- Guard the per-token invariant: throw InvalidModelOutput if output rows
  != input token count (pooled numTokens==1 exempt), so skiplist masking
  can't silently misalign if a graph pads/truncates.
- Dedup encode()/encodeWithSpecialTokens() into a shared encodeImpl.
- Drop the redundant Float32Array copy at the JSI boundary; document the
  getTokenVectors view lifetime; remove dead BaseEmbeddings::postprocess.

Authored with Claude.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../app/segment_anything/index.tsx            |  3 +-
 .../common/rnexecutorch/TokenizerModule.cpp   | 34 ++++++-------------
 .../common/rnexecutorch/TokenizerModule.h     |  5 +++
 .../models/embeddings/BaseEmbeddings.cpp      | 10 ------
 .../models/embeddings/BaseEmbeddings.h        |  4 ---
 .../models/embeddings/text/TextEmbeddings.cpp | 15 ++++++++
 .../tests/integration/TextEmbeddingsTest.cpp  | 30 ++++++++--------
 .../src/constants/modelRegistry.ts            |  1 -
 .../src/constants/modelUrls.ts                |  8 ++---
 .../TextEmbeddingsModule.ts                   |  4 ++-
 .../src/utils/textEmbeddings.ts               |  5 +++
 11 files changed, 59 insertions(+), 60 deletions(-)

diff --git a/apps/computer-vision/app/segment_anything/index.tsx b/apps/computer-vision/app/segment_anything/index.tsx
index ac7bbd06b5..0a7af9e1ed 100644
--- a/apps/computer-vision/app/segment_anything/index.tsx
+++ b/apps/computer-vision/app/segment_anything/index.tsx
@@ -25,6 +25,7 @@ import {
   useInstanceSegmentation,
   useImageEmbeddings,
   useTextEmbeddings,
+  toVector,
   InstanceSegmentationModelSources,
   SegmentedInstance,
   FastSAMLabel,
@@ -208,7 +209,7 @@ export default function SegmentAnythingScreen() {
         instanceEmbeddingsRef.current = embeddings;
         setEmbeddingProgress(null);
       }
-      const textEmb = await clipText.forward(textPrompt);
+      const textEmb = toVector(await clipText.forward(textPrompt));
       const match = selectByText(
         instances,
         instanceEmbeddingsRef.current,
diff --git a/packages/react-native-executorch/common/rnexecutorch/TokenizerModule.cpp b/packages/react-native-executorch/common/rnexecutorch/TokenizerModule.cpp
index 3315baa2dd..dfd9243c48 100644
--- a/packages/react-native-executorch/common/rnexecutorch/TokenizerModule.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/TokenizerModule.cpp
@@ -26,17 +26,15 @@ TokenizerModule::TokenizerModule(
   memorySizeLowerBound = std::filesystem::file_size(modelPath);
 }
 
-std::vector<uint64_t> TokenizerModule::encode(std::string s) const {
+// When the tokenizer.json defines a post_processor, the underlying HFTokenizer
+// treats non-zero bos/eos as a flag to run it with add_special_token=true (not
+// a literal count). So bos=eos=0 skips special tokens; bos=eos=1 applies them.
+std::vector<uint64_t> TokenizerModule::encodeImpl(const std::string &s,
+                                                  int8_t bos, int8_t eos) const {
   if (!tokenizer) {
     THROW_NOT_LOADED_ERROR();
   }
-
-  // If the used tokenizer.json has defined post_processor field,
-  // setting any of bos or eos arguments to value other than provided constant
-  // ( which is 0) will result in running the post_processor with
-  // 'add_special_token' flag
-  auto encodeResult =
-      tokenizer->encode(s, numOfAddedBoSTokens, numOfAddedEoSTokens);
+  auto encodeResult = tokenizer->encode(s, bos, eos);
   if (!encodeResult.ok()) {
     throw RnExecutorchError(
         RnExecutorchErrorCode::TokenizerError,
@@ -46,23 +44,13 @@ std::vector<uint64_t> TokenizerModule::encode(std::string s) const {
   return encodeResult.get();
 }
 
+std::vector<uint64_t> TokenizerModule::encode(std::string s) const {
+  return encodeImpl(s, numOfAddedBoSTokens, numOfAddedEoSTokens);
+}
+
 std::vector<uint64_t>
 TokenizerModule::encodeWithSpecialTokens(std::string s) const {
-  if (!tokenizer) {
-    THROW_NOT_LOADED_ERROR();
-  }
-
-  // Passing non-zero bos/eos makes HFTokenizer run the tokenizer.json
-  // post_processor with add_special_token=true (the underlying encode treats
-  // these as a flag, not a literal count, when a post_processor is defined).
-  auto encodeResult = tokenizer->encode(s, /*bos=*/1, /*eos=*/1);
-  if (!encodeResult.ok()) {
-    throw RnExecutorchError(
-        RnExecutorchErrorCode::TokenizerError,
-        "Unexpected issue occurred while encoding: " +
-            std::to_string(static_cast<int32_t>(encodeResult.error())));
-  }
-  return encodeResult.get();
+  return encodeImpl(s, /*bos=*/1, /*eos=*/1);
 }
 
 std::string TokenizerModule::decode(std::vector<uint64_t> vec,
diff --git a/packages/react-native-executorch/common/rnexecutorch/TokenizerModule.h b/packages/react-native-executorch/common/rnexecutorch/TokenizerModule.h
index a511340af6..09877dfc65 100644
--- a/packages/react-native-executorch/common/rnexecutorch/TokenizerModule.h
+++ b/packages/react-native-executorch/common/rnexecutorch/TokenizerModule.h
@@ -30,6 +30,11 @@ class TokenizerModule {
   std::size_t getMemoryLowerBound() const noexcept;
 
 private:
+  // Shared encode implementation. bos/eos act as an add-special-tokens flag
+  // (not a literal count) when the tokenizer.json defines a post_processor.
+  std::vector<uint64_t> encodeImpl(const std::string &s, int8_t bos,
+                                   int8_t eos) const;
+
   std::unique_ptr<tokenizers::HFTokenizer> tokenizer;
   std::size_t memorySizeLowerBound{0};
 };
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/embeddings/BaseEmbeddings.cpp b/packages/react-native-executorch/common/rnexecutorch/models/embeddings/BaseEmbeddings.cpp
index bf291136c1..e777be6704 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/embeddings/BaseEmbeddings.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/embeddings/BaseEmbeddings.cpp
@@ -1,19 +1,9 @@
 #include "BaseEmbeddings.h"
 
-#include <span>
-
 namespace rnexecutorch::models::embeddings {
 
 BaseEmbeddings::BaseEmbeddings(const std::string &modelSource,
                                std::shared_ptr<react::CallInvoker> callInvoker)
     : BaseModel(modelSource, callInvoker) {}
 
-std::shared_ptr<OwningArrayBuffer>
-BaseEmbeddings::postprocess(const Result<std::vector<EValue>> &forwardResult) {
-  auto forwardResultTensor = forwardResult->at(0).toTensor();
-  auto buffer = std::make_shared<OwningArrayBuffer>(
-      forwardResultTensor.const_data_ptr(), forwardResultTensor.nbytes());
-  return buffer;
-}
-
 } // namespace rnexecutorch::models::embeddings
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/embeddings/BaseEmbeddings.h b/packages/react-native-executorch/common/rnexecutorch/models/embeddings/BaseEmbeddings.h
index 216d6bf8ce..4b37a3fe93 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/embeddings/BaseEmbeddings.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/embeddings/BaseEmbeddings.h
@@ -8,10 +8,6 @@ class BaseEmbeddings : public BaseModel {
 public:
   BaseEmbeddings(const std::string &modelSource,
                  std::shared_ptr<react::CallInvoker> callInvoker);
-
-protected:
-  std::shared_ptr<OwningArrayBuffer>
-  postprocess(const Result<std::vector<EValue>> &forwardResult);
 };
 
 }; // namespace rnexecutorch::models::embeddings
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/embeddings/text/TextEmbeddings.cpp b/packages/react-native-executorch/common/rnexecutorch/models/embeddings/text/TextEmbeddings.cpp
index d673f0ac87..26f3157690 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/embeddings/text/TextEmbeddings.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/embeddings/text/TextEmbeddings.cpp
@@ -72,6 +72,21 @@ EmbeddingResult TextEmbeddings::generate(const std::string input) {
   result.numTokens = static_cast<int32_t>(sizes[sizes.size() - 2]);
   result.embeddingDim = static_cast<int32_t>(sizes[sizes.size() - 1]);
   result.tokenIds = std::move(preprocessed.inputIds);
+
+  // Invariant for multi-vector models: one output row per input token, so
+  // numTokens (from the output tensor) must equal tokenIds.size() (from the
+  // input). Consumers index tokenIds[i] per output row (e.g. skiplist masking),
+  // which silently breaks if the graph ever pads/truncates the sequence.
+  // (Pooled models legitimately collapse to numTokens == 1.)
+  if (result.numTokens != 1 &&
+      result.numTokens != static_cast<int32_t>(result.tokenIds.size())) {
+    throw RnExecutorchError(
+        RnExecutorchErrorCode::InvalidModelOutput,
+        "Embedding output rows (" + std::to_string(result.numTokens) +
+            ") != input tokens (" +
+            std::to_string(result.tokenIds.size()) +
+            "); per-token tokenIds alignment is broken.");
+  }
   return result;
 }
 
diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/integration/TextEmbeddingsTest.cpp b/packages/react-native-executorch/common/rnexecutorch/tests/integration/TextEmbeddingsTest.cpp
index ff1abd4c30..0e0cc846b5 100644
--- a/packages/react-native-executorch/common/rnexecutorch/tests/integration/TextEmbeddingsTest.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/tests/integration/TextEmbeddingsTest.cpp
@@ -53,23 +53,23 @@ TEST(TextEmbeddingsGenerateTests, EmptyStringReturnsResults) {
   TextEmbeddings model(kValidTextEmbeddingsModelPath,
                        kValidTextEmbeddingsTokenizerPath, nullptr);
   auto result = model.generate("");
-  EXPECT_NE(result, nullptr);
-  EXPECT_GT(result->size(), 0u);
+  EXPECT_NE(result.dataPtr, nullptr);
+  EXPECT_GT(result.dataPtr->size(), 0u);
 }
 
 TEST(TextEmbeddingsGenerateTests, ValidTextReturnsResults) {
   TextEmbeddings model(kValidTextEmbeddingsModelPath,
                        kValidTextEmbeddingsTokenizerPath, nullptr);
   auto result = model.generate("Hello, world!");
-  EXPECT_NE(result, nullptr);
-  EXPECT_GT(result->size(), 0u);
+  EXPECT_NE(result.dataPtr, nullptr);
+  EXPECT_GT(result.dataPtr->size(), 0u);
 }
 
 TEST(TextEmbeddingsGenerateTests, ResultsHaveCorrectSize) {
   TextEmbeddings model(kValidTextEmbeddingsModelPath,
                        kValidTextEmbeddingsTokenizerPath, nullptr);
   auto result = model.generate("This is a test sentence.");
-  size_t numFloats = result->size() / sizeof(float);
+  size_t numFloats = result.dataPtr->size() / sizeof(float);
   EXPECT_EQ(numFloats, kMiniLmEmbeddingDimensions);
 }
 
@@ -78,8 +78,8 @@ TEST(TextEmbeddingsGenerateTests, ResultsAreNormalized) {
                        kValidTextEmbeddingsTokenizerPath, nullptr);
   auto result = model.generate("The quick brown fox jumps over the lazy dog.");
 
-  const float *data = reinterpret_cast<const float *>(result->data());
-  size_t numFloats = result->size() / sizeof(float);
+  const float *data = reinterpret_cast<const float *>(result.dataPtr->data());
+  size_t numFloats = result.dataPtr->size() / sizeof(float);
 
   float sumOfSquares = 0.0f;
   for (size_t i = 0; i < numFloats; ++i) {
@@ -94,8 +94,8 @@ TEST(TextEmbeddingsGenerateTests, ResultsContainValidValues) {
                        kValidTextEmbeddingsTokenizerPath, nullptr);
   auto result = model.generate("Testing valid values.");
 
-  const float *data = reinterpret_cast<const float *>(result->data());
-  size_t numFloats = result->size() / sizeof(float);
+  const float *data = reinterpret_cast<const float *>(result.dataPtr->data());
+  size_t numFloats = result.dataPtr->size() / sizeof(float);
 
   for (size_t i = 0; i < numFloats; ++i) {
     EXPECT_FALSE(std::isnan(data[i]));
@@ -110,9 +110,9 @@ TEST(TextEmbeddingsGenerateTests, DifferentTextProducesDifferentEmbeddings) {
   auto result1 = model.generate("Hello, world!");
   auto result2 = model.generate("Goodbye, moon!");
 
-  const float *data1 = reinterpret_cast<const float *>(result1->data());
-  const float *data2 = reinterpret_cast<const float *>(result2->data());
-  size_t numFloats = result1->size() / sizeof(float);
+  const float *data1 = reinterpret_cast<const float *>(result1.dataPtr->data());
+  const float *data2 = reinterpret_cast<const float *>(result2.dataPtr->data());
+  size_t numFloats = result1.dataPtr->size() / sizeof(float);
 
   bool allEqual = true;
   for (size_t i = 0; i < numFloats; ++i) {
@@ -131,9 +131,9 @@ TEST(TextEmbeddingsGenerateTests, SimilarTextProducesSimilarEmbeddings) {
   auto result1 = model.generate("I love programming");
   auto result2 = model.generate("I enjoy coding");
 
-  const float *data1 = reinterpret_cast<const float *>(result1->data());
-  const float *data2 = reinterpret_cast<const float *>(result2->data());
-  size_t numFloats = result1->size() / sizeof(float);
+  const float *data1 = reinterpret_cast<const float *>(result1.dataPtr->data());
+  const float *data2 = reinterpret_cast<const float *>(result2.dataPtr->data());
+  size_t numFloats = result1.dataPtr->size() / sizeof(float);
 
   float dotProduct = 0.0f;
   for (size_t i = 0; i < numFloats; ++i) {
diff --git a/packages/react-native-executorch/src/constants/modelRegistry.ts b/packages/react-native-executorch/src/constants/modelRegistry.ts
index cb06ccb308..f411631aac 100644
--- a/packages/react-native-executorch/src/constants/modelRegistry.ts
+++ b/packages/react-native-executorch/src/constants/modelRegistry.ts
@@ -198,7 +198,6 @@ function pair<D extends { modelName: string }, Q extends { modelName: string }>(
   return variant({ xnnpack: { base: baseC, quant: quantC } });
 }
 
-
 // TTS presets bundle model + voice + phonemizer in a single config; they
 // don't share the `{ modelName: string }` shape of the rest of the registry,
 // and have no quant/backend axis. Expose them as a plain `() => Config`
diff --git a/packages/react-native-executorch/src/constants/modelUrls.ts b/packages/react-native-executorch/src/constants/modelUrls.ts
index 7c4b73483c..8fdebb1a6d 100644
--- a/packages/react-native-executorch/src/constants/modelUrls.ts
+++ b/packages/react-native-executorch/src/constants/modelUrls.ts
@@ -1207,11 +1207,9 @@ export const LFM2_5_EMBEDDING_350M_TOKENIZER = `${URL_PREFIX}-lfm2.5-embedding-3
 // [S,128]). Same bidirectional backbone as the embedding model + a Linear
 // 1024->128 head. forward() returns per-token vectors; late-interaction
 // scoring (MaxSim) is the consumer's concern (see the colbert example).
-// NOTE: pinned to `resolve/main` for testing — the v0.9.0 tag does not exist
-// on this repo yet. Switch to `${PREVIOUS_VERSION_TAG}` once the tag is cut.
-export const LFM2_5_COLBERT_350M_XNNPACK_MODEL = `${URL_PREFIX}-lfm2.5-colbert-350m/resolve/main/xnnpack/lfm_2_5_colbert_350m_xnnpack_8da4w.pte`;
-export const LFM2_5_COLBERT_350M_MLX_MODEL = `${URL_PREFIX}-lfm2.5-colbert-350m/resolve/main/mlx/lfm_2_5_colbert_350m_mlx_int4.pte`;
-export const LFM2_5_COLBERT_350M_TOKENIZER = `${URL_PREFIX}-lfm2.5-colbert-350m/resolve/main/tokenizer.json`;
+export const LFM2_5_COLBERT_350M_XNNPACK_MODEL = `${URL_PREFIX}-lfm2.5-colbert-350m/${PREVIOUS_VERSION_TAG}/xnnpack/lfm_2_5_colbert_350m_xnnpack_8da4w.pte`;
+export const LFM2_5_COLBERT_350M_MLX_MODEL = `${URL_PREFIX}-lfm2.5-colbert-350m/${PREVIOUS_VERSION_TAG}/mlx/lfm_2_5_colbert_350m_mlx_int4.pte`;
+export const LFM2_5_COLBERT_350M_TOKENIZER = `${URL_PREFIX}-lfm2.5-colbert-350m/${PREVIOUS_VERSION_TAG}/tokenizer.json`;
 
 /**
  * @category Models - Text Embeddings
diff --git a/packages/react-native-executorch/src/modules/natural_language_processing/TextEmbeddingsModule.ts b/packages/react-native-executorch/src/modules/natural_language_processing/TextEmbeddingsModule.ts
index d9ab4f45da..c11b9c9aff 100644
--- a/packages/react-native-executorch/src/modules/natural_language_processing/TextEmbeddingsModule.ts
+++ b/packages/react-native-executorch/src/modules/natural_language_processing/TextEmbeddingsModule.ts
@@ -91,8 +91,10 @@ export class TextEmbeddingsModule extends BaseModule {
       throw new RnExecutorchError(RnExecutorchErrorCode.ModuleNotLoaded);
     const prefix = (role && this.prompts?.[role]) || '';
     const res = await this.nativeModule.generate(prefix + input);
+    // res.dataPtr is already a Float32Array view over the owned native buffer
+    // (built at the JSI boundary), so use it directly — no extra copy.
     return {
-      vectors: new Float32Array(res.dataPtr),
+      vectors: res.dataPtr as Float32Array,
       numTokens: res.numTokens,
       embeddingDim: res.embeddingDim,
       tokenIds: res.tokenIds,
diff --git a/packages/react-native-executorch/src/utils/textEmbeddings.ts b/packages/react-native-executorch/src/utils/textEmbeddings.ts
index c396145489..da10d9aa08 100644
--- a/packages/react-native-executorch/src/utils/textEmbeddings.ts
+++ b/packages/react-native-executorch/src/utils/textEmbeddings.ts
@@ -20,6 +20,11 @@ export function toVector(result: EmbeddingResult): Float32Array {
  * (`numTokens` arrays of length `embeddingDim`). Useful for inspecting or
  * storing individual token vectors (e.g. a multi-vector vector DB).
  *
+ * The rows are zero-copy `subarray` VIEWS over `result.vectors` — valid only
+ * while that buffer is alive and not mutated. Copy a row before storing it
+ * beyond the result's lifetime (e.g. `new Float32Array(row)`). (`toVector`
+ * by contrast returns an independent copy.)
+ *
  * @category Utils
  */
 export function getTokenVectors(result: EmbeddingResult): Float32Array[] {

From cf74973f94ada255bcc00eaa910b724ff50658c4 Mon Sep 17 00:00:00 2001
From: Norbert Klockiewicz <Nklockiewicz12@gmail.com>
Date: Mon, 22 Jun 2026 14:50:16 +0200
Subject: [PATCH 3/7] refactor: make useTextEmbeddings.forward non-breaking
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

forward(text) returns a single pooled Float32Array again for standard
models — restoring the original API, so MiniLM/MPNet/CLIP/SAM consumers
need no migration. The reduction (row 0 of the native [numTokens,
embeddingDim] matrix) happens in the TS module, not at the call site.

Multi-vector (late-interaction) models opt in via a `multiVector: true`
config flag; for those, forward returns the full per-token EmbeddingResult
so MaxSim/skiplist work. Return type is discriminated by the flag, and the
role argument by `prompts` (required when prompted, none when not).

Authored with Claude.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../app/segment_anything/index.tsx            |  3 +-
 .../app/clip-embeddings/index.tsx             |  3 +-
 .../app/text-embeddings/index.tsx             | 14 ++--
 .../src/constants/modelRegistry.ts            |  2 +
 .../useTextEmbeddings.ts                      |  4 +-
 .../TextEmbeddingsModule.ts                   | 44 ++++++++----
 .../src/types/textEmbeddings.ts               | 70 ++++++++++---------
 7 files changed, 79 insertions(+), 61 deletions(-)

diff --git a/apps/computer-vision/app/segment_anything/index.tsx b/apps/computer-vision/app/segment_anything/index.tsx
index 0a7af9e1ed..ac7bbd06b5 100644
--- a/apps/computer-vision/app/segment_anything/index.tsx
+++ b/apps/computer-vision/app/segment_anything/index.tsx
@@ -25,7 +25,6 @@ import {
   useInstanceSegmentation,
   useImageEmbeddings,
   useTextEmbeddings,
-  toVector,
   InstanceSegmentationModelSources,
   SegmentedInstance,
   FastSAMLabel,
@@ -209,7 +208,7 @@ export default function SegmentAnythingScreen() {
         instanceEmbeddingsRef.current = embeddings;
         setEmbeddingProgress(null);
       }
-      const textEmb = toVector(await clipText.forward(textPrompt));
+      const textEmb = await clipText.forward(textPrompt);
       const match = selectByText(
         instances,
         instanceEmbeddingsRef.current,
diff --git a/apps/text-embeddings/app/clip-embeddings/index.tsx b/apps/text-embeddings/app/clip-embeddings/index.tsx
index e0232d3440..02a8a9c656 100644
--- a/apps/text-embeddings/app/clip-embeddings/index.tsx
+++ b/apps/text-embeddings/app/clip-embeddings/index.tsx
@@ -16,7 +16,6 @@ import {
   models,
   useTextEmbeddings,
   useImageEmbeddings,
-  toVector,
   ImageEmbeddingsProps,
 } from 'react-native-executorch';
 
@@ -102,7 +101,7 @@ function ClipEmbeddingsScreen() {
       const txtStart = Date.now();
       const scored: { label: string; similarity: number }[] = [];
       for (const label of labels) {
-        const textEmbedding = toVector(await textModel.forward(label));
+        const textEmbedding = await textModel.forward(label);
         scored.push({
           label,
           similarity: dotProduct(imageEmbedding, textEmbedding),
diff --git a/apps/text-embeddings/app/text-embeddings/index.tsx b/apps/text-embeddings/app/text-embeddings/index.tsx
index 470094da02..8cb6777843 100644
--- a/apps/text-embeddings/app/text-embeddings/index.tsx
+++ b/apps/text-embeddings/app/text-embeddings/index.tsx
@@ -15,13 +15,12 @@ import { ModelPicker } from '../../components/ModelPicker';
 import {
   models,
   useTextEmbeddings,
-  toVector,
   TextEmbeddingsProps,
 } from 'react-native-executorch';
 const textEmbedding = models.text_embedding;
 
-// Single-vector (pooled) models: forward() returns the raw result; toVector()
-// gives the single embedding. The multi-vector ColBERT model has its own screen.
+// Single-vector (pooled) models: forward() returns a Float32Array directly.
+// The multi-vector ColBERT model has its own screen.
 type TextEmbeddingModel = TextEmbeddingsProps['model'];
 
 const MODELS: { label: string; value: TextEmbeddingModel }[] = [
@@ -123,10 +122,9 @@ function TextEmbeddingsScreen() {
           const embedded = [];
           for (const sentence of CORPUS) {
             // forward(_, 'document') auto-applies the model's document prompt
-            // (a no-op for models without one).
-            const embedding = toVector(
-              await model.forward(sentence, 'document')
-            );
+            // (a no-op for models without one). Single-vector models return
+            // a Float32Array directly.
+            const embedding = await model.forward(sentence, 'document');
             if (cancelled) return;
             embedded.push({ sentence, embedding });
           }
@@ -157,7 +155,7 @@ function TextEmbeddingsScreen() {
     setQuery(queryText);
     try {
       const start = Date.now();
-      const queryEmbedding = toVector(await model.forward(q, 'query'));
+      const queryEmbedding = await model.forward(q, 'query');
       setEmbeddingTime(Date.now() - start);
       const ranked = corpusEmbeddings
         .map(({ sentence, embedding }) => ({
diff --git a/packages/react-native-executorch/src/constants/modelRegistry.ts b/packages/react-native-executorch/src/constants/modelRegistry.ts
index f411631aac..c2e3a2a21d 100644
--- a/packages/react-native-executorch/src/constants/modelRegistry.ts
+++ b/packages/react-native-executorch/src/constants/modelRegistry.ts
@@ -294,6 +294,7 @@ const LFM2_5_COLBERT_350M_VARIANTS = {
       modelSource: M.LFM2_5_COLBERT_350M_MLX_MODEL,
       tokenizerSource: M.LFM2_5_COLBERT_350M_TOKENIZER,
       prompts: LFM_COLBERT_PROMPTS,
+      multiVector: true as const,
     },
   },
   xnnpack: {
@@ -302,6 +303,7 @@ const LFM2_5_COLBERT_350M_VARIANTS = {
       modelSource: M.LFM2_5_COLBERT_350M_XNNPACK_MODEL,
       tokenizerSource: M.LFM2_5_COLBERT_350M_TOKENIZER,
       prompts: LFM_COLBERT_PROMPTS,
+      multiVector: true as const,
     },
   },
 };
diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useTextEmbeddings.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useTextEmbeddings.ts
index b4679b4237..2f100b8cbb 100644
--- a/packages/react-native-executorch/src/hooks/natural_language_processing/useTextEmbeddings.ts
+++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useTextEmbeddings.ts
@@ -1,9 +1,9 @@
 import { TextEmbeddingsModule } from '../../modules/natural_language_processing/TextEmbeddingsModule';
 import { useModuleFactory } from '../useModuleFactory';
 import {
-  AnyTextEmbeddingsModel,
   EmbeddingRole,
   ForwardFn,
+  TextEmbeddingsModel,
   TextEmbeddingsType,
   TextEmbeddingsProps,
 } from '../../types/textEmbeddings';
@@ -16,7 +16,7 @@ import {
  *   [numTokens, embeddingDim] result; use `toVector` for a single vector.
  *   Models with prompts require a `role` ('query' | 'document') on `forward`.
  */
-export const useTextEmbeddings = <M extends AnyTextEmbeddingsModel>({
+export const useTextEmbeddings = <M extends TextEmbeddingsModel>({
   model,
   preventLoad = false,
 }: TextEmbeddingsProps<M>): TextEmbeddingsType<M> => {
diff --git a/packages/react-native-executorch/src/modules/natural_language_processing/TextEmbeddingsModule.ts b/packages/react-native-executorch/src/modules/natural_language_processing/TextEmbeddingsModule.ts
index c11b9c9aff..abb620e981 100644
--- a/packages/react-native-executorch/src/modules/natural_language_processing/TextEmbeddingsModule.ts
+++ b/packages/react-native-executorch/src/modules/natural_language_processing/TextEmbeddingsModule.ts
@@ -1,9 +1,9 @@
 import { ResourceSource } from '../../types/common';
 import {
-  AnyTextEmbeddingsModel,
   EmbeddingPrompts,
   EmbeddingResult,
   EmbeddingRole,
+  TextEmbeddingsModel,
   TextEmbeddingsModelName,
 } from '../../types/textEmbeddings';
 import { ResourceFetcher } from '../../utils/ResourceFetcher';
@@ -13,28 +13,35 @@ import { parseUnknownError, RnExecutorchError } from '../../errors/errorUtils';
 import { Logger } from '../../common/Logger';
 
 /**
- * Module for text embeddings. Returns the raw [numTokens, embeddingDim] output
- * for any model — pooled (numTokens === 1) or multi-vector. Scoring / pooling
- * is the consumer's concern (see the `toVector` util for the single-vector
- * common case).
+ * Module for text embeddings. `forward` returns a single pooled `Float32Array`
+ * for standard models, or the per-token `EmbeddingResult` for `multiVector`
+ * (late-interaction) models. The native runner always produces the raw
+ * [numTokens, embeddingDim] matrix; the reduction to a single vector happens
+ * here so the common single-vector API stays `Float32Array`.
  * @category Typescript API
  */
 export class TextEmbeddingsModule extends BaseModule {
   private prompts?: EmbeddingPrompts;
+  private multiVector: boolean;
 
-  private constructor(nativeModule: unknown, prompts?: EmbeddingPrompts) {
+  private constructor(
+    nativeModule: unknown,
+    prompts: EmbeddingPrompts | undefined,
+    multiVector: boolean
+  ) {
     super();
     this.nativeModule = nativeModule;
     this.prompts = prompts;
+    this.multiVector = multiVector;
   }
 
   /**
    * Creates a text embeddings instance for a built-in model.
-   * @param namedSources - The model + tokenizer sources.
+   * @param namedSources - The model config (+ optional prompts / multiVector).
    * @param onDownloadProgress - Optional download progress callback (0..1).
    */
   static async fromModelName(
-    namedSources: AnyTextEmbeddingsModel,
+    namedSources: TextEmbeddingsModel,
     onDownloadProgress: (progress: number) => void = () => {}
   ): Promise<TextEmbeddingsModule> {
     try {
@@ -49,7 +56,8 @@ export class TextEmbeddingsModule extends BaseModule {
       }
       return new TextEmbeddingsModule(
         await global.loadTextEmbeddings(modelPath, tokenizerPath),
-        namedSources.prompts
+        namedSources.prompts,
+        namedSources.multiVector ?? false
       );
     } catch (error) {
       Logger.error('Load failed:', error);
@@ -78,23 +86,29 @@ export class TextEmbeddingsModule extends BaseModule {
   }
 
   /**
-   * Embed text. Returns the raw [numTokens, embeddingDim] result.
+   * Embed text. Standard models return the single pooled `Float32Array`;
+   * `multiVector` models return the per-token `EmbeddingResult`.
    * @param input - The text to embed.
-   * @param role - Optional 'query' | 'document'; prepends the model's prompt
-   *   for that role when configured (no-op otherwise).
+   * @param role - 'query' | 'document'; prepends the model's prompt for that
+   *   role when configured (no-op otherwise).
    */
   async forward(
     input: string,
     role?: EmbeddingRole
-  ): Promise<EmbeddingResult> {
+  ): Promise<Float32Array | EmbeddingResult> {
     if (this.nativeModule == null)
       throw new RnExecutorchError(RnExecutorchErrorCode.ModuleNotLoaded);
     const prefix = (role && this.prompts?.[role]) || '';
     const res = await this.nativeModule.generate(prefix + input);
     // res.dataPtr is already a Float32Array view over the owned native buffer
-    // (built at the JSI boundary), so use it directly — no extra copy.
+    // (built at the JSI boundary).
+    const vectors = res.dataPtr as Float32Array;
+    if (!this.multiVector) {
+      // Pooled models output [1, embeddingDim]; return that single row.
+      return vectors.subarray(0, res.embeddingDim);
+    }
     return {
-      vectors: res.dataPtr as Float32Array,
+      vectors,
       numTokens: res.numTokens,
       embeddingDim: res.embeddingDim,
       tokenIds: res.tokenIds,
diff --git a/packages/react-native-executorch/src/types/textEmbeddings.ts b/packages/react-native-executorch/src/types/textEmbeddings.ts
index 47e056794f..c013cb818b 100644
--- a/packages/react-native-executorch/src/types/textEmbeddings.ts
+++ b/packages/react-native-executorch/src/types/textEmbeddings.ts
@@ -17,17 +17,16 @@ export type TextEmbeddingsModelName =
   | 'lfm2-5-colbert-350m';
 
 /**
- * Raw text embedding output: a [numTokens, embeddingDim] fp32 matrix (row-
- * major) plus the input token ids. Single-vector (pooled) models give
- * numTokens === 1 — use `toVector` for that common case. Multi-vector (late-
- * interaction, e.g. ColBERT) models give the full per-token sequence; scoring
- * (e.g. MaxSim) is the consumer's concern.
+ * Per-token (multi-vector) embedding output for late-interaction models (e.g.
+ * ColBERT): a [numTokens, embeddingDim] fp32 matrix (row-major) plus the input
+ * token ids. Standard models return a single pooled `Float32Array` from
+ * `forward` instead; only `multiVector` models yield this.
  * @category Types
  */
 export interface EmbeddingResult {
   /** Flat [numTokens * embeddingDim] fp32 vectors (row-major). */
   vectors: Float32Array;
-  /** Number of token rows (1 for pooled models). */
+  /** Number of token rows. */
   numTokens: number;
   /** Per-token vector dimension. */
   embeddingDim: number;
@@ -54,44 +53,52 @@ export interface EmbeddingPrompts {
   document: string;
 }
 
-/** A standard (symmetric) embedding model — `forward(text)`, no role. */
+/**
+ * A text embeddings model config. Two optional flags drive `forward`:
+ * - `prompts` present  -> `forward` REQUIRES a `role` (auto-prepends the prompt)
+ * - `multiVector` true -> `forward` returns the per-token `EmbeddingResult`;
+ *                         otherwise it returns a single pooled `Float32Array`.
+ * @category Types
+ */
 export interface TextEmbeddingsModel {
   modelName: TextEmbeddingsModelName;
   modelSource: ResourceSource;
   tokenizerSource: ResourceSource;
-  prompts?: undefined;
+  prompts?: EmbeddingPrompts;
+  multiVector?: boolean;
 }
 
 /**
- * An asymmetric model with query/document prompts — `forward(text, role)` with
- * role REQUIRED.
+ * Resolved value of `forward`'s promise, computed from the model config:
+ * `EmbeddingResult` when `multiVector` is the literal `true`, otherwise a
+ * pooled `Float32Array`. The `role` argument is typed by `ForwardFn`.
  */
-export interface PromptedTextEmbeddingsModel {
-  modelName: TextEmbeddingsModelName;
-  modelSource: ResourceSource;
-  tokenizerSource: ResourceSource;
-  prompts: EmbeddingPrompts;
-}
-
-export type AnyTextEmbeddingsModel =
-  | TextEmbeddingsModel
-  | PromptedTextEmbeddingsModel;
+export type ForwardReturn<M extends TextEmbeddingsModel> =
+  M extends { multiVector: true } ? EmbeddingResult : Float32Array;
 
 /**
- * `forward`'s signature, discriminated by the model: prompted models require a
- * `role` argument; standard models take none.
+ * `forward`'s signature, computed from the model config:
+ * - A model that DEFINITELY has prompts -> `role` is REQUIRED.
+ * - A model that definitely has NO prompts (`prompts?: undefined`) -> no role.
+ * - Otherwise (prompts optional / unknown, e.g. a heterogeneous model list) ->
+ *   `role` is OPTIONAL.
  */
-export type ForwardFn<M extends AnyTextEmbeddingsModel> =
-  M extends PromptedTextEmbeddingsModel
-    ? (input: string, role: EmbeddingRole) => Promise<EmbeddingResult>
-    : (input: string) => Promise<EmbeddingResult>;
+export type ForwardFn<M extends TextEmbeddingsModel> = M extends {
+  prompts: EmbeddingPrompts;
+}
+  ? (input: string, role: EmbeddingRole) => Promise<ForwardReturn<M>>
+  : undefined extends M['prompts']
+    ? M['prompts'] extends undefined
+      ? (input: string) => Promise<ForwardReturn<M>>
+      : (input: string, role?: EmbeddingRole) => Promise<ForwardReturn<M>>
+    : (input: string) => Promise<ForwardReturn<M>>;
 
 /**
  * Props for the useTextEmbeddings hook.
  * @category Types
  */
 export interface TextEmbeddingsProps<
-  M extends AnyTextEmbeddingsModel = AnyTextEmbeddingsModel,
+  M extends TextEmbeddingsModel = TextEmbeddingsModel,
 > {
   model: M;
   preventLoad?: boolean;
@@ -102,7 +109,7 @@ export interface TextEmbeddingsProps<
  * @category Types
  */
 export interface TextEmbeddingsType<
-  M extends AnyTextEmbeddingsModel = AnyTextEmbeddingsModel,
+  M extends TextEmbeddingsModel = TextEmbeddingsModel,
 > {
   error: null | RnExecutorchError;
   isReady: boolean;
@@ -110,10 +117,9 @@ export interface TextEmbeddingsType<
   downloadProgress: number;
 
   /**
-   * Embed text into a [numTokens, embeddingDim] result. Pooled models return
-   * numTokens === 1 (use `toVector`); multi-vector models return the full
-   * per-token sequence. Models with prompts require a `role`
-   * ('query' | 'document'); standard models take none.
+   * Embed text. Standard models return a single pooled `Float32Array`;
+   * `multiVector` models return the per-token `EmbeddingResult`. Models with
+   * `prompts` require a `role` ('query' | 'document').
    */
   forward: ForwardFn<M>;
 }

From e12fb039e08a018784d9341dc6d4ac37b88bdd3d Mon Sep 17 00:00:00 2001
From: Norbert Klockiewicz <Nklockiewicz12@gmail.com>
Date: Mon, 22 Jun 2026 15:07:12 +0200
Subject: [PATCH 4/7] refactor: move skiplist to model config, MaxSim scoring
 to app

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 apps/text-embeddings/app/colbert/index.tsx    | 15 +++----
 apps/text-embeddings/utils/math.ts            | 34 +++++++++++++++
 .../src/constants/modelRegistry.ts            | 10 +++++
 .../src/types/textEmbeddings.ts               |  7 ++++
 .../src/utils/textEmbeddings.ts               | 42 -------------------
 5 files changed, 56 insertions(+), 52 deletions(-)

diff --git a/apps/text-embeddings/app/colbert/index.tsx b/apps/text-embeddings/app/colbert/index.tsx
index d686168f43..5136aad9f1 100644
--- a/apps/text-embeddings/app/colbert/index.tsx
+++ b/apps/text-embeddings/app/colbert/index.tsx
@@ -15,23 +15,18 @@ import { useIsFocused } from 'expo-router';
 import {
   models,
   useTextEmbeddings,
-  maxSim,
   EmbeddingResult,
 } from 'react-native-executorch';
 import ColorPalette from '../../colors';
 import ErrorBanner from '../../components/ErrorBanner';
+import { maxSim } from '../../utils/math';
 
 const colbertModel = models.text_embedding.lfm2_5_colbert_350m();
 
-// The library auto-applies the model's [Q]/[D] prompts via forward(text, role).
-// Late-interaction MaxSim is a shipped util; the document skiplist (punctuation
-// token ids excluded from scoring) is the consumer's choice — these are the
-// LFM2.5-ColBERT skiplist ids.
-const SKIPLIST = [
-  510, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524,
-  535, 536, 537, 538, 539, 540, 541, 568, 569, 570, 571, 572, 573, 600, 601,
-  602, 603,
-];
+// The library auto-applies the model's [Q]/[D] prompts via forward(text, role)
+// and ships the document skiplist on the model config; we just pass it to the
+// shipped MaxSim util.
+const SKIPLIST = colbertModel.skiplistIds ?? [];
 
 const CORPUS: string[] = [
   'The forecast says heavy showers this afternoon.',
diff --git a/apps/text-embeddings/utils/math.ts b/apps/text-embeddings/utils/math.ts
index 50c70d1f92..997d3f46fb 100644
--- a/apps/text-embeddings/utils/math.ts
+++ b/apps/text-embeddings/utils/math.ts
@@ -1,6 +1,7 @@
 import {
   RnExecutorchError,
   RnExecutorchErrorCode,
+  EmbeddingResult,
 } from 'react-native-executorch';
 
 export const dotProduct = (a: Float32Array, b: Float32Array) => {
@@ -17,3 +18,36 @@ export const dotProduct = (a: Float32Array, b: Float32Array) => {
   }
   return sum;
 };
+
+/**
+ * ColBERT late-interaction score between a query and a document encoding:
+ *   score = Σ_q max_d ( q · d )
+ * For each query token, the max dot over non-skiplist doc tokens, summed.
+ * Per-token vectors are L2-normalized by the graph, so dot == cosine. Scoring
+ * is the consumer's concern (the library just yields the per-token vectors),
+ * so this lives in the app alongside dotProduct.
+ */
+export const maxSim = (
+  query: EmbeddingResult,
+  doc: EmbeddingResult,
+  skiplistIds: number[] = []
+) => {
+  const dim = query.embeddingDim;
+  const skip = new Set(skiplistIds);
+  let score = 0;
+  for (let qi = 0; qi < query.numTokens; qi++) {
+    const qOff = qi * dim;
+    let best = -Infinity;
+    for (let di = 0; di < doc.numTokens; di++) {
+      if (skip.has(doc.tokenIds[di]!)) continue;
+      const dOff = di * dim;
+      let dot = 0;
+      for (let k = 0; k < dim; k++) {
+        dot += (query.vectors[qOff + k] ?? 0) * (doc.vectors[dOff + k] ?? 0);
+      }
+      if (dot > best) best = dot;
+    }
+    if (best !== -Infinity) score += best;
+  }
+  return score;
+};
diff --git a/packages/react-native-executorch/src/constants/modelRegistry.ts b/packages/react-native-executorch/src/constants/modelRegistry.ts
index c2e3a2a21d..f57c178b5e 100644
--- a/packages/react-native-executorch/src/constants/modelRegistry.ts
+++ b/packages/react-native-executorch/src/constants/modelRegistry.ts
@@ -287,6 +287,14 @@ const LFM2_5_EMBEDDING_350M_VARIANTS = {
 // LFM2.5-ColBERT is a plain text-embedding model from the library's POV: it
 // returns per-token vectors. Late-interaction scoring (MaxSim / skiplist) is
 // the consumer's concern; the library only auto-applies the role prompts.
+// Document punctuation token ids excluded from MaxSim (ColBERT skiplist),
+// derived from the model's config_sentence_transformers.json skiplist_words.
+const LFM_COLBERT_SKIPLIST = [
+  510, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524,
+  535, 536, 537, 538, 539, 540, 541, 568, 569, 570, 571, 572, 573, 600, 601,
+  602, 603,
+];
+
 const LFM2_5_COLBERT_350M_VARIANTS = {
   mlx: {
     base: {
@@ -295,6 +303,7 @@ const LFM2_5_COLBERT_350M_VARIANTS = {
       tokenizerSource: M.LFM2_5_COLBERT_350M_TOKENIZER,
       prompts: LFM_COLBERT_PROMPTS,
       multiVector: true as const,
+      skiplistIds: LFM_COLBERT_SKIPLIST,
     },
   },
   xnnpack: {
@@ -304,6 +313,7 @@ const LFM2_5_COLBERT_350M_VARIANTS = {
       tokenizerSource: M.LFM2_5_COLBERT_350M_TOKENIZER,
       prompts: LFM_COLBERT_PROMPTS,
       multiVector: true as const,
+      skiplistIds: LFM_COLBERT_SKIPLIST,
     },
   },
 };
diff --git a/packages/react-native-executorch/src/types/textEmbeddings.ts b/packages/react-native-executorch/src/types/textEmbeddings.ts
index c013cb818b..2f42d71e9d 100644
--- a/packages/react-native-executorch/src/types/textEmbeddings.ts
+++ b/packages/react-native-executorch/src/types/textEmbeddings.ts
@@ -66,6 +66,13 @@ export interface TextEmbeddingsModel {
   tokenizerSource: ResourceSource;
   prompts?: EmbeddingPrompts;
   multiVector?: boolean;
+  /**
+   * Document token ids to exclude from late-interaction scoring (e.g. ColBERT's
+   * punctuation skiplist). Derived from the model's training config, so it's
+   * shipped here rather than reconstructed by the consumer, who passes it to
+   * their own MaxSim scoring.
+   */
+  skiplistIds?: number[];
 }
 
 /**
diff --git a/packages/react-native-executorch/src/utils/textEmbeddings.ts b/packages/react-native-executorch/src/utils/textEmbeddings.ts
index da10d9aa08..e9be7cf774 100644
--- a/packages/react-native-executorch/src/utils/textEmbeddings.ts
+++ b/packages/react-native-executorch/src/utils/textEmbeddings.ts
@@ -35,45 +35,3 @@ export function getTokenVectors(result: EmbeddingResult): Float32Array[] {
   }
   return rows;
 }
-
-/**
- * Late-interaction MaxSim score between a query and a document encoding:
- *
- *   score = Σ_q  max_d ( q · d )
- *
- * For each query token, takes the max dot product over all (non-skiplist)
- * document tokens, then sums across query tokens. Per-token vectors are
- * L2-normalized by the graph, so a dot product is a cosine.
- *
- * `skiplistIds` (e.g. punctuation token ids) are excluded from the document
- * side, matching ColBERT's document skiplist. Pass `[]` to score every token.
- *
- * @category Utils
- */
-export function maxSim(
-  query: EmbeddingResult,
-  doc: EmbeddingResult,
-  skiplistIds: number[] = []
-): number {
-  const dim = query.embeddingDim;
-  const q = query.vectors;
-  const d = doc.vectors;
-  const skip = new Set(skiplistIds);
-
-  let score = 0;
-  for (let qi = 0; qi < query.numTokens; qi++) {
-    const qOff = qi * dim;
-    let best = -Infinity;
-    for (let di = 0; di < doc.numTokens; di++) {
-      if (skip.has(doc.tokenIds[di]!)) continue;
-      const dOff = di * dim;
-      let dot = 0;
-      for (let k = 0; k < dim; k++) {
-        dot += (q[qOff + k] ?? 0) * (d[dOff + k] ?? 0);
-      }
-      if (dot > best) best = dot;
-    }
-    if (best !== -Infinity) score += best;
-  }
-  return score;
-}

From d551b5f7adc5ed27b9e4bb06489c54352c0cd11f Mon Sep 17 00:00:00 2001
From: Norbert Klockiewicz <Nklockiewicz12@gmail.com>
Date: Mon, 22 Jun 2026 16:33:01 +0200
Subject: [PATCH 5/7] refactor(example): merge ColBERT search into text
 embeddings screen

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 apps/text-embeddings/app/_layout.tsx          |   8 -
 apps/text-embeddings/app/colbert/index.tsx    | 284 ------------------
 .../app/text-embeddings/index.tsx             |  43 ++-
 3 files changed, 33 insertions(+), 302 deletions(-)
 delete mode 100644 apps/text-embeddings/app/colbert/index.tsx

diff --git a/apps/text-embeddings/app/_layout.tsx b/apps/text-embeddings/app/_layout.tsx
index 57acb26eb2..bb8e1deeb8 100644
--- a/apps/text-embeddings/app/_layout.tsx
+++ b/apps/text-embeddings/app/_layout.tsx
@@ -109,14 +109,6 @@ export default function _layout() {
             headerTitleStyle: { color: ColorPalette.primary },
           }}
         />
-        <Drawer.Screen
-          name="colbert/index"
-          options={{
-            drawerLabel: 'ColBERT search',
-            title: 'ColBERT search',
-            headerTitleStyle: { color: ColorPalette.primary },
-          }}
-        />
       </Drawer>
     </GeneratingContext>
   );
diff --git a/apps/text-embeddings/app/colbert/index.tsx b/apps/text-embeddings/app/colbert/index.tsx
deleted file mode 100644
index 5136aad9f1..0000000000
--- a/apps/text-embeddings/app/colbert/index.tsx
+++ /dev/null
@@ -1,284 +0,0 @@
-import { useEffect, useState } from 'react';
-import {
-  StyleSheet,
-  Text,
-  TextInput,
-  TouchableOpacity,
-  View,
-  SafeAreaView,
-  ScrollView,
-  KeyboardAvoidingView,
-  Platform,
-} from 'react-native';
-import { Ionicons } from '@expo/vector-icons';
-import { useIsFocused } from 'expo-router';
-import {
-  models,
-  useTextEmbeddings,
-  EmbeddingResult,
-} from 'react-native-executorch';
-import ColorPalette from '../../colors';
-import ErrorBanner from '../../components/ErrorBanner';
-import { maxSim } from '../../utils/math';
-
-const colbertModel = models.text_embedding.lfm2_5_colbert_350m();
-
-// The library auto-applies the model's [Q]/[D] prompts via forward(text, role)
-// and ships the document skiplist on the model config; we just pass it to the
-// shipped MaxSim util.
-const SKIPLIST = colbertModel.skiplistIds ?? [];
-
-const CORPUS: string[] = [
-  'The forecast says heavy showers this afternoon.',
-  "It's so sunny outside today!",
-  'The home team scored in the final minute to win the match.',
-  'Fans packed the stadium for the championship game.',
-  'Simmer the tomatoes with garlic before adding the pasta.',
-  'He whisked the eggs and folded in the melted chocolate.',
-  'The new phone has a faster chip and a brighter screen.',
-  'The flight to Tokyo was delayed by three hours.',
-  'We hiked along the coast and camped near the cliffs.',
-];
-
-const EXAMPLE_QUERIES: string[] = [
-  "What's the weather like?",
-  'Who won the match?',
-  'How do I cook dinner?',
-  'Tell me about the latest technology',
-];
-
-type Ranked = { sentence: string; score: number };
-
-export default function ColbertScreenWrapper() {
-  return useIsFocused() ? <ColbertScreen /> : null;
-}
-
-function ColbertScreen() {
-  const model = useTextEmbeddings({ model: colbertModel });
-  const [error, setError] = useState<string | null>(null);
-  const [query, setQuery] = useState('');
-  const [docEncs, setDocEncs] = useState<
-    { sentence: string; enc: EmbeddingResult }[]
-  >([]);
-  const [results, setResults] = useState<Ranked[]>([]);
-  const [indexing, setIndexing] = useState(false);
-  const [encodeTime, setEncodeTime] = useState<number | null>(null);
-
-  useEffect(
-    () => {
-      let cancelled = false;
-      const indexCorpus = async () => {
-        if (!model.isReady) return;
-        setIndexing(true);
-        setResults([]);
-        try {
-          const encs = [];
-          for (const sentence of CORPUS) {
-            const enc = await model.forward(sentence, 'document');
-            if (cancelled) return;
-            encs.push({ sentence, enc });
-          }
-          setDocEncs(encs);
-        } catch (e) {
-          setError(e instanceof Error ? e.message : String(e));
-        } finally {
-          if (!cancelled) setIndexing(false);
-        }
-      };
-      indexCorpus();
-      return () => {
-        cancelled = true;
-      };
-    },
-    // eslint-disable-next-line react-hooks/exhaustive-deps
-    [model.isReady]
-  );
-
-  const runSearch = async (queryText: string = query) => {
-    const q = queryText.trim();
-    if (!model.isReady || !q || docEncs.length === 0) return;
-    setQuery(queryText);
-    try {
-      const start = Date.now();
-      const qEnc = await model.forward(q, 'query');
-      setEncodeTime(Date.now() - start);
-      const ranked = docEncs
-        .map(({ sentence, enc }) => ({
-          sentence,
-          score: maxSim(qEnc, enc, SKIPLIST),
-        }))
-        .sort((a, b) => b.score - a.score);
-      setResults(ranked);
-    } catch (e) {
-      setError(e instanceof Error ? e.message : String(e));
-    }
-  };
-
-  const ready = model.isReady && !indexing && docEncs.length > 0;
-  const canSearch = ready && !!query.trim();
-
-  const statusText = model.error
-    ? `Error: ${model.error}`
-    : !model.isReady
-      ? `Loading model ${(model.downloadProgress * 100).toFixed(0)}%`
-      : indexing
-        ? 'Indexing corpus…'
-        : 'Ready';
-
-  return (
-    <SafeAreaView style={styles.container}>
-      <KeyboardAvoidingView
-        style={styles.flex}
-        behavior={Platform.OS === 'ios' ? 'padding' : undefined}
-      >
-        <ScrollView contentContainerStyle={styles.scroll}>
-          <Text style={styles.heading}>ColBERT Late-Interaction Search</Text>
-          <Text style={styles.status}>{statusText}</Text>
-          <ErrorBanner message={error} onDismiss={() => setError(null)} />
-
-          <View style={styles.card}>
-            <Text style={styles.sectionTitle}>
-              Search the corpus ({CORPUS.length} sentences)
-            </Text>
-            <Text style={styles.hint}>
-              Per-token vectors scored with MaxSim. Tap an example or type a
-              query.
-            </Text>
-            <View style={styles.chipRow}>
-              {EXAMPLE_QUERIES.map((q) => (
-                <TouchableOpacity
-                  key={q}
-                  style={[styles.chip, !ready && styles.chipDisabled]}
-                  disabled={!ready}
-                  onPress={() => runSearch(q)}
-                >
-                  <Text style={styles.chipText}>{q}</Text>
-                </TouchableOpacity>
-              ))}
-            </View>
-            <TextInput
-              placeholder="Type a search query..."
-              placeholderTextColor="#94A3B8"
-              style={styles.input}
-              value={query}
-              onChangeText={setQuery}
-              onSubmitEditing={() => runSearch()}
-              returnKeyType="search"
-            />
-            <TouchableOpacity
-              onPress={() => runSearch()}
-              style={[styles.button, !canSearch && styles.buttonDisabled]}
-              disabled={!canSearch}
-            >
-              <Ionicons
-                name="search"
-                size={16}
-                color={!canSearch ? 'gray' : 'white'}
-              />
-              <Text style={[styles.buttonText, !canSearch && styles.buttonTextDisabled]}>
-                {indexing ? 'Indexing…' : 'Search'}
-              </Text>
-            </TouchableOpacity>
-            {encodeTime !== null && (
-              <Text style={styles.stats}>Query encoded in {encodeTime} ms</Text>
-            )}
-          </View>
-
-          {results.length > 0 && (
-            <View style={styles.card}>
-              <Text style={styles.sectionTitle}>Results</Text>
-              {results.map((r, i) => (
-                <View key={i} style={styles.resultRow}>
-                  <View style={styles.resultHeader}>
-                    <Text style={styles.resultText}>{r.sentence}</Text>
-                    <Text style={styles.resultScore}>{r.score.toFixed(2)}</Text>
-                  </View>
-                  <View style={styles.barTrack}>
-                    <View
-                      style={[
-                        styles.barFill,
-                        {
-                          width: `${Math.round(
-                            (results[0].score > 0 ? r.score / results[0].score : 0) * 100
-                          )}%`,
-                        },
-                        i === 0 && styles.barFillTop,
-                      ]}
-                    />
-                  </View>
-                </View>
-              ))}
-            </View>
-          )}
-        </ScrollView>
-      </KeyboardAvoidingView>
-    </SafeAreaView>
-  );
-}
-
-const styles = StyleSheet.create({
-  container: { flex: 1, backgroundColor: '#F8FAFC' },
-  flex: { flex: 1 },
-  scroll: { padding: 20 },
-  heading: { fontSize: 22, fontWeight: '500', marginBottom: 8, color: '#0F172A' },
-  status: { fontSize: 14, color: '#64748B', marginBottom: 12 },
-  card: {
-    backgroundColor: '#fff',
-    padding: 16,
-    borderRadius: 16,
-    borderColor: '#E2E8F0',
-    borderWidth: 2,
-    marginBottom: 20,
-  },
-  sectionTitle: { fontSize: 16, fontWeight: '500', marginBottom: 8, color: '#1E293B' },
-  hint: { fontSize: 13, color: '#64748B', marginBottom: 12, lineHeight: 18 },
-  chipRow: { flexDirection: 'row', flexWrap: 'wrap', gap: 8, marginBottom: 12 },
-  chip: {
-    backgroundColor: '#EEF2FF',
-    borderColor: '#C7D2FE',
-    borderWidth: 1,
-    borderRadius: 16,
-    paddingHorizontal: 12,
-    paddingVertical: 6,
-  },
-  chipDisabled: { opacity: 0.4 },
-  chipText: { fontSize: 13, color: 'navy' },
-  input: {
-    backgroundColor: '#F1F5F9',
-    borderRadius: 10,
-    padding: 10,
-    marginBottom: 10,
-    fontSize: 16,
-    color: '#0F172A',
-    minHeight: 40,
-  },
-  button: {
-    backgroundColor: 'navy',
-    borderRadius: 10,
-    paddingVertical: 12,
-    flexDirection: 'row',
-    alignItems: 'center',
-    justifyContent: 'center',
-  },
-  buttonDisabled: { backgroundColor: '#f0f0f0' },
-  buttonText: { color: '#fff', fontWeight: '500', marginLeft: 6 },
-  buttonTextDisabled: { color: 'gray' },
-  stats: { fontSize: 13, color: '#64748B', marginTop: 8, textAlign: 'center' },
-  resultRow: { marginBottom: 14 },
-  resultHeader: {
-    flexDirection: 'row',
-    justifyContent: 'space-between',
-    marginBottom: 6,
-    gap: 8,
-  },
-  resultText: { flex: 1, fontSize: 14, color: '#334155' },
-  resultScore: {
-    fontSize: 14,
-    fontWeight: '600',
-    color: '#0F172A',
-    fontVariant: ['tabular-nums'],
-  },
-  barTrack: { height: 8, borderRadius: 4, backgroundColor: '#E2E8F0', overflow: 'hidden' },
-  barFill: { height: '100%', borderRadius: 4, backgroundColor: '#94A3B8' },
-  barFillTop: { backgroundColor: 'navy' },
-});
diff --git a/apps/text-embeddings/app/text-embeddings/index.tsx b/apps/text-embeddings/app/text-embeddings/index.tsx
index 8cb6777843..c2e3d14e29 100644
--- a/apps/text-embeddings/app/text-embeddings/index.tsx
+++ b/apps/text-embeddings/app/text-embeddings/index.tsx
@@ -16,12 +16,15 @@ import {
   models,
   useTextEmbeddings,
   TextEmbeddingsProps,
+  EmbeddingResult,
 } from 'react-native-executorch';
 const textEmbedding = models.text_embedding;
 
-// Single-vector (pooled) models: forward() returns a Float32Array directly.
-// The multi-vector ColBERT model has its own screen.
+// forward() returns a Float32Array for pooled (single-vector) models and an
+// EmbeddingResult for multi-vector (late-interaction) models. We store the raw
+// return for the whole corpus and pick the scorer per model below.
 type TextEmbeddingModel = TextEmbeddingsProps['model'];
+type Encoding = Float32Array | EmbeddingResult;
 
 const MODELS: { label: string; value: TextEmbeddingModel }[] = [
   { label: 'MiniLM L6', value: textEmbedding.all_minilm_l6_v2() },
@@ -53,6 +56,10 @@ const MODELS: { label: string; value: TextEmbeddingModel }[] = [
     label: 'LFM2.5 Embedding MLX',
     value: textEmbedding.lfm2_5_embedding_350m({ backend: 'mlx' }),
   },
+  {
+    label: 'LFM2.5 ColBERT (late-interaction)',
+    value: textEmbedding.lfm2_5_colbert_350m(),
+  },
 ];
 
 // A multi-topic corpus so semantic ranking is visible: a weather query should
@@ -83,7 +90,7 @@ const EXAMPLE_QUERIES: string[] = [
   'Where did they travel?',
 ];
 import { useIsFocused } from 'expo-router';
-import { dotProduct } from '../../utils/math';
+import { dotProduct, maxSim } from '../../utils/math';
 import ErrorBanner from '../../components/ErrorBanner';
 
 export default function TextEmbeddingsScreenWrapper() {
@@ -101,9 +108,15 @@ function TextEmbeddingsScreen() {
   const model = useTextEmbeddings({ model: selectedModel });
   const [error, setError] = useState<string | null>(null);
 
+  // ColBERT-style models score per-token vectors with MaxSim and exclude
+  // punctuation tokens; pooled models score the single vector with a dot
+  // product. Both are driven off the selected model's config.
+  const isMultiVector = !!selectedModel.multiVector;
+  const skiplistIds = selectedModel.skiplistIds ?? [];
+
   const [query, setQuery] = useState('');
   const [corpusEmbeddings, setCorpusEmbeddings] = useState<
-    { sentence: string; embedding: Float32Array }[]
+    { sentence: string; embedding: Encoding }[]
   >([]);
   const [results, setResults] = useState<RankedResult[]>([]);
   const [embeddingTime, setEmbeddingTime] = useState<number | null>(null);
@@ -122,8 +135,8 @@ function TextEmbeddingsScreen() {
           const embedded = [];
           for (const sentence of CORPUS) {
             // forward(_, 'document') auto-applies the model's document prompt
-            // (a no-op for models without one). Single-vector models return
-            // a Float32Array directly.
+            // (a no-op for models without one). Pooled models return a
+            // Float32Array, multi-vector models an EmbeddingResult.
             const embedding = await model.forward(sentence, 'document');
             if (cancelled) return;
             embedded.push({ sentence, embedding });
@@ -155,12 +168,21 @@ function TextEmbeddingsScreen() {
     setQuery(queryText);
     try {
       const start = Date.now();
-      const queryEmbedding = await model.forward(q, 'query');
+      const queryEmbedding = (await model.forward(q, 'query')) as Encoding;
       setEmbeddingTime(Date.now() - start);
       const ranked = corpusEmbeddings
         .map(({ sentence, embedding }) => ({
           sentence,
-          similarity: dotProduct(queryEmbedding, embedding),
+          similarity: isMultiVector
+            ? maxSim(
+                queryEmbedding as EmbeddingResult,
+                embedding as EmbeddingResult,
+                skiplistIds
+              )
+            : dotProduct(
+                queryEmbedding as Float32Array,
+                embedding as Float32Array
+              ),
         }))
         .sort((a, b) => b.similarity - a.similarity);
       setResults(ranked);
@@ -210,8 +232,9 @@ function TextEmbeddingsScreen() {
               Search the corpus ({CORPUS.length} sentences)
             </Text>
             <Text style={styles.hint}>
-              Ranks every sentence by meaning. Ask a full question — tap an
-              example or type your own.
+              {isMultiVector
+                ? 'Ranks per-token vectors with MaxSim (late interaction). Ask a full question — tap an example or type your own.'
+                : 'Ranks every sentence by meaning. Ask a full question — tap an example or type your own.'}
             </Text>
             <View style={styles.chipRow}>
               {EXAMPLE_QUERIES.map((q) => (

From b91153064ca041e99295b9b0d6511979e8716ba3 Mon Sep 17 00:00:00 2001
From: Norbert Klockiewicz <Nklockiewicz12@gmail.com>
Date: Mon, 22 Jun 2026 17:19:07 +0200
Subject: [PATCH 6/7] refactor: drop empty BaseEmbeddings layer, rename
 skipList, trim comments

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../app/text-embeddings/index.tsx             | 41 ++++---------------
 apps/text-embeddings/utils/math.ts            | 12 +-----
 .../common/rnexecutorch/TokenizerModule.h     |  6 ---
 .../host_objects/JsiConversions.h             | 13 ++----
 .../models/embeddings/BaseEmbeddings.cpp      |  9 ----
 .../models/embeddings/BaseEmbeddings.h        | 13 ------
 .../models/embeddings/text/TextEmbeddings.cpp |  2 +-
 .../models/embeddings/text/TextEmbeddings.h   |  4 +-
 .../models/text_to_image/Encoder.cpp          |  2 -
 .../common/rnexecutorch/tests/CMakeLists.txt  |  3 --
 .../src/constants/modelRegistry.ts            | 14 ++-----
 .../src/constants/modelUrls.ts                | 11 +----
 .../src/types/textEmbeddings.ts               | 11 +++--
 13 files changed, 29 insertions(+), 112 deletions(-)
 delete mode 100644 packages/react-native-executorch/common/rnexecutorch/models/embeddings/BaseEmbeddings.cpp
 delete mode 100644 packages/react-native-executorch/common/rnexecutorch/models/embeddings/BaseEmbeddings.h

diff --git a/apps/text-embeddings/app/text-embeddings/index.tsx b/apps/text-embeddings/app/text-embeddings/index.tsx
index c2e3d14e29..2c62a22922 100644
--- a/apps/text-embeddings/app/text-embeddings/index.tsx
+++ b/apps/text-embeddings/app/text-embeddings/index.tsx
@@ -5,7 +5,6 @@ import {
   TextInput,
   TouchableOpacity,
   View,
-  SafeAreaView,
   ScrollView,
   KeyboardAvoidingView,
   Platform,
@@ -18,11 +17,13 @@ import {
   TextEmbeddingsProps,
   EmbeddingResult,
 } from 'react-native-executorch';
+import { useIsFocused } from 'expo-router';
+import { dotProduct, maxSim } from '../../utils/math';
+import ErrorBanner from '../../components/ErrorBanner';
+import { SafeAreaView } from 'react-native-safe-area-context';
+
 const textEmbedding = models.text_embedding;
 
-// forward() returns a Float32Array for pooled (single-vector) models and an
-// EmbeddingResult for multi-vector (late-interaction) models. We store the raw
-// return for the whole corpus and pick the scorer per model below.
 type TextEmbeddingModel = TextEmbeddingsProps['model'];
 type Encoding = Float32Array | EmbeddingResult;
 
@@ -62,9 +63,6 @@ const MODELS: { label: string; value: TextEmbeddingModel }[] = [
   },
 ];
 
-// A multi-topic corpus so semantic ranking is visible: a weather query should
-// float the weather lines to the top and push sports/cooking/tech down, even
-// with no shared keywords.
 const CORPUS: string[] = [
   'The forecast says heavy showers this afternoon.',
   "It's so sunny outside today!",
@@ -80,8 +78,6 @@ const CORPUS: string[] = [
   'We hiked along the coast and camped near the cliffs.',
 ];
 
-// Tap-to-run example queries. Natural-language questions — how these models
-// are trained to be queried — give the cleanest separation.
 const EXAMPLE_QUERIES: string[] = [
   "What's the weather like?",
   'Who won the match?',
@@ -89,9 +85,6 @@ const EXAMPLE_QUERIES: string[] = [
   'How do I cook dinner?',
   'Where did they travel?',
 ];
-import { useIsFocused } from 'expo-router';
-import { dotProduct, maxSim } from '../../utils/math';
-import ErrorBanner from '../../components/ErrorBanner';
 
 export default function TextEmbeddingsScreenWrapper() {
   const isFocused = useIsFocused();
@@ -108,11 +101,8 @@ function TextEmbeddingsScreen() {
   const model = useTextEmbeddings({ model: selectedModel });
   const [error, setError] = useState<string | null>(null);
 
-  // ColBERT-style models score per-token vectors with MaxSim and exclude
-  // punctuation tokens; pooled models score the single vector with a dot
-  // product. Both are driven off the selected model's config.
   const isMultiVector = !!selectedModel.multiVector;
-  const skiplistIds = selectedModel.skiplistIds ?? [];
+  const skipListIds = selectedModel.skipListIds ?? [];
 
   const [query, setQuery] = useState('');
   const [corpusEmbeddings, setCorpusEmbeddings] = useState<
@@ -122,8 +112,6 @@ function TextEmbeddingsScreen() {
   const [embeddingTime, setEmbeddingTime] = useState<number | null>(null);
   const [indexing, setIndexing] = useState(false);
 
-  // Embed the whole corpus once the model is ready (re-runs on model change so
-  // prefixes / weights match the active model).
   useEffect(
     () => {
       let cancelled = false;
@@ -134,17 +122,11 @@ function TextEmbeddingsScreen() {
         try {
           const embedded = [];
           for (const sentence of CORPUS) {
-            // forward(_, 'document') auto-applies the model's document prompt
-            // (a no-op for models without one). Pooled models return a
-            // Float32Array, multi-vector models an EmbeddingResult.
             const embedding = await model.forward(sentence, 'document');
             if (cancelled) return;
             embedded.push({ sentence, embedding });
           }
           setCorpusEmbeddings(embedded);
-        } catch {
-          // A transient "Model not loaded" can fire while the hook swaps
-          // models; the effect re-runs once the new model is ready.
         } finally {
           if (!cancelled) setIndexing(false);
         }
@@ -154,10 +136,7 @@ function TextEmbeddingsScreen() {
         cancelled = true;
       };
     },
-    // Re-index when the model becomes ready OR the selected model changes, so
-    // the corpus is embedded by the active model. The "Model not loaded" race
-    // is handled by the isReady gate plus clearing the corpus on switch;
-    // switching sets isReady false→true so the re-run sees the new model.
+
     // eslint-disable-next-line react-hooks/exhaustive-deps
     [model.isReady, selectedModel]
   );
@@ -177,7 +156,7 @@ function TextEmbeddingsScreen() {
             ? maxSim(
                 queryEmbedding as EmbeddingResult,
                 embedding as EmbeddingResult,
-                skiplistIds
+                skipListIds
               )
             : dotProduct(
                 queryEmbedding as Float32Array,
@@ -201,8 +180,6 @@ function TextEmbeddingsScreen() {
     return model.isGenerating ? 'Generating...' : 'Model is ready';
   };
 
-  // Chips/examples just need a ready, indexed model; the Search button also
-  // needs a non-empty typed query.
   const ready = model.isReady && !indexing && corpusEmbeddings.length > 0;
   const canSearch = ready && !!query.trim();
 
@@ -306,8 +283,6 @@ function TextEmbeddingsScreen() {
   );
 }
 
-// One ranked result with a similarity bar. The bar is scaled relative to the
-// top hit so the ranking is visually obvious; the raw cosine is shown too.
 function ResultRow({
   sentence,
   similarity,
diff --git a/apps/text-embeddings/utils/math.ts b/apps/text-embeddings/utils/math.ts
index 997d3f46fb..44248e1658 100644
--- a/apps/text-embeddings/utils/math.ts
+++ b/apps/text-embeddings/utils/math.ts
@@ -19,21 +19,13 @@ export const dotProduct = (a: Float32Array, b: Float32Array) => {
   return sum;
 };
 
-/**
- * ColBERT late-interaction score between a query and a document encoding:
- *   score = Σ_q max_d ( q · d )
- * For each query token, the max dot over non-skiplist doc tokens, summed.
- * Per-token vectors are L2-normalized by the graph, so dot == cosine. Scoring
- * is the consumer's concern (the library just yields the per-token vectors),
- * so this lives in the app alongside dotProduct.
- */
 export const maxSim = (
   query: EmbeddingResult,
   doc: EmbeddingResult,
-  skiplistIds: number[] = []
+  skipListIds: number[] = []
 ) => {
   const dim = query.embeddingDim;
-  const skip = new Set(skiplistIds);
+  const skip = new Set(skipListIds);
   let score = 0;
   for (let qi = 0; qi < query.numTokens; qi++) {
     const qOff = qi * dim;
diff --git a/packages/react-native-executorch/common/rnexecutorch/TokenizerModule.h b/packages/react-native-executorch/common/rnexecutorch/TokenizerModule.h
index 09877dfc65..0e1356f121 100644
--- a/packages/react-native-executorch/common/rnexecutorch/TokenizerModule.h
+++ b/packages/react-native-executorch/common/rnexecutorch/TokenizerModule.h
@@ -13,10 +13,6 @@ class TokenizerModule {
                            std::shared_ptr<react::CallInvoker> callInvoker);
   [[nodiscard("Registered non-void function")]] std::vector<uint64_t>
   encode(std::string s) const;
-  // Like encode, but applies the tokenizer.json post_processor (e.g.
-  // TemplateProcessing that prepends BOS). Needed by models whose pooling
-  // depends on the BOS/CLS token (e.g. CLS-pooled text embeddings). Not JS-
-  // bound; encode() keeps its single-arg signature for the JS API.
   [[nodiscard("Registered non-void function")]] std::vector<uint64_t>
   encodeWithSpecialTokens(std::string s) const;
   [[nodiscard("Registered non-void function")]] std::string
@@ -30,8 +26,6 @@ class TokenizerModule {
   std::size_t getMemoryLowerBound() const noexcept;
 
 private:
-  // Shared encode implementation. bos/eos act as an add-special-tokens flag
-  // (not a literal count) when the tokenizer.json defines a post_processor.
   std::vector<uint64_t> encodeImpl(const std::string &s, int8_t bos,
                                    int8_t eos) const;
 
diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h
index 8e211f0028..fdc87cd9af 100644
--- a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h
+++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h
@@ -708,20 +708,15 @@ getJsiValue(const models::style_transfer::PixelDataResult &result,
   return obj;
 }
 
-// Text embedding output: a [numTokens, embeddingDim] fp32 matrix + input token
-// ids. Pooled models give numTokens == 1; multi-vector give the full sequence.
-// The TS layer reduces to a single vector or keeps the matrix per model config.
-inline jsi::Value
-getJsiValue(const models::embeddings::EmbeddingResult &result,
-            jsi::Runtime &runtime) {
+inline jsi::Value getJsiValue(const models::embeddings::EmbeddingResult &result,
+                              jsi::Runtime &runtime) {
   jsi::Object obj(runtime);
 
   auto arrayBuffer = jsi::ArrayBuffer(runtime, result.dataPtr);
   auto float32ArrayCtor =
       runtime.global().getPropertyAsFunction(runtime, "Float32Array");
-  auto float32Array =
-      float32ArrayCtor.callAsConstructor(runtime, arrayBuffer)
-          .getObject(runtime);
+  auto float32Array = float32ArrayCtor.callAsConstructor(runtime, arrayBuffer)
+                          .getObject(runtime);
   obj.setProperty(runtime, "dataPtr", float32Array);
 
   obj.setProperty(runtime, "numTokens", jsi::Value(result.numTokens));
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/embeddings/BaseEmbeddings.cpp b/packages/react-native-executorch/common/rnexecutorch/models/embeddings/BaseEmbeddings.cpp
deleted file mode 100644
index e777be6704..0000000000
--- a/packages/react-native-executorch/common/rnexecutorch/models/embeddings/BaseEmbeddings.cpp
+++ /dev/null
@@ -1,9 +0,0 @@
-#include "BaseEmbeddings.h"
-
-namespace rnexecutorch::models::embeddings {
-
-BaseEmbeddings::BaseEmbeddings(const std::string &modelSource,
-                               std::shared_ptr<react::CallInvoker> callInvoker)
-    : BaseModel(modelSource, callInvoker) {}
-
-} // namespace rnexecutorch::models::embeddings
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/embeddings/BaseEmbeddings.h b/packages/react-native-executorch/common/rnexecutorch/models/embeddings/BaseEmbeddings.h
deleted file mode 100644
index 4b37a3fe93..0000000000
--- a/packages/react-native-executorch/common/rnexecutorch/models/embeddings/BaseEmbeddings.h
+++ /dev/null
@@ -1,13 +0,0 @@
-#pragma once
-
-#include <rnexecutorch/models/BaseModel.h>
-
-namespace rnexecutorch::models::embeddings {
-
-class BaseEmbeddings : public BaseModel {
-public:
-  BaseEmbeddings(const std::string &modelSource,
-                 std::shared_ptr<react::CallInvoker> callInvoker);
-};
-
-}; // namespace rnexecutorch::models::embeddings
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/embeddings/text/TextEmbeddings.cpp b/packages/react-native-executorch/common/rnexecutorch/models/embeddings/text/TextEmbeddings.cpp
index 26f3157690..d80c4fb4fe 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/embeddings/text/TextEmbeddings.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/embeddings/text/TextEmbeddings.cpp
@@ -11,7 +11,7 @@ using namespace executorch::extension;
 TextEmbeddings::TextEmbeddings(const std::string &modelSource,
                                const std::string &tokenizerSource,
                                std::shared_ptr<react::CallInvoker> callInvoker)
-    : BaseEmbeddings(modelSource, callInvoker),
+    : BaseModel(modelSource, callInvoker),
       tokenizer(
           std::make_unique<TokenizerModule>(tokenizerSource, callInvoker)) {}
 
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/embeddings/text/TextEmbeddings.h b/packages/react-native-executorch/common/rnexecutorch/models/embeddings/text/TextEmbeddings.h
index cb6059b96e..da51e4d26e 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/embeddings/text/TextEmbeddings.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/embeddings/text/TextEmbeddings.h
@@ -3,7 +3,7 @@
 #include "rnexecutorch/metaprogramming/ConstructorHelpers.h"
 #include <mutex>
 #include <rnexecutorch/TokenizerModule.h>
-#include <rnexecutorch/models/embeddings/BaseEmbeddings.h>
+#include <rnexecutorch/models/BaseModel.h>
 #include <rnexecutorch/models/embeddings/Types.h>
 
 namespace rnexecutorch {
@@ -14,7 +14,7 @@ struct TokenIdsWithAttentionMask {
   std::vector<int64_t> attentionMask;
 };
 
-class TextEmbeddings final : public BaseEmbeddings {
+class TextEmbeddings final : public BaseModel {
 public:
   TextEmbeddings(const std::string &modelSource,
                  const std::string &tokenizerSource,
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_image/Encoder.cpp b/packages/react-native-executorch/common/rnexecutorch/models/text_to_image/Encoder.cpp
index 6abbccb9c6..3bf5fa2206 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/text_to_image/Encoder.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_image/Encoder.cpp
@@ -16,8 +16,6 @@ Encoder::Encoder(const std::string &tokenizerSource,
           encoderSource, tokenizerSource, callInvoker)) {}
 
 std::vector<float> Encoder::generate(std::string input) {
-  // TextEmbeddings returns the raw [numTokens, embeddingDim] matrix; this
-  // encoder pools/uses the flat fp32 buffer directly (dataPtr).
   std::shared_ptr<OwningArrayBuffer> embeddingsText =
       encoder->generate(input).dataPtr;
   std::shared_ptr<OwningArrayBuffer> embeddingsUncond =
diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt b/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt
index 5f9d7287a5..a901cd56fc 100644
--- a/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt
+++ b/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt
@@ -218,7 +218,6 @@ add_rn_test(ObjectDetectionTests integration/ObjectDetectionTest.cpp
 add_rn_test(ImageEmbeddingsTests integration/ImageEmbeddingsTest.cpp
     SOURCES
         ${RNEXECUTORCH_DIR}/models/embeddings/image/ImageEmbeddings.cpp
-        ${RNEXECUTORCH_DIR}/models/embeddings/BaseEmbeddings.cpp
         ${RNEXECUTORCH_DIR}/models/VisionModel.cpp
         ${RNEXECUTORCH_DIR}/utils/FrameProcessor.cpp
         ${RNEXECUTORCH_DIR}/utils/FrameExtractor.cpp
@@ -230,7 +229,6 @@ add_rn_test(ImageEmbeddingsTests integration/ImageEmbeddingsTest.cpp
 add_rn_test(TextEmbeddingsTests integration/TextEmbeddingsTest.cpp
     SOURCES
         ${RNEXECUTORCH_DIR}/models/embeddings/text/TextEmbeddings.cpp
-        ${RNEXECUTORCH_DIR}/models/embeddings/BaseEmbeddings.cpp
         ${TOKENIZER_SOURCES}
     LIBS tokenizers_deps
 )
@@ -306,7 +304,6 @@ add_rn_test(TextToImageTests integration/TextToImageTest.cpp
         ${RNEXECUTORCH_DIR}/models/text_to_image/Decoder.cpp
         ${RNEXECUTORCH_DIR}/models/text_to_image/Scheduler.cpp
         ${RNEXECUTORCH_DIR}/models/embeddings/text/TextEmbeddings.cpp
-        ${RNEXECUTORCH_DIR}/models/embeddings/BaseEmbeddings.cpp
         ${TOKENIZER_SOURCES}
     LIBS tokenizers_deps
 )
diff --git a/packages/react-native-executorch/src/constants/modelRegistry.ts b/packages/react-native-executorch/src/constants/modelRegistry.ts
index f57c178b5e..4c36c6a1fa 100644
--- a/packages/react-native-executorch/src/constants/modelRegistry.ts
+++ b/packages/react-native-executorch/src/constants/modelRegistry.ts
@@ -284,12 +284,7 @@ const LFM2_5_EMBEDDING_350M_VARIANTS = {
   },
 };
 
-// LFM2.5-ColBERT is a plain text-embedding model from the library's POV: it
-// returns per-token vectors. Late-interaction scoring (MaxSim / skiplist) is
-// the consumer's concern; the library only auto-applies the role prompts.
-// Document punctuation token ids excluded from MaxSim (ColBERT skiplist),
-// derived from the model's config_sentence_transformers.json skiplist_words.
-const LFM_COLBERT_SKIPLIST = [
+const LFM_COLBERT_SKIP_LIST = [
   510, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524,
   535, 536, 537, 538, 539, 540, 541, 568, 569, 570, 571, 572, 573, 600, 601,
   602, 603,
@@ -303,7 +298,7 @@ const LFM2_5_COLBERT_350M_VARIANTS = {
       tokenizerSource: M.LFM2_5_COLBERT_350M_TOKENIZER,
       prompts: LFM_COLBERT_PROMPTS,
       multiVector: true as const,
-      skiplistIds: LFM_COLBERT_SKIPLIST,
+      skipListIds: LFM_COLBERT_SKIP_LIST,
     },
   },
   xnnpack: {
@@ -313,7 +308,7 @@ const LFM2_5_COLBERT_350M_VARIANTS = {
       tokenizerSource: M.LFM2_5_COLBERT_350M_TOKENIZER,
       prompts: LFM_COLBERT_PROMPTS,
       multiVector: true as const,
-      skiplistIds: LFM_COLBERT_SKIPLIST,
+      skipListIds: LFM_COLBERT_SKIP_LIST,
     },
   },
 };
@@ -804,9 +799,6 @@ export const models = {
       ios: 'mlx',
       android: 'xnnpack',
     }),
-    // ColBERT (late-interaction): forward() returns per-token vectors. Scoring
-    // (markers / MaxSim / skiplist) is the consumer's concern — see the
-    // colbert example screen for a reference implementation.
     lfm2_5_colbert_350m: variant(LFM2_5_COLBERT_350M_VARIANTS, {
       ios: 'mlx',
       android: 'xnnpack',
diff --git a/packages/react-native-executorch/src/constants/modelUrls.ts b/packages/react-native-executorch/src/constants/modelUrls.ts
index 8fdebb1a6d..bd6cddf4a3 100644
--- a/packages/react-native-executorch/src/constants/modelUrls.ts
+++ b/packages/react-native-executorch/src/constants/modelUrls.ts
@@ -1195,21 +1195,14 @@ export const DISTILUSE_BASE_MULTILINGUAL_CASED_V2_8DA4W_MODEL = `${URL_PREFIX}-d
 export const DISTILUSE_BASE_MULTILINGUAL_CASED_V2_TOKENIZER = `${URL_PREFIX}-distiluse-base-multilingual-cased-v2/${PREVIOUS_VERSION_TAG}/tokenizer.json`;
 const PARAPHRASE_MULTILINGUAL_MINILM_L12_V2_QUANTIZED_MODEL = `${URL_PREFIX}-paraphrase-multilingual-MiniLM-L12-v2/${PREVIOUS_VERSION_TAG}/xnnpack/paraphrase_multilingual_minilm_l12_v2_xnnpack_8da4w.pte`;
 const PARAPHRASE_MULTILINGUAL_MINILM_L12_V2_TOKENIZER = `${URL_PREFIX}-paraphrase-multilingual-MiniLM-L12-v2/${PREVIOUS_VERSION_TAG}/tokenizer.json`;
-const CLIP_VIT_BASE_PATCH32_TEXT_MODEL = `${URL_PREFIX}-clip-vit-base-patch32/${PREVIOUS_VERSION_TAG}/xnnpack/clip_vit_base_patch32_text_xnnpack_fp32.pte`;
-const CLIP_VIT_BASE_PATCH32_TEXT_TOKENIZER = `${URL_PREFIX}-clip-vit-base-patch32/${PREVIOUS_VERSION_TAG}/tokenizer.json`;
-// LFM2.5-Embedding-350M: XNNPACK 8da4w (Android/CPU), MLX int4 bf16 (iOS GPU,
-// physical device only). The exported graph bakes in CLS pooling + L2 norm.
-// Requires the runner to add the BOS special token (CLS-pooled at index 0).
 export const LFM2_5_EMBEDDING_350M_XNNPACK_MODEL = `${URL_PREFIX}-lfm2.5-embedding-350m/${PREVIOUS_VERSION_TAG}/xnnpack/lfm_2_5_embedding_350m_xnnpack_8da4w.pte`;
 export const LFM2_5_EMBEDDING_350M_MLX_MODEL = `${URL_PREFIX}-lfm2.5-embedding-350m/${PREVIOUS_VERSION_TAG}/mlx/lfm_2_5_embedding_350m_mlx_int4.pte`;
 export const LFM2_5_EMBEDDING_350M_TOKENIZER = `${URL_PREFIX}-lfm2.5-embedding-350m/${PREVIOUS_VERSION_TAG}/tokenizer.json`;
-// LFM2.5-ColBERT-350M: late-interaction multi-vector retriever (per-token
-// [S,128]). Same bidirectional backbone as the embedding model + a Linear
-// 1024->128 head. forward() returns per-token vectors; late-interaction
-// scoring (MaxSim) is the consumer's concern (see the colbert example).
 export const LFM2_5_COLBERT_350M_XNNPACK_MODEL = `${URL_PREFIX}-lfm2.5-colbert-350m/${PREVIOUS_VERSION_TAG}/xnnpack/lfm_2_5_colbert_350m_xnnpack_8da4w.pte`;
 export const LFM2_5_COLBERT_350M_MLX_MODEL = `${URL_PREFIX}-lfm2.5-colbert-350m/${PREVIOUS_VERSION_TAG}/mlx/lfm_2_5_colbert_350m_mlx_int4.pte`;
 export const LFM2_5_COLBERT_350M_TOKENIZER = `${URL_PREFIX}-lfm2.5-colbert-350m/${PREVIOUS_VERSION_TAG}/tokenizer.json`;
+const CLIP_VIT_BASE_PATCH32_TEXT_MODEL = `${URL_PREFIX}-clip-vit-base-patch32/${PREVIOUS_VERSION_TAG}/xnnpack/clip_vit_base_patch32_text_xnnpack_fp32.pte`;
+const CLIP_VIT_BASE_PATCH32_TEXT_TOKENIZER = `${URL_PREFIX}-clip-vit-base-patch32/${PREVIOUS_VERSION_TAG}/tokenizer.json`;
 
 /**
  * @category Models - Text Embeddings
diff --git a/packages/react-native-executorch/src/types/textEmbeddings.ts b/packages/react-native-executorch/src/types/textEmbeddings.ts
index 2f42d71e9d..1b056a1f7b 100644
--- a/packages/react-native-executorch/src/types/textEmbeddings.ts
+++ b/packages/react-native-executorch/src/types/textEmbeddings.ts
@@ -68,11 +68,11 @@ export interface TextEmbeddingsModel {
   multiVector?: boolean;
   /**
    * Document token ids to exclude from late-interaction scoring (e.g. ColBERT's
-   * punctuation skiplist). Derived from the model's training config, so it's
+   * punctuation skip list). Derived from the model's training config, so it's
    * shipped here rather than reconstructed by the consumer, who passes it to
    * their own MaxSim scoring.
    */
-  skiplistIds?: number[];
+  skipListIds?: number[];
 }
 
 /**
@@ -80,8 +80,11 @@ export interface TextEmbeddingsModel {
  * - return type: `EmbeddingResult` if `multiVector`, else `Float32Array`.
  * - role arg: required if the model has `prompts`, else absent.
  */
-export type ForwardReturn<M extends TextEmbeddingsModel> =
-  M extends { multiVector: true } ? EmbeddingResult : Float32Array;
+export type ForwardReturn<M extends TextEmbeddingsModel> = M extends {
+  multiVector: true;
+}
+  ? EmbeddingResult
+  : Float32Array;
 
 /**
  * `forward`'s signature, computed from the model config:

From 9691184b595c9fede86d8fdf472f8963b9e791d4 Mon Sep 17 00:00:00 2001
From: Norbert Klockiewicz <Nklockiewicz12@gmail.com>
Date: Mon, 22 Jun 2026 17:42:29 +0200
Subject: [PATCH 7/7] refactor: extract TextEmbeddings::buildResult, validate
 output rank

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../models/embeddings/text/TextEmbeddings.cpp | 51 +++++++++++--------
 .../models/embeddings/text/TextEmbeddings.h   |  2 +
 2 files changed, 31 insertions(+), 22 deletions(-)

diff --git a/packages/react-native-executorch/common/rnexecutorch/models/embeddings/text/TextEmbeddings.cpp b/packages/react-native-executorch/common/rnexecutorch/models/embeddings/text/TextEmbeddings.cpp
index d80c4fb4fe..6e5982c2a5 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/embeddings/text/TextEmbeddings.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/embeddings/text/TextEmbeddings.cpp
@@ -60,34 +60,41 @@ EmbeddingResult TextEmbeddings::generate(const std::string input) {
   auto forwardResult = BaseModel::forward({tokenIds, attnMask});
   CHECK_OK_OR_THROW_FORWARD_ERROR(forwardResult);
 
-  // Output is [1, numTokens, embeddingDim] (numTokens == 1 for pooled models,
-  // == sequence length for multi-vector models). Return the raw matrix + the
-  // input ids; the TS layer reduces to a single vector or keeps the matrix.
-  auto out = forwardResult->at(0).toTensor();
-  auto sizes = out.sizes();
+  return buildResult(forwardResult->at(0).toTensor(),
+                     std::move(preprocessed.inputIds));
+}
 
-  EmbeddingResult result;
-  result.dataPtr = std::make_shared<OwningArrayBuffer>(out.const_data_ptr(),
-                                                       out.nbytes());
-  result.numTokens = static_cast<int32_t>(sizes[sizes.size() - 2]);
-  result.embeddingDim = static_cast<int32_t>(sizes[sizes.size() - 1]);
-  result.tokenIds = std::move(preprocessed.inputIds);
+// Output is [1, numTokens, embeddingDim] (numTokens == 1 for pooled models,
+// == sequence length for multi-vector models). Multi-vector consumers index
+// tokenIds[i] per output row (e.g. skiplist masking), so numTokens must match
+// the input token count or that alignment silently breaks.
+EmbeddingResult
+TextEmbeddings::buildResult(const executorch::aten::Tensor &output,
+                            std::vector<int64_t> tokenIds) {
+  auto sizes = output.sizes();
+  if (sizes.size() < 2) {
+    throw RnExecutorchError(RnExecutorchErrorCode::InvalidModelOutput,
+                            "Embedding output must be at least 2D, got rank " +
+                                std::to_string(sizes.size()));
+  }
 
-  // Invariant for multi-vector models: one output row per input token, so
-  // numTokens (from the output tensor) must equal tokenIds.size() (from the
-  // input). Consumers index tokenIds[i] per output row (e.g. skiplist masking),
-  // which silently breaks if the graph ever pads/truncates the sequence.
-  // (Pooled models legitimately collapse to numTokens == 1.)
-  if (result.numTokens != 1 &&
-      result.numTokens != static_cast<int32_t>(result.tokenIds.size())) {
+  const auto numTokens = static_cast<int32_t>(sizes[sizes.size() - 2]);
+  const auto inputTokens = static_cast<int32_t>(tokenIds.size());
+  if (numTokens != 1 && numTokens != inputTokens) {
     throw RnExecutorchError(
         RnExecutorchErrorCode::InvalidModelOutput,
-        "Embedding output rows (" + std::to_string(result.numTokens) +
-            ") != input tokens (" +
-            std::to_string(result.tokenIds.size()) +
+        "Embedding output rows (" + std::to_string(numTokens) +
+            ") != input tokens (" + std::to_string(inputTokens) +
             "); per-token tokenIds alignment is broken.");
   }
-  return result;
+
+  return EmbeddingResult{
+      .dataPtr = std::make_shared<OwningArrayBuffer>(output.const_data_ptr(),
+                                                     output.nbytes()),
+      .numTokens = numTokens,
+      .embeddingDim = static_cast<int32_t>(sizes[sizes.size() - 1]),
+      .tokenIds = std::move(tokenIds),
+  };
 }
 
 } // namespace rnexecutorch::models::embeddings
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/embeddings/text/TextEmbeddings.h b/packages/react-native-executorch/common/rnexecutorch/models/embeddings/text/TextEmbeddings.h
index da51e4d26e..02cfefde4d 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/embeddings/text/TextEmbeddings.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/embeddings/text/TextEmbeddings.h
@@ -31,6 +31,8 @@ class TextEmbeddings final : public BaseModel {
   mutable std::mutex inference_mutex_;
   std::vector<std::vector<int32_t>> inputShapes;
   TokenIdsWithAttentionMask preprocess(const std::string &input);
+  static EmbeddingResult buildResult(const executorch::aten::Tensor &output,
+                                     std::vector<int64_t> tokenIds);
   std::unique_ptr<TokenizerModule> tokenizer;
 };
 } // namespace models::embeddings