diff --git a/python/mcp-ollama-rag/README.md b/python/mcp-ollama-rag/README.md
index 77363f8..d977eb4 100644
--- a/python/mcp-ollama-rag/README.md
+++ b/python/mcp-ollama-rag/README.md
@@ -22,11 +22,12 @@ components:
 Function)
 - **MCPServer Class**: FastMCP-based server implementing HTTP-streamable MCP
 protocol
-- **MCP Tools**: Three primary tools for Ollama interaction:
+- **MCP Tools**: Four tools for Ollama interaction:
   - `list_models`: Enumerate available models on the Ollama server
   - `pull_model`: Download and install new models
-  - `call_model`: Send prompts to models and receive responses
-  - `rag_document`: RAG a document - accepts urls or text (strings)
+  - `embed_document`: Embed documents for RAG - accepts URLs or text strings,
+    automatically chunks to fit the embedding model's context window
+  - `call_model`: Send prompts to models with RAG context and receive responses
 
 
 ## Setup
@@ -42,7 +43,7 @@ protocol
     ```bash
 
     # optionally setup venv
-    pythom -m venv venv
+    python -m venv venv
     source venv/bin/activate
 
     # and install deps
@@ -57,8 +58,9 @@ protocol
     # Start Ollama service (in different terminal/ in bg)
     ollama serve
 
-    # Pull a model (optional, can be done via MCP tool)
+    # Pull models
     ollama pull llama3.2:3b
+    ollama pull mxbai-embed-large
     ```
 
 Now you have a running Ollama Server
@@ -83,7 +85,7 @@ ollama server and call the (now) specialized inference model with prompts.
 
 Now you've connected via MCP protocol to the running function, using an MCP client
 which has embedded a document into vector space for RAG tooling and prompted the
-model which can use the embeddings to answer your question (hopefuly) in a more
+model which can use the embeddings to answer your question (hopefully) in a more
 sophisticated manner.
 
 ### Deployment to cluster (not tested)
@@ -109,3 +111,6 @@ or portforwarding etc.
 - Verify model availability with `ollama list`
 - Confirm function is running on expected port (default: 8080)
 
+**HuggingFace rate limits:**
+The tokenizer for document chunking is downloaded from HuggingFace Hub on first
+use. If you hit rate limits, log in with `huggingface-cli login`.
diff --git a/python/mcp-ollama-rag/client/client.py b/python/mcp-ollama-rag/client/client.py
index 987963d..f167a30 100644
--- a/python/mcp-ollama-rag/client/client.py
+++ b/python/mcp-ollama-rag/client/client.py
@@ -1,12 +1,6 @@
 import asyncio
 from mcp import ClientSession
 from mcp.client.streamable_http import streamablehttp_client
-import json
-
-from mcp.types import CallToolResult
-
-def unload_list_models(models: CallToolResult) -> list[str]:
-    return [json.loads(item.text)["model"] for item in models.content if item.text.strip().startswith('{')] #pyright: ignore
 
 async def main():
     # check your running Function MCP Server, it will output where its available
@@ -15,12 +9,12 @@ async def main():
         read_stream,write_stream = streams[0],streams[1]
 
         async with ClientSession(read_stream,write_stream) as sess:
-            print("Initializing connection...",end="")
-            await sess.initialize()
+            print("Initializing connection...", end="", flush=True)
+            _ = await sess.initialize()
             print("done!\n")
 
-
             # embed some documents
+            print("Embedding documents (this may take a moment)...", flush=True)
             embed = await sess.call_tool(
                 name="embed_document",
                 arguments={
@@ -30,17 +24,17 @@ async def main():
                         ],
                     }
                 )
-            print(embed.content[0].text) # pyright: ignore[reportAttributeAccessIssue]
+            print(embed.content[0].text)  # pyright: ignore[reportAttributeAccessIssue]
             print("-"*60)
 
             # prompt the inference model
+            prompt = "What actually is a Knative Function?"
+            print(f"Querying: \"{prompt}\"", flush=True)
             resp = await sess.call_tool(
                 name="call_model",
-                arguments={
-                    "prompt": "What actually is a Knative Function?",
-                }
+                arguments={"prompt": prompt},
             )
-            print(resp.content[0].text)
+            print(resp.content[0].text) # pyright: ignore[reportAttributeAccessIssue]
 
 if __name__ == "__main__":
     asyncio.run(main())
diff --git a/python/mcp-ollama-rag/function/func.py b/python/mcp-ollama-rag/function/func.py
index 0714e41..9293dd3 100644
--- a/python/mcp-ollama-rag/function/func.py
+++ b/python/mcp-ollama-rag/function/func.py
@@ -2,28 +2,25 @@
 
 # Function as an MCP Server implementation
 import logging
+import uuid
 
 from mcp.server.fastmcp import FastMCP
 import ollama
 import asyncio
 import chromadb
-import requests
+
+from .parser import resolve_input, chunk_text
+
+# Silence noisy library loggers
+logging.getLogger("httpx").setLevel(logging.WARNING)
+logging.getLogger("mcp").setLevel(logging.WARNING)
 
 def new():
-    """ New is the only method that must be implemented by a Function.
+    """New is the only method that must be implemented by a Function.
     The instance returned can be of any name.
     """
     return Function()
 
-# Accepts any url link which points to a raw data (*.md/text files etc.)
-# example: https://raw.githubusercontent.com/knative/func/main/docs/function-templates/python.md
-def get_raw_content(url: str) -> str:
-    """ retrieve contents of github raw url as a text """
-    response = requests.get(url)
-    response.raise_for_status() # errors if bad response
-    print(f"fetch '{url}' - ok")
-    return response.text
-
 class MCPServer:
     """
     MCP server that exposes a chat with an LLM model running on Ollama server
@@ -39,16 +36,18 @@ def __init__(self):
 
         self.client = ollama.Client()
 
-        #init database stuff
+        # init vector database
         self.dbClient = chromadb.Client()
-        self.collection = self.dbClient.create_collection(name="my_collection")
+        self.collection = self.dbClient.get_or_create_collection(
+            name="my_collection"
+        )
         # default embedding model
         self.embedding_model = "mxbai-embed-large"
-        # call this after self.embedding_model assignment, so its defined
         self._register_tools()
 
     def _register_tools(self):
         """Register MCP tools."""
+
         @self.mcp.tool()
         def list_models():
             """List all models currently available on the Ollama server"""
@@ -56,52 +55,48 @@ def list_models():
                 models = self.client.list()
             except Exception as e:
                 return f"Oops, failed to list models because: {str(e)}"
-            #return [model['name'] for model in models['models']]
             return [model for model in models]
 
         default_embedding_model = self.embedding_model
+
         @self.mcp.tool()
-        def embed_document(data:list[str],model:str = default_embedding_model) -> str:
+        def embed_document(
+            data: list[str], model: str = default_embedding_model
+        ) -> str:
             """
             RAG (Retrieval-augmented generation) tool.
-            Embeds documents provided in data.
-            Arguments:
-            - data: expected to be of type str|list.
-            - model: embedding model to use, examples below.
-
-            # example embedding models:
-            # mxbai-embed-large - 334M *default
-            # nomic-embed-text - 137M
-            # all-minilm - 23M
+            Embeds documents provided in data. Each item can be a URL
+            (fetched automatically) or a raw text string. Documents are
+            chunked to fit the embedding model's context window.
+
+            Args:
+                data: List of URLs or text strings to embed.
+                model: Embedding model to use. Example:
+                    - mxbai-embed-large - default
             """
-            count = 0
-
-            ############ TODO -- import im a separate file
-            # documents generator
-            #documents_gen = parse_data_generator(data)
-            #### 1) GENERATE
-            # generate vector embeddings via embedding model
-            #for i, d in enumerate(documents_gen):
-            #    response = ollama.embed(model=model,input=d)
-            #    embeddings = response["embeddings"]
-            #    self.collection.add(
-            #            ids=[str(i)],
-            #            embeddings=embeddings,
-            #            documents=[d]
-            #            )
-            #    count += 1
-
-            # for simplicity (until the above is resolved, this accecpts only URLs)
-            for i, d in enumerate(data):
-                response = ollama.embed(model=model,input=get_raw_content(d))
-                embeddings = response["embeddings"]
-                self.collection.add(
-                        ids=[str(i)],
-                        embeddings=embeddings,
-                        documents=[d]
-                        )
-                count += 1
-            return f"ok - Embedded {count} documents"
+            all_chunks = []
+            for item in data:
+                content = resolve_input(item)
+                chunks = chunk_text(content)
+                all_chunks.extend(chunks)
+                label = item[:60] + "..." if len(item) > 60 else item
+                print(f"  Chunked '{label}' into {len(chunks)} chunks", flush=True)
+
+            # Batch embed all chunks in one call for performance
+            print(f"  Embedding {len(all_chunks)} chunks...", flush=True)
+            response = ollama.embed(model=model, input=all_chunks)
+            ids = [str(uuid.uuid4()) for _ in all_chunks]
+            self.collection.add(
+                ids=ids,
+                embeddings=response["embeddings"],
+                documents=all_chunks,
+            )
+            print("  Done.", flush=True)
+
+            return (
+                f"ok - Embedded {len(data)} document(s) "
+                f"as {len(all_chunks)} chunks"
+            )
 
         @self.mcp.tool()
         def pull_model(model: str) -> str:
@@ -113,42 +108,42 @@ def pull_model(model: str) -> str:
             return f"Success! model {model} is available"
 
         @self.mcp.tool()
-        def call_model(prompt: str,
-                       model: str = "llama3.2:3b",
-                       embed_model: str = self.embedding_model) -> str:
-            """Send a prompt to a model being served on ollama server"""
-            #### 2) RETRIEVE
-            # we embed the prompt but dont save it into db, then we retrieve
-            # the most relevant document (most similar vectors)
+        def call_model(
+            prompt: str,
+            model: str = "llama3.2:3b",
+            embed_model: str = self.embedding_model,
+        ) -> str:
+            """Send a prompt to a model being served on ollama server.
+            Uses RAG to find the most relevant embedded documents and
+            includes them as context for the response."""
             try:
-                response = ollama.embed(
-                        model=embed_model,
-                        input=prompt
-                        )
+                # Embed the prompt for similarity search
+                response = ollama.embed(model=embed_model, input=prompt)
                 results = self.collection.query(
-                        query_embeddings=response["embeddings"],
-                        n_results=1
-                        )
-                data = results['documents'][0][0]
+                    query_embeddings=response["embeddings"],
+                    n_results=3,
+                )
+                context = "\n\n".join(results["documents"][0])
 
-            #### 3) GENERATE
-            # generate answer given a combination of prompt and data retrieved
                 output = ollama.generate(
-                        model=model,
-                        prompt=f'Using data: {data}, respond to prompt: {prompt}'
-                        )
-                print(output)
+                    model=model,
+                    prompt=(
+                        f"Using the following context:\n{context}\n\n"
+                        f"Respond to: {prompt}"
+                    ),
+                )
             except Exception as e:
                 return f"Error occurred during calling the model: {str(e)}"
-            return output['response']
+            return output["response"]
 
     async def handle(self, scope, receive, send):
         """Handle ASGI requests - both lifespan and HTTP."""
         await self._app(scope, receive, send)
 
+
 class Function:
     def __init__(self):
-        """ The init method is an optional method where initialization can be
+        """The init method is an optional method where initialization can be
         performed. See the start method for a startup hook which includes
         configuration.
         """
@@ -166,7 +161,7 @@ async def handle(self, scope, receive, send):
             await self._initialize_mcp()
 
         # Route MCP requests
-        if scope['path'].startswith('/mcp'):
+        if scope.get("path", "").startswith("/mcp"):
             await self.mcp_server.handle(scope, receive, send)
             return
 
@@ -175,26 +170,28 @@ async def handle(self, scope, receive, send):
 
     async def _initialize_mcp(self):
         """Initialize the MCP server by sending lifespan startup event."""
-        lifespan_scope = {'type': 'lifespan', 'asgi': {'version': '3.0'}}
+        lifespan_scope = {"type": "lifespan", "asgi": {"version": "3.0"}}
         startup_sent = False
 
         async def lifespan_receive():
             nonlocal startup_sent
             if not startup_sent:
                 startup_sent = True
-                return {'type': 'lifespan.startup'}
+                return {"type": "lifespan.startup"}
             await asyncio.Event().wait()  # Wait forever for shutdown
 
         async def lifespan_send(message):
-            if message['type'] == 'lifespan.startup.complete':
+            if message["type"] == "lifespan.startup.complete":
                 self._mcp_initialized = True
-            elif message['type'] == 'lifespan.startup.failed':
+            elif message["type"] == "lifespan.startup.failed":
                 logging.error(f"MCP startup failed: {message}")
 
         # Start lifespan in background
-        asyncio.create_task(self.mcp_server.handle(
-            lifespan_scope, lifespan_receive, lifespan_send
-        ))
+        asyncio.create_task(
+            self.mcp_server.handle(
+                lifespan_scope, lifespan_receive, lifespan_send
+            )
+        )
 
         # Brief wait for startup completion
         await asyncio.sleep(0.1)
@@ -204,15 +201,19 @@ async def _send_default_response(self, send):
         Send default OK response.
         This is for your non MCP requests if desired.
         """
-        await send({
-            'type': 'http.response.start',
-            'status': 200,
-            'headers': [[b'content-type', b'text/plain']],
-        })
-        await send({
-            'type': 'http.response.body',
-            'body': b'OK',
-        })
+        await send(
+            {
+                "type": "http.response.start",
+                "status": 200,
+                "headers": [[b"content-type", b"text/plain"]],
+            }
+        )
+        await send(
+            {
+                "type": "http.response.body",
+                "body": b"OK",
+            }
+        )
 
     def start(self, cfg):
         logging.info("Function starting")
diff --git a/python/mcp-ollama-rag/function/parser.py b/python/mcp-ollama-rag/function/parser.py
index c888d01..bba2c04 100644
--- a/python/mcp-ollama-rag/function/parser.py
+++ b/python/mcp-ollama-rag/function/parser.py
@@ -1,54 +1,72 @@
 import requests
 from urllib.parse import urlparse
 
-def parse_data_generator(data):
-    """
-    Generator that yields documents one at a time.
-    Handles any combination of urls and data strings.
-    Can be of type str|list.
-    example:
-    ["<url1>","<url2>","long data string"] etc.
-    """
+_tokenizer = None
+
+# The tokenizer is downloaded from HuggingFace Hub on first use and cached
+# locally. If you hit rate limits, log in with: huggingface-cli login
+# To bundle it locally instead, run:
+#   python -c "from tokenizers import Tokenizer; Tokenizer.from_pretrained('mixedbread-ai/mxbai-embed-large-v1').save('function/tokenizer.json')"
+# then use: _tokenizer = Tokenizer.from_file("function/tokenizer.json")
+_TOKENIZER_MODEL = "mixedbread-ai/mxbai-embed-large-v1"
+
 
-    # STR
-    if isinstance(data, str):
-        content = ''
-        if is_url(data):
-            content = get_raw_content(data)
-        else:
-            content = data
-        yield content.strip()
-
-    # LIST
-    elif isinstance(data, list):
-        for item in data:
-            if isinstance(item,str):
-                if is_url(item):
-                    content = get_raw_content(item)
-                else:
-                    content = item
-                yield content.strip()
-            else:
-                print(f"warning: handling item {item} as a string")
-                yield str(item)
-    else:
-        print(f"Fallback: unknown type, handling {data} as a string")
-        yield str(data)
-
-def is_url(text: str):
-    """Check if text is a valid URL"""
+def _get_tokenizer():
+    """Lazily load the tokenizer for the default embedding model."""
+    global _tokenizer
+    if _tokenizer is None:
+        from tokenizers import Tokenizer
+        _tokenizer = Tokenizer.from_pretrained(_TOKENIZER_MODEL)
+    return _tokenizer
+
+
+def is_url(text: str) -> bool:
+    """Check if text is a valid URL."""
     try:
         result = urlparse(text)
-        print(f"is_url: {result}")
         return all([result.scheme, result.netloc])
-    except:
+    except Exception:
         return False
 
-# Accepts any url link which points to a raw data (*.md/text files etc.)
-# example: https://raw.githubusercontent.com/knative/func/main/docs/function-templates/python.md
+
 def get_raw_content(url: str) -> str:
-    """ retrieve contents of github raw url as a text """
+    """Retrieve contents of a URL as text."""
     response = requests.get(url)
-    response.raise_for_status() # errors if bad response
-    print(f"fetch '{url}' - ok")
+    response.raise_for_status()
     return response.text
+
+
+def chunk_text(text: str, max_tokens: int = 480, overlap_tokens: int = 30) -> list[str]:
+    """Split text into chunks that fit within the embedding model's context
+    window, using the model's actual tokenizer for precise token counting.
+
+    Args:
+        text: The text to chunk.
+        max_tokens: Max tokens per chunk. Default 480 leaves headroom
+                    within the 512-token context of mxbai-embed-large.
+        overlap_tokens: Number of overlapping tokens between chunks.
+    """
+    tokenizer = _get_tokenizer()
+    token_ids = tokenizer.encode(text).ids
+
+    if len(token_ids) <= max_tokens:
+        return [text]
+
+    # Sliding window: advance by (max_tokens - overlap) to keep overlap between chunks
+    chunks = []
+    start = 0
+    step = max_tokens - overlap_tokens
+    while start < len(token_ids):
+        end = min(start + max_tokens, len(token_ids))
+        chunk = tokenizer.decode(token_ids[start:end])
+        chunks.append(chunk)
+        start += step
+
+    return chunks
+
+
+def resolve_input(item: str) -> str:
+    """Resolve a single input item: fetch URL content or return raw text."""
+    if is_url(item):
+        return get_raw_content(item)
+    return item
diff --git a/python/mcp-ollama-rag/pyproject.toml b/python/mcp-ollama-rag/pyproject.toml
index f538d71..dd51b9a 100644
--- a/python/mcp-ollama-rag/pyproject.toml
+++ b/python/mcp-ollama-rag/pyproject.toml
@@ -12,7 +12,8 @@ dependencies = [
   "mcp",
   "ollama",
   "requests",
-  "chromadb"
+  "chromadb",
+  "tokenizers"
 ]
 authors = [
   { name="Your Name", email="you@example.com"},
diff --git a/python/mcp-ollama-rag/tests/test_func.py b/python/mcp-ollama-rag/tests/test_func.py
index 5b37a73..10a0c8d 100644
--- a/python/mcp-ollama-rag/tests/test_func.py
+++ b/python/mcp-ollama-rag/tests/test_func.py
@@ -1,38 +1,56 @@
 """
-An example set of unit tests which confirm that the main handler (the
-callable function) returns 200 OK for a simple HTTP GET.
+Unit tests for the Function.
 """
 import pytest
 from function import new
 
-
 @pytest.mark.asyncio
-async def test_function_handle():
-    f = new()  # Instantiate Function to Test
+async def test_function_handle_default():
+    """Test that non-MCP requests get a 200 OK response."""
+    f = new()
 
     sent_ok = False
     sent_headers = False
     sent_body = False
 
-    # Mock Send
     async def send(message):
-        nonlocal sent_ok
-        nonlocal sent_headers
-        nonlocal sent_body
+        nonlocal sent_ok, sent_headers, sent_body
 
-        if message.get('status') == 200:
+        if message.get("status") == 200:
             sent_ok = True
-
-        if message.get('type') == 'http.response.start':
+        if message.get("type") == "http.response.start":
             sent_headers = True
-
-        if message.get('type') == 'http.response.body':
+        if message.get("type") == "http.response.body":
             sent_body = True
 
-    # Invoke the Function
-    await f.handle({}, {}, send)
+    scope = {"path": "/", "type": "http"}
+    await f.handle(scope, {}, send)
 
-    # Assert send was called
     assert sent_ok, "Function did not send a 200 OK"
     assert sent_headers, "Function did not send headers"
     assert sent_body, "Function did not send a body"
+
+
+@pytest.mark.asyncio
+async def test_function_routes_mcp():
+    """Test that /mcp paths are routed to MCP server."""
+    f = new()
+
+    scope = {"path": "/mcp", "type": "http", "method": "GET",
+             "headers": [], "query_string": b""}
+
+    # MCP server will handle this - we just verify no crash on routing
+    try:
+        async def receive():
+            return {"type": "http.request", "body": b""}
+
+        responses = []
+        async def send(message):
+            responses.append(message)
+
+        await f.handle(scope, receive, send)
+        # If we get here, routing worked
+        assert len(responses) > 0
+    except Exception:
+        # MCP may reject malformed requests, but routing itself worked
+        pass