From 59fa97a7256038530cdd56924707701dd9b1fb82 Mon Sep 17 00:00:00 2001
From: Dariusz Trawinski <Dariusz.Trawinski@intel.com>
Date: Wed, 13 May 2026 17:06:36 +0200
Subject: [PATCH 1/6] added einops for embedding models and simplified accuracy
 description

---
 demos/common/export_models/README.md         | 58 +-------------------
 demos/common/export_models/requirements.txt  |  1 +
 demos/continuous_batching/accuracy/README.md | 36 ++++--------
 3 files changed, 14 insertions(+), 81 deletions(-)

diff --git a/demos/common/export_models/README.md b/demos/common/export_models/README.md
index 7f1276ff9e..68102cc321 100644
--- a/demos/common/export_models/README.md
+++ b/demos/common/export_models/README.md
@@ -40,62 +40,10 @@ For every use case subcommand there is adjusted list of parameters:
 ```console
 python export_model.py text_generation --help
 ```
-Expected Output:
-```console
-usage: export_model.py text_generation [-h] [--model_repository_path MODEL_REPOSITORY_PATH] --source_model SOURCE_MODEL [--model_name MODEL_NAME] [--weight-format PRECISION] [--config_file_path CONFIG_FILE_PATH] [--overwrite_models] [--target_device TARGET_DEVICE] [--ov_cache_dir OV_CACHE_DIR]
-                                       [--extra_quantization_params EXTRA_QUANTIZATION_PARAMS] [--pipeline_type {LM,LM_CB,VLM,VLM_CB,AUTO}] [--kv_cache_precision {u8}] [--enable_prefix_caching ENABLE_PREFIX_CACHING] [--disable_dynamic_split_fuse] [--max_num_batched_tokens MAX_NUM_BATCHED_TOKENS] [--max_num_seqs MAX_NUM_SEQS]
-                                       [--cache_size CACHE_SIZE] [--draft_source_model DRAFT_SOURCE_MODEL] [--draft_model_name DRAFT_MODEL_NAME] [--draft_eagle3_mode] [--max_prompt_len MAX_PROMPT_LEN] [--prompt_lookup_decoding] [--reasoning_parser {qwen3,gptoss}]
-                                       [--tool_parser {llama3,phi4,hermes3,mistral,qwen3coder,gptoss,devstral,lfm2}] [--enable_tool_guided_generation]
 
-options:
-  -h, --help            show this help message and exit
-  --model_repository_path MODEL_REPOSITORY_PATH
-                        Where the model should be exported to
-  --source_model SOURCE_MODEL
-                        HF model name or path to the local folder with PyTorch or OpenVINO model
-  --model_name MODEL_NAME
-                        Model name that should be used in the deployment. Equal to source_model if HF model name is used
-  --weight-format PRECISION
-                        precision of the exported model
-  --config_file_path CONFIG_FILE_PATH
-                        path to the config file
-  --overwrite_models    Overwrite the model if it already exists in the models repository
-  --target_device TARGET_DEVICE
-                        CPU, GPU, NPU or HETERO, default is CPU
-  --ov_cache_dir OV_CACHE_DIR
-                        Folder path for compilation cache to speedup initialization time
-  --extra_quantization_params EXTRA_QUANTIZATION_PARAMS
-                        Add advanced quantization parameters. Check optimum-intel documentation. Example: "--sym --group-size -1 --ratio 1.0 --awq --scale-estimation --dataset wikitext2"
-  --pipeline_type {LM,LM_CB,VLM,VLM_CB,AUTO}
-                        Type of the pipeline to be used. AUTO is used by default
-  --kv_cache_precision {u8}
-                        u8 or empty (model default). Reduced kv cache precision to u8 lowers the cache size consumption.
-  --enable_prefix_caching ENABLE_PREFIX_CACHING
-                        This algorithm is used to cache the prompt tokens. Default is True.
-  --disable_dynamic_split_fuse
-                        The maximum number of tokens that can be batched together.
-  --max_num_batched_tokens MAX_NUM_BATCHED_TOKENS
-                        empty or integer. The maximum number of tokens that can be batched together.
-  --max_num_seqs MAX_NUM_SEQS
-                        256 by default. The maximum number of sequences that can be processed together.
-  --cache_size CACHE_SIZE
-                        KV cache size in GB. If not set, cache is allocated dynamically.
-  --draft_source_model DRAFT_SOURCE_MODEL
-                        HF model name or path to the local folder with PyTorch or OpenVINO draft model. Using this option will create configuration for speculative decoding
-  --draft_model_name DRAFT_MODEL_NAME
-                        Draft model name that should be used in the deployment. Equal to draft_source_model if HF model name is used. Available only in draft_source_model has been specified.
-  --draft_eagle3_mode   Set this flag if you use EAGLE3 draft model for speculative decoding
-  --max_prompt_len MAX_PROMPT_LEN
-                        Sets NPU specific property for maximum number of tokens in the prompt. Not effective if target device is not NPU
-  --prompt_lookup_decoding
-                        Set pipeline to use prompt lookup decoding
-  --reasoning_parser {qwen3,gptoss}
-                        Set the type of the reasoning parser for reasoning content extraction
-  --tool_parser {llama3,phi4,hermes3,mistral,qwen3coder,gptoss,devstral,lfm2}
-                        Set the type of the tool parser for tool calls extraction
-  --enable_tool_guided_generation
-                        Enables enforcing tool schema during generation. Requires setting tool_parser
-```
+> Note: Exporting some models might require different transformers version than specified in requirements.txt Check [supported models](https://openvinotoolkit.github.io/openvino.genai/docs/supported-models/). If custom transformers version is required, install it afterwards via `pip install transformers==<version>`
+
+
 
 ## Model Export Examples
 
diff --git a/demos/common/export_models/requirements.txt b/demos/common/export_models/requirements.txt
index 60cbf3d2fc..e016048a95 100644
--- a/demos/common/export_models/requirements.txt
+++ b/demos/common/export_models/requirements.txt
@@ -14,3 +14,4 @@ sentence_transformers==5.3.0
 sentencepiece  # Required by: transformers`
 torchvision
 requests
+einops
diff --git a/demos/continuous_batching/accuracy/README.md b/demos/continuous_batching/accuracy/README.md
index e67d2b98af..1cde283bc0 100644
--- a/demos/continuous_batching/accuracy/README.md
+++ b/demos/continuous_batching/accuracy/README.md
@@ -14,33 +14,17 @@ Install the framework via pip:
 pip3 install --extra-index-url "https://download.pytorch.org/whl/cpu" lm_eval[api] langdetect immutabledict dotenv openai
 ```
 
-## Exporting the models
-```bash
-git clone https://github.com/openvinotoolkit/model_server.git
-cd model_server
-pip3 install -U -r demos/common/export_models/requirements.txt
-mkdir models 
-python demos/common/export_models/export_model.py text_generation --source_model meta-llama/Meta-Llama-3.1-8B-Instruct --weight-format fp16 --kv_cache_precision u8 --config_file_path models/config.json --model_repository_path models
-python demos/common/export_models/export_model.py text_generation --source_model meta-llama/Meta-Llama-3.1-8B --weight-format fp16 --kv_cache_precision u8 --config_file_path models/config.json --model_repository_path models
-python demos/common/export_models/export_model.py text_generation --source_model OpenGVLab/InternVL2_5-8B --weight-format fp16 --config_file_path models/config.json --model_repository_path models
-python demos/common/export_models/export_model.py text_generation --source_model Qwen/Qwen3-8B --model_name openvino-qwen3-8b-int8 --weight-format int8 --config_file_path models/config.json --model_repository_path models --tool_parser hermes3 --overwrite_models
-```
-
 ## Starting the model server
 
-### With Docker
-```bash
-docker run -d --rm -p 8000:8000 -v $(pwd)/models:/workspace:ro openvino/model_server:latest --rest_port 8000 --config_path /workspace/config.json
-```
 
-### On Baremetal
-```bash
-ovms --rest_port 8000 --config_path ./models/config.json
-```
+Examples of deploying LLM and VLM models are documented in other demos, such as:
+- [Agentic usage for LLM models](../agentic_ai/README.md)
+- [Using VLM models](../vlm/README.md)
+
 
 ## Running the tests for LLM models
 
-```bash
+```text
 lm-eval --model local-chat-completions --tasks gsm8k --model_args model=meta-llama/Meta-Llama-3.1-8B-Instruct,base_url=http://localhost:8000/v3/chat/completions,num_concurrent=1,max_retries=3,tokenized_requests=False --verbosity DEBUG  --log_samples --output_path test/ --seed 1 --apply_chat_template --limit 100
 
 local-chat-completions ({'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'base_url': 'http://localhost:8000/v3/chat/completions', 'num_concurrent': 10, 'max_retries': 3, 'tokenized_requests': False}), gen_kwargs: ({}), limit: 100.0, num_fewshot: None, batch_size: 1
@@ -52,7 +36,7 @@ local-chat-completions ({'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'base
 
 While testing the non chat model and `completion` endpoint, the command would look like this:
 
-```bash
+```text
 lm-eval --model local-completions --tasks gsm8k --model_args model=meta-llama/Meta-Llama-3.1-8B,base_url=http://localhost:8000/v3/completions,num_concurrent=1,max_retries=3,tokenized_requests=False --verbosity DEBUG  --log_samples --output_path results/ --seed 1 --limit 100
 
 local-completions ({'model': 'meta-llama/Meta-Llama-3.1-8B', 'base_url': 'http://localhost:8000/v3/completions', 'num_concurrent': 10, 'max_retries': 3, 'tokenized_requests': False}), gen_kwargs: ({}), limit: 100.0, num_fewshot: None, batch_size: 1
@@ -64,11 +48,11 @@ local-completions ({'model': 'meta-llama/Meta-Llama-3.1-8B', 'base_url': 'http:/
 
 Other examples are below:
 
-```bash
+```text
 lm-eval --model local-chat-completions --tasks leaderboard_ifeval --model_args model=meta-llama/Meta-Llama-3.1-8B-Instruct,base_url=http://localhost:8000/v3/chat/completions,num_concurrent=10,max_retries=3,tokenized_requests=False --verbosity DEBUG --log_samples --output_path test/ --seed 1 --limit 100 --apply_chat_template  
 ```
 
-```bash
+```text
 lm-eval --model local-completions --tasks wikitext --model_args model=meta-llama/Meta-Llama-3.1-8B,base_url=http://localhost:8000/v3/completions,num_concurrent=10,max_retries=3,tokenized_requests=False --verbosity DEBUG --log_samples --output_path test/ --seed 1 --limit 100
 ```
 
@@ -76,7 +60,7 @@ lm-eval --model local-completions --tasks wikitext --model_args model=meta-llama
 
 Use [lmms-eval project](https://github.com/EvolvingLMMs-Lab/lmms-eval) - mme and mmmu_val tasks. 
 
-```bash
+```text
 export OPENAI_BASE_URL=http://localhost:8000/v3
 export OPENAI_API_KEY="unused"
 git clone https://github.com/EvolvingLMMs-Lab/lmms-eval
@@ -85,7 +69,7 @@ git checkout 88b23e2bfa16a1edbc16e9e238ed82130b3a4f56
 pip install -e . --extra-index-url "https://download.pytorch.org/whl/cpu"
 python -m lmms_eval \
     --model openai_compatible \
-    --model_args model_version=OpenGVLab/InternVL2_5-8B,max_retries=1 \
+    --model_args model_version=OpenVINO/InternVL2_5-8B_int4-ov,max_retries=1 \
     --tasks mme,mmmu_val \
     --batch_size 1 \
     --log_samples \

From 4fb1e5b26bf36f659cbf1caa632fbc1acedc4581 Mon Sep 17 00:00:00 2001
From: Dariusz Trawinski <Dariusz.Trawinski@intel.com>
Date: Tue, 19 May 2026 11:03:26 +0200
Subject: [PATCH 2/6] review fixes

---
 demos/continuous_batching/accuracy/README.md | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/demos/continuous_batching/accuracy/README.md b/demos/continuous_batching/accuracy/README.md
index 1cde283bc0..fd6c9a44a0 100644
--- a/demos/continuous_batching/accuracy/README.md
+++ b/demos/continuous_batching/accuracy/README.md
@@ -10,7 +10,7 @@ It reports end to end quality of served model from the client application point
 ## Preparing the lm-evaluation-harness framework 
 
 Install the framework via pip:
-```bash
+```text
 pip3 install --extra-index-url "https://download.pytorch.org/whl/cpu" lm_eval[api] langdetect immutabledict dotenv openai
 ```
 
@@ -65,11 +65,10 @@ export OPENAI_BASE_URL=http://localhost:8000/v3
 export OPENAI_API_KEY="unused"
 git clone https://github.com/EvolvingLMMs-Lab/lmms-eval
 cd lmms-eval
-git checkout 88b23e2bfa16a1edbc16e9e238ed82130b3a4f56
 pip install -e . --extra-index-url "https://download.pytorch.org/whl/cpu"
 python -m lmms_eval \
     --model openai_compatible \
-    --model_args model_version=OpenVINO/InternVL2_5-8B_int4-ov,max_retries=1 \
+    --model_args model_version=OpenVINO/InternVL2-8B_int4-ov,max_retries=1 \
     --tasks mme,mmmu_val \
     --batch_size 1 \
     --log_samples \
@@ -105,7 +104,7 @@ pip install -e . --extra-index-url "https://download.pytorch.org/whl/cpu"
 The commands below assumes the models is deployed with the name `ovms-model`. It must match the name set in the `bfcl_eval/constants/model_config.py`.
 ```text
 export OPENAI_BASE_URL=http://localhost:8000/v3
-export CHAT_TEMPLATE_KWARGS='{"enable_thinking":false, "reasoning_effort":"low"}'
+export CHAT_TEMPLATE_KWARGS='{"enable_thinking":false, "reasoning_effort":"low", "preserve_reasoning":false}'
 
 bfcl generate --model ovms-model --test-category simple_python,multiple --temperature 0.0 --num-threads 100 -o --result-dir model_name_dir
 bfcl evaluate --model ovms-model --result-dir model_name_dir 
@@ -114,7 +113,7 @@ bfcl evaluate --model ovms-model --result-dir model_name_dir
 Alternatively, use the model name `ovms-model-stream` to run the tests with stream requests. The results should be the same.
 ```text
 export OPENAI_BASE_URL=http://localhost:8000/v3
-bfcl generate --model ovms-model-stream --test-category simple_python,multiple --temperature 0.0 --num-threads 100 -o --result-dir model_name_dir
+bfcl generate --model ovms-model-stream --test-category simple_python,multiple,multi_turn_base --temperature 0.0 --num-threads 10 -o --result-dir model_name_dir
 bfcl evaluate --model ovms-model-stream --result-dir model_name_dir 
 ```
 
@@ -122,7 +121,7 @@ bfcl evaluate --model ovms-model-stream --result-dir model_name_dir
 The output artifacts will be stored in `result` and `scores`. For example:
 
 ```text
-cat score/openvino-qwen3-8b-int4-FC/BFCL_v3_simple_python_score.json | head -1
+cat score/openvino-qwen3-8b-int4-FC/BFCL_v4_simple_python_score.json | head -1
 {"accuracy": 0.95, "correct_count": 380, "total_count": 400}
 ```
 Those results can be compared with the reference from the [berkeley leaderbaord](https://gorilla.cs.berkeley.edu/leaderboard.html#leaderboard).

From f524f2f90d2f62a8580fdf154f4236c8cfc44357 Mon Sep 17 00:00:00 2001
From: "Trawinski, Dariusz" <dariusz.trawinski@intel.com>
Date: Tue, 19 May 2026 11:06:37 +0200
Subject: [PATCH 3/6] Apply suggestions from code review

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
 demos/common/export_models/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/demos/common/export_models/README.md b/demos/common/export_models/README.md
index 68102cc321..d0d30b0a22 100644
--- a/demos/common/export_models/README.md
+++ b/demos/common/export_models/README.md
@@ -41,7 +41,7 @@ For every use case subcommand there is adjusted list of parameters:
 python export_model.py text_generation --help
 ```
 
-> Note: Exporting some models might require different transformers version than specified in requirements.txt Check [supported models](https://openvinotoolkit.github.io/openvino.genai/docs/supported-models/). If custom transformers version is required, install it afterwards via `pip install transformers==<version>`
+> Note: Exporting some models might require different transformers version than specified in requirements.txt. Check [supported models](https://openvinotoolkit.github.io/openvino.genai/docs/supported-models/). If custom transformers version is required, install it afterwards via `pip install transformers==<version>`
 
 
 

From 9f6683bb49969b924129dbf0f3c1a54407cf330e Mon Sep 17 00:00:00 2001
From: Dariusz Trawinski <Dariusz.Trawinski@intel.com>
Date: Tue, 19 May 2026 13:52:34 +0200
Subject: [PATCH 4/6] update readme

---
 demos/common/export_models/README.md | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/demos/common/export_models/README.md b/demos/common/export_models/README.md
index 68102cc321..af2ca5c202 100644
--- a/demos/common/export_models/README.md
+++ b/demos/common/export_models/README.md
@@ -42,7 +42,12 @@ python export_model.py text_generation --help
 ```
 
 > Note: Exporting some models might require different transformers version than specified in requirements.txt Check [supported models](https://openvinotoolkit.github.io/openvino.genai/docs/supported-models/). If custom transformers version is required, install it afterwards via `pip install transformers==<version>`
-
+Some of the exceptions include:
+- Alibaba-NLP/gte-large-en-v1.5 - `transformers<5.0`
+- OpenGVLab/InternVL - `transformers<5.0`
+- Qwen3-80B-Next and Qwen3-coder-next - `transformers<5.0`
+- gemma4 - `transformers==5.5`
+- Qwen3.5 and Qwen3.6 - `transformers==5.2`
 
 
 ## Model Export Examples
@@ -100,7 +105,7 @@ python export_model.py embeddings_ov --source_model Qwen/Qwen3-Embedding-0.6B --
 #### Embeddings with `sentence_transformers` library
 Some embedding models require special handling during export. For example:
 ```console
-python export_model.py embeddings_ov --source_model Alibaba-NLP/gte-large-en-v1.5 --extra_quantization_params "--library sentence_transformers" --weight-format fp16 --config_file_path models/config_all.json
+python export_model.py embeddings_ov --source_model nomic-ai/nomic-embed-text-v1.5 --extra_quantization_params "--library sentence_transformers" --pooling MEAN --weight-format fp16 --config_file_path models/config_all.json
 ```
 Known models that require it:
 - Alibaba-NLP/gte-large-en-v1.5

From 1b627f4f3237b6da614075e214ee785ecdc6d2fd Mon Sep 17 00:00:00 2001
From: Dariusz Trawinski <Dariusz.Trawinski@intel.com>
Date: Tue, 19 May 2026 14:03:52 +0200
Subject: [PATCH 5/6] exception and skip tests for gte model

---
 demos/embeddings/README.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/demos/embeddings/README.md b/demos/embeddings/README.md
index 814e505515..2aa196ebea 100644
--- a/demos/embeddings/README.md
+++ b/demos/embeddings/README.md
@@ -154,7 +154,8 @@ python export_model.py embeddings_ov --source_model intfloat/multilingual-e5-lar
 :::
 :::{tab-item} Alibaba-NLP/gte-large-en-v1.5
 :sync: gte-large-en-v1.5
-```console
+```text
+pip install "transformers<5"  # WA for optimum-intel and model support for new transformers
 python export_model.py embeddings_ov --source_model Alibaba-NLP/gte-large-en-v1.5 --pooling CLS --extra_quantization_params "--library sentence_transformers"
 ```
 :::
@@ -225,7 +226,8 @@ python export_model.py embeddings_ov --source_model intfloat/multilingual-e5-lar
 :::
 :::{tab-item} Alibaba-NLP/gte-large-en-v1.5
 :sync: gte-large-en-v1.5
-```console
+```text
+pip install "transformers<5"  # WA for optimum-intel and model support for new transformers
 python export_model.py embeddings_ov --source_model Alibaba-NLP/gte-large-en-v1.5 --pooling CLS --weight-format int8 --target_device GPU --extra_quantization_params "--library sentence_transformers" --config_file_path models/config.json --model_repository_path models
 ```
 :::

From 1b1eed274e7af549320a53b0ad02f265d50725ea Mon Sep 17 00:00:00 2001
From: Dariusz Trawinski <Dariusz.Trawinski@intel.com>
Date: Wed, 20 May 2026 11:45:50 +0200
Subject: [PATCH 6/6] update to latest mteb

---
 demos/embeddings/README.md    | 152 +++++++++++++++++-----------------
 demos/embeddings/ovms_mteb.py |  65 +++++----------
 2 files changed, 98 insertions(+), 119 deletions(-)

diff --git a/demos/embeddings/README.md b/demos/embeddings/README.md
index 2aa196ebea..21da78ab03 100644
--- a/demos/embeddings/README.md
+++ b/demos/embeddings/README.md
@@ -24,7 +24,7 @@ This procedure can be used to pull preconfigured models from OpenVINO organizati
 ```bash
 mkdir -p models
 docker run --user $(id -u):$(id -g) --rm -v $(pwd)/models:/models:rw openvino/model_server:latest --pull --model_repository_path /models --source_model OpenVINO/Qwen3-Embedding-0.6B-int8-ov --pooling LAST --task embeddings
-docker run --user $(id -u):$(id -g) --rm -v $(pwd)/models:/models:rw openvino/model_server:latest --add_to_config --config_path /models/config.json --model_name OpenVINO/Qwen3-Embedding-0.6B-int8-ov --model_path OpenVINO/Qwen3-Embedding-0.6B-int8-ov
+docker run --user $(id -u):$(id -g) --rm -v $(pwd)/models:/models:rw openvino/model_server:latest --add_to_config --config_path /models/config.json --model_name Qwen3-Embedding-0.6B-int8-ov --model_path OpenVINO/Qwen3-Embedding-0.6B-int8-ov
 ```
 
 **On Bare Metal (Windows/Linux)**
@@ -40,14 +40,14 @@ ovms --add_to_config --config_path /models/config.json --model_name OpenVINO/Qwe
 ```bash
 docker run --user $(id -u):$(id -g) --rm -v $(pwd)/models:/models:rw openvino/model_server:latest --pull --model_repository_path /models --source_model OpenVINO/bge-base-en-v1.5-int8-ov --pooling CLS --task embeddings
 
-docker run --user $(id -u):$(id -g) --rm -v $(pwd)/models:/models:rw openvino/model_server:latest --add_to_config --config_path /models/config.json --model_name OpenVINO/bge-base-en-v1.5-int8-ov --model_path OpenVINO/bge-base-en-v1.5-int8-ov
+docker run --user $(id -u):$(id -g) --rm -v $(pwd)/models:/models:rw openvino/model_server:latest --add_to_config --config_path /models/config.json --model_name bge-base-en-v1.5-int8-ov --model_path OpenVINO/bge-base-en-v1.5-int8-ov
 ```
 
 **On Bare Metal (Windows/Linux)**
 ```console
 ovms --pull --model_repository_path /models --source_model OpenVINO/bge-base-en-v1.5-int8-ov --pooling CLS --task embeddings
 
-ovms --add_to_config --config_path /models/config.json --model_name OpenVINO/bge-base-en-v1.5-int8-ov --model_path OpenVINO/bge-base-en-v1.5-int8-ov
+ovms --add_to_config --config_path /models/config.json --model_name bge-base-en-v1.5-int8-ov --model_path OpenVINO/bge-base-en-v1.5-int8-ov
 ```
 :::
 ::::
@@ -60,7 +60,7 @@ ovms --add_to_config --config_path /models/config.json --model_name OpenVINO/bge
 ```bash
 docker run --user $(id -u):$(id -g) --rm -v $(pwd)/models:/models:rw openvino/model_server:latest --pull --model_repository_path /models --source_model OpenVINO/Qwen3-Embedding-0.6B-int8-ov --pooling LAST --target_device GPU --task embeddings
 
-docker run --user $(id -u):$(id -g) --rm -v $(pwd)/models:/models:rw openvino/model_server:latest --add_to_config --config_path /models/config.json --model_name OpenVINO/Qwen3-Embedding-0.6B-int8-ov --model_path OpenVINO/Qwen3-Embedding-0.6B-int8-ov
+docker run --user $(id -u):$(id -g) --rm -v $(pwd)/models:/models:rw openvino/model_server:latest --add_to_config --config_path /models/config.json --model_name Qwen3-Embedding-0.6B-int8-ov --model_path OpenVINO/Qwen3-Embedding-0.6B-int8-ov
 ```
 
 **On Bare Metal (Windows/Linux)**
@@ -76,14 +76,14 @@ ovms --add_to_config --config_path /models/config.json --model_name OpenVINO/Qwe
 ```bash
 docker run --user $(id -u):$(id -g) --rm -v $(pwd)/models:/models:rw openvino/model_server:latest --pull --model_repository_path /models --source_model OpenVINO/bge-base-en-v1.5-int8-ov --pooling CLS --target_device GPU --task embeddings
 
-docker run --user $(id -u):$(id -g) --rm -v $(pwd)/models:/models:rw openvino/model_server:latest --add_to_config --config_path /models/config.json --model_name OpenVINO/bge-base-en-v1.5-int8-ov --model_path OpenVINO/bge-base-en-v1.5-int8-ov
+docker run --user $(id -u):$(id -g) --rm -v $(pwd)/models:/models:rw openvino/model_server:latest --add_to_config --config_path /models/config.json --model_name bge-base-en-v1.5-int8-ov --model_path OpenVINO/bge-base-en-v1.5-int8-ov
 ```
 
 **On Bare Metal (Windows/Linux)**
 ```console
 ovms --pull --model_repository_path /models --source_model OpenVINO/bge-base-en-v1.5-int8-ov --pooling CLS --target_device GPU --task embeddings
 
-ovms --add_to_config --config_path /models/config.json --model_name OpenVINO/bge-base-en-v1.5-int8-ov --model_path OpenVINO/bge-base-en-v1.5-int8-ov
+ovms --add_to_config --config_path /models/config.json --model_name bge-base-en-v1.5-int8-ov --model_path OpenVINO/bge-base-en-v1.5-int8-ov
 ```
 :::
 ::::
@@ -107,68 +107,68 @@ Run `export_model.py` script to download and quantize the model:
 :::{tab-item} BAAI/bge-large-en-v1.5
 :sync: bge-large-en-v1.5
 ```console
-python export_model.py embeddings_ov --source_model BAAI/bge-large-en-v1.5 --pooling CLS --weight-format int8 --config_file_path models/config.json --model_repository_path models
+python export_model.py embeddings_ov --source_model BAAI/bge-large-en-v1.5 --model_name bge-large-en-v1.5 --pooling CLS --weight-format int8 --config_file_path models/config.json --model_repository_path models
 ```
 :::
 :::{tab-item} BAAI/bge-large-zh-v1.5
 :sync: bge-large-zh-v1.5
 ```console
-python export_model.py embeddings_ov --source_model BAAI/bge-large-zh-v1.5 --pooling CLS --weight-format int8 --config_file_path models/config.json --model_repository_path models
+python export_model.py embeddings_ov --source_model BAAI/bge-large-zh-v1.5 --model_name bge-large-zh-v1.5 --pooling CLS --weight-format int8 --config_file_path models/config.json --model_repository_path models
 ```
 :::
 :::{tab-item} thenlper/gte-small
 :sync: gte-small
 ```console
-python export_model.py embeddings_ov --source_model thenlper/gte-small --pooling CLS --weight-format int8 --config_file_path models/config.json --model_repository_path models
+python export_model.py embeddings_ov --source_model thenlper/gte-small --model_name gte-small --pooling CLS --weight-format int8 --config_file_path models/config.json --model_repository_path models
 ```
 :::
 :::{tab-item} sentence-transformers/all-MiniLM-L12-v2
 :sync: all-MiniLM-L12-v2
 ```console
-python export_model.py embeddings_ov --source_model sentence-transformers/all-MiniLM-L12-v2 --pooling MEAN --weight-format int8 --config_file_path models/config.json --model_repository_path models
+python export_model.py embeddings_ov --source_model sentence-transformers/all-MiniLM-L12-v2 --model_name all-MiniLM-L12-v2 --pooling MEAN --weight-format int8 --config_file_path models/config.json --model_repository_path models
 ```
 :::
 :::{tab-item} sentence-transformers/all-distilroberta-v1
 :sync: all-distilroberta-v1
 ```console
-python export_model.py embeddings_ov --source_model sentence-transformers/all-distilroberta-v1 --pooling MEAN --weight-format int8 --config_file_path models/config.json --model_repository_path models
+python export_model.py embeddings_ov --source_model sentence-transformers/all-distilroberta-v1 --model_name all-distilroberta-v1 --pooling MEAN --weight-format int8 --config_file_path models/config.json --model_repository_path models
 ```
 :::
 :::{tab-item} mixedbread-ai/deepset-mxbai-embed-de-large-v1
 :sync: deepset-mxbai-embed-de-large-v1
 ```console
-python export_model.py embeddings_ov --source_model mixedbread-ai/deepset-mxbai-embed-de-large-v1 --pooling MEAN --weight-format int8 --config_file_path models/config.json --model_repository_path models
+python export_model.py embeddings_ov --source_model mixedbread-ai/deepset-mxbai-embed-de-large-v1 --model_name deepset-mxbai-embed-de-large-v1 --pooling MEAN --weight-format int8 --config_file_path models/config.json --model_repository_path models
 ```
 :::
 :::{tab-item} intfloat/multilingual-e5-large-instruct
 :sync: multilingual-e5-large-instruc
 ```console
-python export_model.py embeddings_ov --source_model intfloat/multilingual-e5-large-instruct --pooling MEAN --weight-format int8 --config_file_path models/config.json --model_repository_path models
+python export_model.py embeddings_ov --source_model intfloat/multilingual-e5-large-instruct --model_name multilingual-e5-large-instruct --pooling MEAN --weight-format int8 --config_file_path models/config.json --model_repository_path models
 ```
 :::
 :::{tab-item} intfloat/multilingual-e5-large
 :sync: multilingual-e5-large
 ```console
-python export_model.py embeddings_ov --source_model intfloat/multilingual-e5-large --pooling MEAN --weight-format int8 --config_file_path models/config.json --model_repository_path models
+python export_model.py embeddings_ov --source_model intfloat/multilingual-e5-large --model_name multilingual-e5-large --pooling MEAN --weight-format int8 --config_file_path models/config.json --model_repository_path models
 ```
 :::
 :::{tab-item} Alibaba-NLP/gte-large-en-v1.5
 :sync: gte-large-en-v1.5
 ```text
 pip install "transformers<5"  # WA for optimum-intel and model support for new transformers
-python export_model.py embeddings_ov --source_model Alibaba-NLP/gte-large-en-v1.5 --pooling CLS --extra_quantization_params "--library sentence_transformers"
+python export_model.py embeddings_ov --source_model Alibaba-NLP/gte-large-en-v1.5 --model_name gte-large-en-v1.5 --pooling CLS --extra_quantization_params "--library sentence_transformers"
 ```
 :::
 :::{tab-item} nomic-ai/nomic-embed-text-v1.5
 :sync: nomic-embed-text-v1.5
 ```console
-python export_model.py embeddings_ov --source_model nomic-ai/nomic-embed-text-v1.5 --pooling MEAN --extra_quantization_params "--library sentence_transformers"
+python export_model.py embeddings_ov --source_model nomic-ai/nomic-embed-text-v1.5 --model_name nomic-embed-text-v1.5 --pooling MEAN --extra_quantization_params "--library sentence_transformers"
 ```
 :::
 :::{tab-item} sentence-transformers/all-mpnet-base-v2
 :sync: all-mpnet-base-v2
 ```console
-python export_model.py embeddings_ov --source_model sentence-transformers/all-mpnet-base-v2 --pooling MEAN
+python export_model.py embeddings_ov --source_model sentence-transformers/all-mpnet-base-v2 --model_name all-mpnet-base-v2 --pooling MEAN
 ```
 :::
 ::::
@@ -179,68 +179,68 @@ python export_model.py embeddings_ov --source_model sentence-transformers/all-mp
 :::{tab-item} BAAI/bge-large-en-v1.5
 :sync: bge-large-en-v1.5
 ```console
-python export_model.py embeddings_ov --source_model BAAI/bge-large-en-v1.5 --pooling CLS --weight-format int8 --target_device GPU --config_file_path models/config.json --model_repository_path models
+python export_model.py embeddings_ov --source_model BAAI/bge-large-en-v1.5 --model_name bge-large-en-v1.5 --pooling CLS --weight-format int8 --target_device GPU --config_file_path models/config.json --model_repository_path models
 ```
 :::
 :::{tab-item} BAAI/bge-large-zh-v1.5
 :sync: bge-large-zh-v1.5
 ```console
-python export_model.py embeddings_ov --source_model BAAI/bge-large-zh-v1.5 --pooling CLS --weight-format int8 --target_device GPU --config_file_path models/config.json --model_repository_path models
+python export_model.py embeddings_ov --source_model BAAI/bge-large-zh-v1.5 --model_name bge-large-zh-v1.5 --pooling CLS --weight-format int8 --target_device GPU --config_file_path models/config.json --model_repository_path models
 ```
 :::
 :::{tab-item} thenlper/gte-small
 :sync: gte-small
 ```console
-python export_model.py embeddings_ov --source_model thenlper/gte-small --pooling CLS --weight-format int8 --target_device GPU --config_file_path models/config.json --model_repository_path models
+python export_model.py embeddings_ov --source_model thenlper/gte-small --model_name gte-small --pooling CLS --weight-format int8 --target_device GPU --config_file_path models/config.json --model_repository_path models
 ```
 :::
 :::{tab-item} sentence-transformers/all-MiniLM-L12-v2
 :sync: all-MiniLM-L12-v2
 ```console
-python export_model.py embeddings_ov --source_model sentence-transformers/all-MiniLM-L12-v2 --pooling MEAN --weight-format int8 --target_device GPU --config_file_path models/config.json --model_repository_path models
+python export_model.py embeddings_ov --source_model sentence-transformers/all-MiniLM-L12-v2 --model_name all-MiniLM-L12-v2 --pooling MEAN --weight-format int8 --target_device GPU --config_file_path models/config.json --model_repository_path models
 ```
 :::
 :::{tab-item} sentence-transformers/all-distilroberta-v1
 :sync: all-distilroberta-v1
 ```console
-python export_model.py embeddings_ov --source_model sentence-transformers/all-distilroberta-v1 --pooling MEAN --weight-format int8 --target_device GPU --config_file_path models/config.json --model_repository_path models
+python export_model.py embeddings_ov --source_model sentence-transformers/all-distilroberta-v1 --model_name all-distilroberta-v1 --pooling MEAN --weight-format int8 --target_device GPU --config_file_path models/config.json --model_repository_path models
 ```
 :::
 :::{tab-item} mixedbread-ai/deepset-mxbai-embed-de-large-v1
 :sync: deepset-mxbai-embed-de-large-v1
 ```console
-python export_model.py embeddings_ov --source_model mixedbread-ai/deepset-mxbai-embed-de-large-v1 --pooling MEAN --weight-format int8 --target_device GPU --config_file_path models/config.json --model_repository_path models
+python export_model.py embeddings_ov --source_model mixedbread-ai/deepset-mxbai-embed-de-large-v1 --model_name deepset-mxbai-embed-de-large-v1 --pooling MEAN --weight-format int8 --target_device GPU --config_file_path models/config.json --model_repository_path models
 ```
 :::
 :::{tab-item} intfloat/multilingual-e5-large-instruct
 :sync: multilingual-e5-large-instruc
 ```console
-python export_model.py embeddings_ov --source_model intfloat/multilingual-e5-large-instruct --pooling MEAN --weight-format int8 --target_device GPU --config_file_path models/config.json --model_repository_path models
+python export_model.py embeddings_ov --source_model intfloat/multilingual-e5-large-instruct --model_name multilingual-e5-large-instruct --pooling MEAN --weight-format int8 --target_device GPU --config_file_path models/config.json --model_repository_path models
 ```
 :::
 :::{tab-item} intfloat/multilingual-e5-large
 :sync: multilingual-e5-large
 ```console
-python export_model.py embeddings_ov --source_model intfloat/multilingual-e5-large --pooling MEAN --weight-format int8 --target_device GPU --config_file_path models/config.json --model_repository_path models
+python export_model.py embeddings_ov --source_model intfloat/multilingual-e5-large --model_name multilingual-e5-large --pooling MEAN --weight-format int8 --target_device GPU --config_file_path models/config.json --model_repository_path models
 ```
 :::
 :::{tab-item} Alibaba-NLP/gte-large-en-v1.5
 :sync: gte-large-en-v1.5
 ```text
 pip install "transformers<5"  # WA for optimum-intel and model support for new transformers
-python export_model.py embeddings_ov --source_model Alibaba-NLP/gte-large-en-v1.5 --pooling CLS --weight-format int8 --target_device GPU --extra_quantization_params "--library sentence_transformers" --config_file_path models/config.json --model_repository_path models
+python export_model.py embeddings_ov --source_model Alibaba-NLP/gte-large-en-v1.5 --model_name gte-large-en-v1.5 --pooling CLS --weight-format int8 --target_device GPU --extra_quantization_params "--library sentence_transformers" --config_file_path models/config.json --model_repository_path models
 ```
 :::
 :::{tab-item} nomic-ai/nomic-embed-text-v1.5
 :sync: nomic-embed-text-v1.5
 ```console
-python export_model.py embeddings_ov --source_model nomic-ai/nomic-embed-text-v1.5 --pooling MEAN --weight-format int8 --target_device GPU --extra_quantization_params "--library sentence_transformers" --config_file_path models/config.json --model_repository_path models
+python export_model.py embeddings_ov --source_model nomic-ai/nomic-embed-text-v1.5 --model_name nomic-embed-text-v1.5 --pooling MEAN --weight-format int8 --target_device GPU --extra_quantization_params "--library sentence_transformers" --config_file_path models/config.json --model_repository_path models
 ```
 :::
 :::{tab-item} sentence-transformers/all-mpnet-base-v2
 :sync: all-mpnet-base-v2
 ```console
-python export_model.py embeddings_ov --source_model sentence-transformers/all-mpnet-base-v2 --pooling MEAN --weight-format int8 --target_device GPU --config_file_path models/config.json --model_repository_path models
+python export_model.py embeddings_ov --source_model sentence-transformers/all-mpnet-base-v2 --model_name all-mpnet-base-v2 --pooling MEAN --weight-format int8 --target_device GPU --config_file_path models/config.json --model_repository_path models
 ```
 :::
 ::::
@@ -294,7 +294,7 @@ For example:
 
 > **Note:** By default OVMS returns first token embeddings as sequence embeddings (called CLS pooling). It can be changed using `--pooling` option if needed by the model. Supported values are CLS, MEAN and LAST. For example:
 ```console
-python export_model.py embeddings_ov --source_model Qwen/Qwen3-Embedding-0.6B --weight-format fp16 --pooling LAST --config_file_path models/config.json
+python export_model.py embeddings_ov --source_model Qwen/Qwen3-Embedding-0.6B --weight-format fp16 --pooling LAST --model_name Qwen3-Embedding-0.6B --config_file_path models/config.json
 ```
 
 ## Tested models
@@ -361,18 +361,18 @@ ovms --rest_port 8000 --config_path ./models/config.json
 
 ### Readiness Check
 
-Wait for the model to load. You can check the status with a simple command below. Note that the slash `/` in the model name needs to be escaped with `%2F`:
+Wait for the model to load. You can check the status with a simple command below:
 ```bash
-curl http://localhost:8000/v3/models/BAAI%2Fbge-large-en-v1.5
+curl http://localhost:8000/v3/models/bge-large-en-v1.5
 
-{"id":"BAAI/bge-large-en-v1.5","object":"model","created":1763997378,"owned_by":"OVMS"}
+{"id":"bge-large-en-v1.5","object":"model","created":1763997378,"owned_by":"OVMS"}
 ```
 
 ## Client code
 
 :::{dropdown} **Request embeddings with cURL**
 ```bash
-curl http://localhost:8000/v3/embeddings -H "Content-Type: application/json" -d "{ \"model\": \"BAAI/bge-large-en-v1.5\", \"input\": \"hello world\"}"
+curl http://localhost:8000/v3/embeddings -H "Content-Type: application/json" -d "{ \"model\": \"bge-large-en-v1.5\", \"input\": \"hello world\"}"
 ```
 ```json
 {
@@ -414,7 +414,7 @@ client = OpenAI(
   base_url="http://localhost:8000/v3",
   api_key="unused"
 )
-model = "BAAI/bge-large-en-v1.5"
+model = "bge-large-en-v1.5"
 embedding_responses = client.embeddings.create(
     input=[
         "That is a happy person",
@@ -441,7 +441,7 @@ git clone https://github.com/openvinotoolkit/model_server
 pushd .
 cd model_server/demos/benchmark/v3/
 pip install -r requirements.txt
-python benchmark.py --api_url http://localhost:8000/v3/embeddings --dataset synthetic --synthetic_length 5 --request_rate 10 --batch_size 1 --model BAAI/bge-large-en-v1.5
+python benchmark.py --api_url http://localhost:8000/v3/embeddings --dataset synthetic --synthetic_length 5 --request_rate 10 --batch_size 1 --model bge-large-en-v1.5 --tokenizer BAAI/bge-large-en-v1.5
 Number of documents: 1000
 100%|██████████████████████████████████████| 1000/1000 [01:40<00:00,  9.92it/s]
 Tokens: 5000
@@ -452,7 +452,7 @@ Median latency: 12.43 ms
 Average document length: 5.0 tokens
 
 
-python benchmark.py --api_url http://localhost:8000/v3/embeddings --request_rate inf --batch_size 32 --dataset synthetic --synthetic_length 510 --model BAAI/bge-large-en-v1.5
+python benchmark.py --api_url http://localhost:8000/v3/embeddings --request_rate inf --batch_size 32 --dataset synthetic --synthetic_length 510 --model bge-large-en-v1.5 --tokenizer BAAI/bge-large-en-v1.5
 Number of documents: 1000
 100%|██████████████████████████████████████| 32/32 [00:13<00:00,  2.43it/s]
 Tokens: 510000
@@ -463,7 +463,7 @@ Median latency: 6790.46 ms
 Average document length: 510.0 tokens
 
 
-python benchmark.py --api_url http://localhost:8000/v3/embeddings --request_rate inf --batch_size 1 --dataset Cohere/wikipedia-2023-11-embed-multilingual-v3 --hf-subset simple --model BAAI/bge-large-en-v1.5
+python benchmark.py --api_url http://localhost:8000/v3/embeddings --request_rate inf --batch_size 1 --dataset Cohere/wikipedia-2023-11-embed-multilingual-v3 --hf-subset simple --model bge-large-en-v1.5 --tokenizer BAAI/bge-large-en-v1.5
 Number of documents: 1000
 100%|██████████████████████████████████████| 1000/1000 [00:11<00:00, 89.84it/s]
 Tokens: 66568
@@ -488,7 +488,7 @@ The script [compare_results.py](https://raw.githubusercontent.com/openvinotoolki
 ```bash
 popd
 cd model_server/demos/embeddings
-python compare_results.py --model BAAI/bge-large-en-v1.5 --service_url http://localhost:8000/v3/embeddings --pooling CLS --input "hello world" --input "goodbye world"
+python compare_results.py --model_name bge-large-en-v1.5 --hf_model_name BAAI/bge-large-en-v1.5 --service_url http://localhost:8000/v3/embeddings --pooling CLS --input "hello world" --input "goodbye world"
 
 input ['hello world', 'goodbye world']
 HF Duration: 93.921 ms BertModel
@@ -518,75 +518,75 @@ Difference score with HF AutoModel: 0.020293646680283224
 
 It is easy also to run model evaluation using [MTEB](https://github.com/embeddings-benchmark/mteb) framework using a custom class based on openai model:
 ```console
-pip install "mteb<2" einops openai --extra-index-url "https://download.pytorch.org/whl/cpu"
+pip install mteb tiktoken einops openai --extra-index-url "https://download.pytorch.org/whl/cpu"
 curl https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/embeddings/ovms_mteb.py -o ovms_mteb.py
-python ovms_mteb.py --model BAAI/bge-large-en-v1.5 --service_url http://localhost:8000/v3/embeddings
+python ovms_mteb.py --model_name bge-large-en-v1.5 --service_url http://localhost:8000/v3
 ```
 Results will be stored in `results` folder:
 ```json
 {
   "dataset_revision": "0fd18e25b25c072e09e0d92ab615fda904d66300",
   "task_name": "Banking77Classification",
-  "mteb_version": "1.39.7",
+  "mteb_version": "2.12.30",
   "scores": {
     "test": [
       {
-        "accuracy": 0.848636,
-        "f1": 0.842405,
-        "f1_weighted": 0.842405,
         "scores_per_experiment": [
           {
-            "accuracy": 0.842532,
-            "f1": 0.835091,
-            "f1_weighted": 0.835091
+            "accuracy": 0.835065,
+            "f1": 0.826769,
+            "f1_weighted": 0.826769
           },
           {
-            "accuracy": 0.851299,
-            "f1": 0.844622,
-            "f1_weighted": 0.844622
+            "accuracy": 0.841883,
+            "f1": 0.834859,
+            "f1_weighted": 0.834859
           },
           {
-            "accuracy": 0.849026,
-            "f1": 0.842238,
-            "f1_weighted": 0.842238
+            "accuracy": 0.841234,
+            "f1": 0.833533,
+            "f1_weighted": 0.833533
           },
           {
-            "accuracy": 0.853571,
-            "f1": 0.849815,
-            "f1_weighted": 0.849815
+            "accuracy": 0.843182,
+            "f1": 0.838652,
+            "f1_weighted": 0.838652
           },
           {
-            "accuracy": 0.846104,
-            "f1": 0.839,
-            "f1_weighted": 0.839
+            "accuracy": 0.839935,
+            "f1": 0.833928,
+            "f1_weighted": 0.833928
           },
           {
-            "accuracy": 0.849675,
-            "f1": 0.844259,
-            "f1_weighted": 0.844259
+            "accuracy": 0.84513,
+            "f1": 0.840801,
+            "f1_weighted": 0.840801
           },
           {
-            "accuracy": 0.846104,
-            "f1": 0.840343,
-            "f1_weighted": 0.840343
+            "accuracy": 0.836688,
+            "f1": 0.83106,
+            "f1_weighted": 0.83106
           },
           {
-            "accuracy": 0.846753,
-            "f1": 0.8397,
-            "f1_weighted": 0.8397
+            "accuracy": 0.836039,
+            "f1": 0.828459,
+            "f1_weighted": 0.828459
           },
           {
-            "accuracy": 0.853571,
-            "f1": 0.848239,
-            "f1_weighted": 0.848239
+            "accuracy": 0.842532,
+            "f1": 0.834954,
+            "f1_weighted": 0.834954
           },
           {
-            "accuracy": 0.847727,
-            "f1": 0.84074,
-            "f1_weighted": 0.84074
+            "accuracy": 0.833766,
+            "f1": 0.827428,
+            "f1_weighted": 0.827428
           }
         ],
-        "main_score": 0.848636,
+        "accuracy": 0.839545,
+        "f1": 0.833044,
+        "f1_weighted": 0.833044,
+        "main_score": 0.839545,
         "hf_subset": "default",
         "languages": [
           "eng-Latn"
@@ -594,7 +594,7 @@ Results will be stored in `results` folder:
       }
     ]
   },
-  "evaluation_time": 3841.1886789798737,
+  "evaluation_time": 484.35,
   "kg_co2_emissions": null
 }
 ```
diff --git a/demos/embeddings/ovms_mteb.py b/demos/embeddings/ovms_mteb.py
index ce1bb7af70..aa21df5888 100644
--- a/demos/embeddings/ovms_mteb.py
+++ b/demos/embeddings/ovms_mteb.py
@@ -16,65 +16,44 @@
 
 from __future__ import annotations
 
-import logging
-from functools import partial
-from typing import Any
+import argparse
 
-import numpy as np
 import mteb
-logger = logging.getLogger(__name__)
-import argparse
+from mteb.models.model_implementations.openai_models import OpenAIModel
+from openai import OpenAI
 
-parser = argparse.ArgumentParser(description='Compare embeddings responses from HF transformers, OVSentenceTransformer and OVMS')
-parser.add_argument('--service_url', required=False, default='http://localhost:6000/v3/embeddings',
+parser = argparse.ArgumentParser(description='Run MTEB benchmark against OVMS embeddings endpoint')
+parser.add_argument('--service_url', required=False, default='http://localhost:8000/v3/embeddings',
                     help='Specify url to embeddings endpoint. default:http://localhost:8000/v3/embeddings', dest='service_url')
 parser.add_argument('--model_name', default='Alibaba-NLP/gte-large-en-v1.5', help='Model name to query. default: Alibaba-NLP/gte-large-en-v1.5',
                     dest='model_name')
 parser.add_argument('--dataset', default='Banking77Classification', help='Dataset to benchmark. default: Banking77Classification',
                     dest='dataset')
+parser.add_argument('--embed_dim', type=int, default=None, help='Embedding dimension. Auto-detected if not provided.',
+                    dest='embed_dim')
+parser.add_argument('--max_tokens', type=int, default=999999, help='Max input tokens for truncation. default: 999999',
+                    dest='max_tokens')
 args = vars(parser.parse_args())
 
+client = OpenAI(base_url=args['service_url'], api_key="unused")
 
-class OVMSModel:
-    def __init__(self, model_name: str, base_url:str, embed_dim: int | None = None, **kwargs) -> None:
-        from openai import OpenAI
-
-        self._client = OpenAI(base_url=base_url,api_key="unused")
-        self._model_name = model_name
-        self._embed_dim = embed_dim
-
-    def encode(
-        self, sentences: list[str], **kwargs: Any
-    ) -> torch.Tensor | np.ndarray:
-        max_batch_size = 32
-        sublists = [
-            sentences[i : i + max_batch_size]
-            for i in range(0, len(sentences), max_batch_size)
-        ]
-        all_embeddings = []
-        for sublist in sublists:
-            response = self._client.embeddings.create(
-                input=sublist,
-                model=self._model_name,
-                encoding_format="float",
-                dimensions=self._embed_dim or NotGiven(),
-            )
-            all_embeddings.extend(self._to_numpy(response))
-
-        return np.array(all_embeddings)
-    def encode_queries(self, queries: list[str], **kwargs: Any) -> np.ndarray:
-        return self.encode(queries, **kwargs)
-
+embed_dim = args['embed_dim']
+if embed_dim is None:
+    resp = client.embeddings.create(input=['dim probe'], model=args['model_name'])
+    embed_dim = len(resp.data[0].embedding)
 
-    def _to_numpy(self, embedding_response) -> np.ndarray:
-        return np.array([e.embedding for e in embedding_response.data])
+model = OpenAIModel(
+    model_name=args['model_name'],
+    max_tokens=args['max_tokens'],
+    embed_dim=embed_dim,
+    client=client,
+)
 
-model = OVMSModel(args['model_name'], args['service_url'] ,1)
 tasks = mteb.get_task(args['dataset'])
 evaluation = mteb.MTEB(tasks=[tasks])
-evaluation.run(model,verbosity=3,overwrite_results=True,output_folder='results')
+evaluation.run(model, verbosity=3, overwrite_results=True, output_folder='results')
 # For full leaderboard tests set run:
 # benchmark = mteb.get_benchmark("MTEB(eng)")
 # evaluation = mteb.MTEB(tasks=benchmark)
-# evaluation.run(model,verbosity=3,overwrite_results=True,output_folder='results')
+# evaluation.run(model, verbosity=3, overwrite_results=True, output_folder='results')