From 59fa97a7256038530cdd56924707701dd9b1fb82 Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Wed, 13 May 2026 17:06:36 +0200 Subject: [PATCH 1/6] added einops for embedding models and simplified accuracy description --- demos/common/export_models/README.md | 58 +------------------- demos/common/export_models/requirements.txt | 1 + demos/continuous_batching/accuracy/README.md | 36 ++++-------- 3 files changed, 14 insertions(+), 81 deletions(-) diff --git a/demos/common/export_models/README.md b/demos/common/export_models/README.md index 7f1276ff9e..68102cc321 100644 --- a/demos/common/export_models/README.md +++ b/demos/common/export_models/README.md @@ -40,62 +40,10 @@ For every use case subcommand there is adjusted list of parameters: ```console python export_model.py text_generation --help ``` -Expected Output: -```console -usage: export_model.py text_generation [-h] [--model_repository_path MODEL_REPOSITORY_PATH] --source_model SOURCE_MODEL [--model_name MODEL_NAME] [--weight-format PRECISION] [--config_file_path CONFIG_FILE_PATH] [--overwrite_models] [--target_device TARGET_DEVICE] [--ov_cache_dir OV_CACHE_DIR] - [--extra_quantization_params EXTRA_QUANTIZATION_PARAMS] [--pipeline_type {LM,LM_CB,VLM,VLM_CB,AUTO}] [--kv_cache_precision {u8}] [--enable_prefix_caching ENABLE_PREFIX_CACHING] [--disable_dynamic_split_fuse] [--max_num_batched_tokens MAX_NUM_BATCHED_TOKENS] [--max_num_seqs MAX_NUM_SEQS] - [--cache_size CACHE_SIZE] [--draft_source_model DRAFT_SOURCE_MODEL] [--draft_model_name DRAFT_MODEL_NAME] [--draft_eagle3_mode] [--max_prompt_len MAX_PROMPT_LEN] [--prompt_lookup_decoding] [--reasoning_parser {qwen3,gptoss}] - [--tool_parser {llama3,phi4,hermes3,mistral,qwen3coder,gptoss,devstral,lfm2}] [--enable_tool_guided_generation] -options: - -h, --help show this help message and exit - --model_repository_path MODEL_REPOSITORY_PATH - Where the model should be exported to - --source_model SOURCE_MODEL - HF model name or path to the local folder with 
PyTorch or OpenVINO model - --model_name MODEL_NAME - Model name that should be used in the deployment. Equal to source_model if HF model name is used - --weight-format PRECISION - precision of the exported model - --config_file_path CONFIG_FILE_PATH - path to the config file - --overwrite_models Overwrite the model if it already exists in the models repository - --target_device TARGET_DEVICE - CPU, GPU, NPU or HETERO, default is CPU - --ov_cache_dir OV_CACHE_DIR - Folder path for compilation cache to speedup initialization time - --extra_quantization_params EXTRA_QUANTIZATION_PARAMS - Add advanced quantization parameters. Check optimum-intel documentation. Example: "--sym --group-size -1 --ratio 1.0 --awq --scale-estimation --dataset wikitext2" - --pipeline_type {LM,LM_CB,VLM,VLM_CB,AUTO} - Type of the pipeline to be used. AUTO is used by default - --kv_cache_precision {u8} - u8 or empty (model default). Reduced kv cache precision to u8 lowers the cache size consumption. - --enable_prefix_caching ENABLE_PREFIX_CACHING - This algorithm is used to cache the prompt tokens. Default is True. - --disable_dynamic_split_fuse - The maximum number of tokens that can be batched together. - --max_num_batched_tokens MAX_NUM_BATCHED_TOKENS - empty or integer. The maximum number of tokens that can be batched together. - --max_num_seqs MAX_NUM_SEQS - 256 by default. The maximum number of sequences that can be processed together. - --cache_size CACHE_SIZE - KV cache size in GB. If not set, cache is allocated dynamically. - --draft_source_model DRAFT_SOURCE_MODEL - HF model name or path to the local folder with PyTorch or OpenVINO draft model. Using this option will create configuration for speculative decoding - --draft_model_name DRAFT_MODEL_NAME - Draft model name that should be used in the deployment. Equal to draft_source_model if HF model name is used. Available only in draft_source_model has been specified. 
- --draft_eagle3_mode Set this flag if you use EAGLE3 draft model for speculative decoding - --max_prompt_len MAX_PROMPT_LEN - Sets NPU specific property for maximum number of tokens in the prompt. Not effective if target device is not NPU - --prompt_lookup_decoding - Set pipeline to use prompt lookup decoding - --reasoning_parser {qwen3,gptoss} - Set the type of the reasoning parser for reasoning content extraction - --tool_parser {llama3,phi4,hermes3,mistral,qwen3coder,gptoss,devstral,lfm2} - Set the type of the tool parser for tool calls extraction - --enable_tool_guided_generation - Enables enforcing tool schema during generation. Requires setting tool_parser -``` +> Note: Exporting some models might require different transformers version than specified in requirements.txt Check [supported models](https://openvinotoolkit.github.io/openvino.genai/docs/supported-models/). If custom transformers version is required, install it afterwards via `pip install transformers==` + + ## Model Export Examples diff --git a/demos/common/export_models/requirements.txt b/demos/common/export_models/requirements.txt index 60cbf3d2fc..e016048a95 100644 --- a/demos/common/export_models/requirements.txt +++ b/demos/common/export_models/requirements.txt @@ -14,3 +14,4 @@ sentence_transformers==5.3.0 sentencepiece # Required by: transformers` torchvision requests +einops diff --git a/demos/continuous_batching/accuracy/README.md b/demos/continuous_batching/accuracy/README.md index e67d2b98af..1cde283bc0 100644 --- a/demos/continuous_batching/accuracy/README.md +++ b/demos/continuous_batching/accuracy/README.md @@ -14,33 +14,17 @@ Install the framework via pip: pip3 install --extra-index-url "https://download.pytorch.org/whl/cpu" lm_eval[api] langdetect immutabledict dotenv openai ``` -## Exporting the models -```bash -git clone https://github.com/openvinotoolkit/model_server.git -cd model_server -pip3 install -U -r demos/common/export_models/requirements.txt -mkdir models -python 
demos/common/export_models/export_model.py text_generation --source_model meta-llama/Meta-Llama-3.1-8B-Instruct --weight-format fp16 --kv_cache_precision u8 --config_file_path models/config.json --model_repository_path models -python demos/common/export_models/export_model.py text_generation --source_model meta-llama/Meta-Llama-3.1-8B --weight-format fp16 --kv_cache_precision u8 --config_file_path models/config.json --model_repository_path models -python demos/common/export_models/export_model.py text_generation --source_model OpenGVLab/InternVL2_5-8B --weight-format fp16 --config_file_path models/config.json --model_repository_path models -python demos/common/export_models/export_model.py text_generation --source_model Qwen/Qwen3-8B --model_name openvino-qwen3-8b-int8 --weight-format int8 --config_file_path models/config.json --model_repository_path models --tool_parser hermes3 --overwrite_models -``` - ## Starting the model server -### With Docker -```bash -docker run -d --rm -p 8000:8000 -v $(pwd)/models:/workspace:ro openvino/model_server:latest --rest_port 8000 --config_path /workspace/config.json -``` -### On Baremetal -```bash -ovms --rest_port 8000 --config_path ./models/config.json -``` +Examples of deploying LLM and VLM models are documented in other demos, such as: +- [Agentic usage for LLM models](../agentic_ai/README.md) +- [Using VLM models](../vlm/README.md) + ## Running the tests for LLM models -```bash +```text lm-eval --model local-chat-completions --tasks gsm8k --model_args model=meta-llama/Meta-Llama-3.1-8B-Instruct,base_url=http://localhost:8000/v3/chat/completions,num_concurrent=1,max_retries=3,tokenized_requests=False --verbosity DEBUG --log_samples --output_path test/ --seed 1 --apply_chat_template --limit 100 local-chat-completions ({'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'base_url': 'http://localhost:8000/v3/chat/completions', 'num_concurrent': 10, 'max_retries': 3, 'tokenized_requests': False}), gen_kwargs: ({}), limit: 100.0, 
num_fewshot: None, batch_size: 1 @@ -52,7 +36,7 @@ local-chat-completions ({'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'base While testing the non chat model and `completion` endpoint, the command would look like this: -```bash +```text lm-eval --model local-completions --tasks gsm8k --model_args model=meta-llama/Meta-Llama-3.1-8B,base_url=http://localhost:8000/v3/completions,num_concurrent=1,max_retries=3,tokenized_requests=False --verbosity DEBUG --log_samples --output_path results/ --seed 1 --limit 100 local-completions ({'model': 'meta-llama/Meta-Llama-3.1-8B', 'base_url': 'http://localhost:8000/v3/completions', 'num_concurrent': 10, 'max_retries': 3, 'tokenized_requests': False}), gen_kwargs: ({}), limit: 100.0, num_fewshot: None, batch_size: 1 @@ -64,11 +48,11 @@ local-completions ({'model': 'meta-llama/Meta-Llama-3.1-8B', 'base_url': 'http:/ Other examples are below: -```bash +```text lm-eval --model local-chat-completions --tasks leaderboard_ifeval --model_args model=meta-llama/Meta-Llama-3.1-8B-Instruct,base_url=http://localhost:8000/v3/chat/completions,num_concurrent=10,max_retries=3,tokenized_requests=False --verbosity DEBUG --log_samples --output_path test/ --seed 1 --limit 100 --apply_chat_template ``` -```bash +```text lm-eval --model local-completions --tasks wikitext --model_args model=meta-llama/Meta-Llama-3.1-8B,base_url=http://localhost:8000/v3/completions,num_concurrent=10,max_retries=3,tokenized_requests=False --verbosity DEBUG --log_samples --output_path test/ --seed 1 --limit 100 ``` @@ -76,7 +60,7 @@ lm-eval --model local-completions --tasks wikitext --model_args model=meta-llama Use [lmms-eval project](https://github.com/EvolvingLMMs-Lab/lmms-eval) - mme and mmmu_val tasks. -```bash +```text export OPENAI_BASE_URL=http://localhost:8000/v3 export OPENAI_API_KEY="unused" git clone https://github.com/EvolvingLMMs-Lab/lmms-eval @@ -85,7 +69,7 @@ git checkout 88b23e2bfa16a1edbc16e9e238ed82130b3a4f56 pip install -e . 
--extra-index-url "https://download.pytorch.org/whl/cpu" python -m lmms_eval \ --model openai_compatible \ - --model_args model_version=OpenGVLab/InternVL2_5-8B,max_retries=1 \ + --model_args model_version=OpenVINO/InternVL2_5-8B_int4-ov,max_retries=1 \ --tasks mme,mmmu_val \ --batch_size 1 \ --log_samples \ From 4fb1e5b26bf36f659cbf1caa632fbc1acedc4581 Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Tue, 19 May 2026 11:03:26 +0200 Subject: [PATCH 2/6] review fixes --- demos/continuous_batching/accuracy/README.md | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/demos/continuous_batching/accuracy/README.md b/demos/continuous_batching/accuracy/README.md index 1cde283bc0..fd6c9a44a0 100644 --- a/demos/continuous_batching/accuracy/README.md +++ b/demos/continuous_batching/accuracy/README.md @@ -10,7 +10,7 @@ It reports end to end quality of served model from the client application point ## Preparing the lm-evaluation-harness framework Install the framework via pip: -```bash +```text pip3 install --extra-index-url "https://download.pytorch.org/whl/cpu" lm_eval[api] langdetect immutabledict dotenv openai ``` @@ -65,11 +65,10 @@ export OPENAI_BASE_URL=http://localhost:8000/v3 export OPENAI_API_KEY="unused" git clone https://github.com/EvolvingLMMs-Lab/lmms-eval cd lmms-eval -git checkout 88b23e2bfa16a1edbc16e9e238ed82130b3a4f56 pip install -e . --extra-index-url "https://download.pytorch.org/whl/cpu" python -m lmms_eval \ --model openai_compatible \ - --model_args model_version=OpenVINO/InternVL2_5-8B_int4-ov,max_retries=1 \ + --model_args model_version=OpenVINO/InternVL2-8B_int4-ov,max_retries=1 \ --tasks mme,mmmu_val \ --batch_size 1 \ --log_samples \ @@ -105,7 +104,7 @@ pip install -e . --extra-index-url "https://download.pytorch.org/whl/cpu" The commands below assumes the models is deployed with the name `ovms-model`. It must match the name set in the `bfcl_eval/constants/model_config.py`. 
```text export OPENAI_BASE_URL=http://localhost:8000/v3 -export CHAT_TEMPLATE_KWARGS='{"enable_thinking":false, "reasoning_effort":"low"}' +export CHAT_TEMPLATE_KWARGS='{"enable_thinking":false, "reasoning_effort":"low", "preserve_reasoning":false}' bfcl generate --model ovms-model --test-category simple_python,multiple --temperature 0.0 --num-threads 100 -o --result-dir model_name_dir bfcl evaluate --model ovms-model --result-dir model_name_dir @@ -114,7 +113,7 @@ bfcl evaluate --model ovms-model --result-dir model_name_dir Alternatively, use the model name `ovms-model-stream` to run the tests with stream requests. The results should be the same. ```text export OPENAI_BASE_URL=http://localhost:8000/v3 -bfcl generate --model ovms-model-stream --test-category simple_python,multiple --temperature 0.0 --num-threads 100 -o --result-dir model_name_dir +bfcl generate --model ovms-model-stream --test-category simple_python,multiple,multi_turn_base --temperature 0.0 --num-threads 10 -o --result-dir model_name_dir bfcl evaluate --model ovms-model-stream --result-dir model_name_dir ``` @@ -122,7 +121,7 @@ bfcl evaluate --model ovms-model-stream --result-dir model_name_dir The output artifacts will be stored in `result` and `scores`. For example: ```text -cat score/openvino-qwen3-8b-int4-FC/BFCL_v3_simple_python_score.json | head -1 +cat score/openvino-qwen3-8b-int4-FC/BFCL_v4_simple_python_score.json | head -1 {"accuracy": 0.95, "correct_count": 380, "total_count": 400} ``` Those results can be compared with the reference from the [berkeley leaderbaord](https://gorilla.cs.berkeley.edu/leaderboard.html#leaderboard). 
From f524f2f90d2f62a8580fdf154f4236c8cfc44357 Mon Sep 17 00:00:00 2001 From: "Trawinski, Dariusz" Date: Tue, 19 May 2026 11:06:37 +0200 Subject: [PATCH 3/6] Apply suggestions from code review Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- demos/common/export_models/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demos/common/export_models/README.md b/demos/common/export_models/README.md index 68102cc321..d0d30b0a22 100644 --- a/demos/common/export_models/README.md +++ b/demos/common/export_models/README.md @@ -41,7 +41,7 @@ For every use case subcommand there is adjusted list of parameters: python export_model.py text_generation --help ``` -> Note: Exporting some models might require different transformers version than specified in requirements.txt Check [supported models](https://openvinotoolkit.github.io/openvino.genai/docs/supported-models/). If custom transformers version is required, install it afterwards via `pip install transformers==` +> Note: Exporting some models might require different transformers version than specified in requirements.txt. Check [supported models](https://openvinotoolkit.github.io/openvino.genai/docs/supported-models/). 
If custom transformers version is required, install it afterwards via `pip install transformers==<version>` From 9f6683bb49969b924129dbf0f3c1a54407cf330e Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Tue, 19 May 2026 13:52:34 +0200 Subject: [PATCH 4/6] update readme --- demos/common/export_models/README.md | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/demos/common/export_models/README.md b/demos/common/export_models/README.md index 68102cc321..af2ca5c202 100644 --- a/demos/common/export_models/README.md +++ b/demos/common/export_models/README.md @@ -42,7 +42,12 @@ python export_model.py text_generation --help ``` > Note: Exporting some models might require different transformers version than specified in requirements.txt Check [supported models](https://openvinotoolkit.github.io/openvino.genai/docs/supported-models/). If custom transformers version is required, install it afterwards via `pip install transformers==` - +Some of the exceptions include: +- Alibaba-NLP/gte-large-en-v1.5 - `transformers<5.0` +- OpenGVLab/InternVL - `transformers<5.0` +- Qwen3-80B-Next and Qwen3-coder-next - `transformers<5.0` +- gemma4 - `transformers==5.5` +- Qwen3.5 and Qwen3.6 - `transformers==5.2` ## Model Export Examples @@ -100,7 +105,7 @@ python export_model.py embeddings_ov --source_model Qwen/Qwen3-Embedding-0.6B -- #### Embeddings with `sentence_transformers` library Some embedding models require special handling during export. 
For example: ```console -python export_model.py embeddings_ov --source_model Alibaba-NLP/gte-large-en-v1.5 --extra_quantization_params "--library sentence_transformers" --weight-format fp16 --config_file_path models/config_all.json +python export_model.py embeddings_ov --source_model nomic-ai/nomic-embed-text-v1.5 --extra_quantization_params "--library sentence_transformers" --pooling MEAN --weight-format fp16 --config_file_path models/config_all.json ``` Known models that require it: - Alibaba-NLP/gte-large-en-v1.5 From 1b627f4f3237b6da614075e214ee785ecdc6d2fd Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Tue, 19 May 2026 14:03:52 +0200 Subject: [PATCH 5/6] exception and skip tests for gte model --- demos/embeddings/README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/demos/embeddings/README.md b/demos/embeddings/README.md index 814e505515..2aa196ebea 100644 --- a/demos/embeddings/README.md +++ b/demos/embeddings/README.md @@ -154,7 +154,8 @@ python export_model.py embeddings_ov --source_model intfloat/multilingual-e5-lar ::: :::{tab-item} Alibaba-NLP/gte-large-en-v1.5 :sync: gte-large-en-v1.5 -```console +```text +pip install "transformers<5" # WA for optimum-intel and model support for new transformers python export_model.py embeddings_ov --source_model Alibaba-NLP/gte-large-en-v1.5 --pooling CLS --extra_quantization_params "--library sentence_transformers" ``` ::: @@ -225,7 +226,8 @@ python export_model.py embeddings_ov --source_model intfloat/multilingual-e5-lar ::: :::{tab-item} Alibaba-NLP/gte-large-en-v1.5 :sync: gte-large-en-v1.5 -```console +```text +pip install "transformers<5" # WA for optimum-intel and model support for new transformers python export_model.py embeddings_ov --source_model Alibaba-NLP/gte-large-en-v1.5 --pooling CLS --weight-format int8 --target_device GPU --extra_quantization_params "--library sentence_transformers" --config_file_path models/config.json --model_repository_path models ``` ::: From 
1b1eed274e7af549320a53b0ad02f265d50725ea Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Wed, 20 May 2026 11:45:50 +0200 Subject: [PATCH 6/6] update to latest mteb --- demos/embeddings/README.md | 152 +++++++++++++++++----------------- demos/embeddings/ovms_mteb.py | 65 +++++---------- 2 files changed, 98 insertions(+), 119 deletions(-) diff --git a/demos/embeddings/README.md b/demos/embeddings/README.md index 2aa196ebea..21da78ab03 100644 --- a/demos/embeddings/README.md +++ b/demos/embeddings/README.md @@ -24,7 +24,7 @@ This procedure can be used to pull preconfigured models from OpenVINO organizati ```bash mkdir -p models docker run --user $(id -u):$(id -g) --rm -v $(pwd)/models:/models:rw openvino/model_server:latest --pull --model_repository_path /models --source_model OpenVINO/Qwen3-Embedding-0.6B-int8-ov --pooling LAST --task embeddings -docker run --user $(id -u):$(id -g) --rm -v $(pwd)/models:/models:rw openvino/model_server:latest --add_to_config --config_path /models/config.json --model_name OpenVINO/Qwen3-Embedding-0.6B-int8-ov --model_path OpenVINO/Qwen3-Embedding-0.6B-int8-ov +docker run --user $(id -u):$(id -g) --rm -v $(pwd)/models:/models:rw openvino/model_server:latest --add_to_config --config_path /models/config.json --model_name Qwen3-Embedding-0.6B-int8-ov --model_path OpenVINO/Qwen3-Embedding-0.6B-int8-ov ``` **On Bare Metal (Windows/Linux)** @@ -40,14 +40,14 @@ ovms --add_to_config --config_path /models/config.json --model_name OpenVINO/Qwe ```bash docker run --user $(id -u):$(id -g) --rm -v $(pwd)/models:/models:rw openvino/model_server:latest --pull --model_repository_path /models --source_model OpenVINO/bge-base-en-v1.5-int8-ov --pooling CLS --task embeddings -docker run --user $(id -u):$(id -g) --rm -v $(pwd)/models:/models:rw openvino/model_server:latest --add_to_config --config_path /models/config.json --model_name OpenVINO/bge-base-en-v1.5-int8-ov --model_path OpenVINO/bge-base-en-v1.5-int8-ov +docker run --user $(id -u):$(id 
-g) --rm -v $(pwd)/models:/models:rw openvino/model_server:latest --add_to_config --config_path /models/config.json --model_name bge-base-en-v1.5-int8-ov --model_path OpenVINO/bge-base-en-v1.5-int8-ov ``` **On Bare Metal (Windows/Linux)** ```console ovms --pull --model_repository_path /models --source_model OpenVINO/bge-base-en-v1.5-int8-ov --pooling CLS --task embeddings -ovms --add_to_config --config_path /models/config.json --model_name OpenVINO/bge-base-en-v1.5-int8-ov --model_path OpenVINO/bge-base-en-v1.5-int8-ov +ovms --add_to_config --config_path /models/config.json --model_name bge-base-en-v1.5-int8-ov --model_path OpenVINO/bge-base-en-v1.5-int8-ov ``` ::: :::: @@ -60,7 +60,7 @@ ovms --add_to_config --config_path /models/config.json --model_name OpenVINO/bge ```bash docker run --user $(id -u):$(id -g) --rm -v $(pwd)/models:/models:rw openvino/model_server:latest --pull --model_repository_path /models --source_model OpenVINO/Qwen3-Embedding-0.6B-int8-ov --pooling LAST --target_device GPU --task embeddings -docker run --user $(id -u):$(id -g) --rm -v $(pwd)/models:/models:rw openvino/model_server:latest --add_to_config --config_path /models/config.json --model_name OpenVINO/Qwen3-Embedding-0.6B-int8-ov --model_path OpenVINO/Qwen3-Embedding-0.6B-int8-ov +docker run --user $(id -u):$(id -g) --rm -v $(pwd)/models:/models:rw openvino/model_server:latest --add_to_config --config_path /models/config.json --model_name Qwen3-Embedding-0.6B-int8-ov --model_path OpenVINO/Qwen3-Embedding-0.6B-int8-ov ``` **On Bare Metal (Windows/Linux)** @@ -76,14 +76,14 @@ ovms --add_to_config --config_path /models/config.json --model_name OpenVINO/Qwe ```bash docker run --user $(id -u):$(id -g) --rm -v $(pwd)/models:/models:rw openvino/model_server:latest --pull --model_repository_path /models --source_model OpenVINO/bge-base-en-v1.5-int8-ov --pooling CLS --target_device GPU --task embeddings -docker run --user $(id -u):$(id -g) --rm -v $(pwd)/models:/models:rw 
openvino/model_server:latest --add_to_config --config_path /models/config.json --model_name OpenVINO/bge-base-en-v1.5-int8-ov --model_path OpenVINO/bge-base-en-v1.5-int8-ov +docker run --user $(id -u):$(id -g) --rm -v $(pwd)/models:/models:rw openvino/model_server:latest --add_to_config --config_path /models/config.json --model_name bge-base-en-v1.5-int8-ov --model_path OpenVINO/bge-base-en-v1.5-int8-ov ``` **On Bare Metal (Windows/Linux)** ```console ovms --pull --model_repository_path /models --source_model OpenVINO/bge-base-en-v1.5-int8-ov --pooling CLS --target_device GPU --task embeddings -ovms --add_to_config --config_path /models/config.json --model_name OpenVINO/bge-base-en-v1.5-int8-ov --model_path OpenVINO/bge-base-en-v1.5-int8-ov +ovms --add_to_config --config_path /models/config.json --model_name bge-base-en-v1.5-int8-ov --model_path OpenVINO/bge-base-en-v1.5-int8-ov ``` ::: :::: @@ -107,68 +107,68 @@ Run `export_model.py` script to download and quantize the model: :::{tab-item} BAAI/bge-large-en-v1.5 :sync: bge-large-en-v1.5 ```console -python export_model.py embeddings_ov --source_model BAAI/bge-large-en-v1.5 --pooling CLS --weight-format int8 --config_file_path models/config.json --model_repository_path models +python export_model.py embeddings_ov --source_model BAAI/bge-large-en-v1.5 --model_name bge-large-en-v1.5 --pooling CLS --weight-format int8 --config_file_path models/config.json --model_repository_path models ``` ::: :::{tab-item} BAAI/bge-large-zh-v1.5 :sync: bge-large-zh-v1.5 ```console -python export_model.py embeddings_ov --source_model BAAI/bge-large-zh-v1.5 --pooling CLS --weight-format int8 --config_file_path models/config.json --model_repository_path models +python export_model.py embeddings_ov --source_model BAAI/bge-large-zh-v1.5 --model_name bge-large-zh-v1.5 --pooling CLS --weight-format int8 --config_file_path models/config.json --model_repository_path models ``` ::: :::{tab-item} thenlper/gte-small :sync: gte-small ```console 
-python export_model.py embeddings_ov --source_model thenlper/gte-small --pooling CLS --weight-format int8 --config_file_path models/config.json --model_repository_path models +python export_model.py embeddings_ov --source_model thenlper/gte-small --model_name gte-small --pooling CLS --weight-format int8 --config_file_path models/config.json --model_repository_path models ``` ::: :::{tab-item} sentence-transformers/all-MiniLM-L12-v2 :sync: all-MiniLM-L12-v2 ```console -python export_model.py embeddings_ov --source_model sentence-transformers/all-MiniLM-L12-v2 --pooling MEAN --weight-format int8 --config_file_path models/config.json --model_repository_path models +python export_model.py embeddings_ov --source_model sentence-transformers/all-MiniLM-L12-v2 --model_name all-MiniLM-L12-v2 --pooling MEAN --weight-format int8 --config_file_path models/config.json --model_repository_path models ``` ::: :::{tab-item} sentence-transformers/all-distilroberta-v1 :sync: all-distilroberta-v1 ```console -python export_model.py embeddings_ov --source_model sentence-transformers/all-distilroberta-v1 --pooling MEAN --weight-format int8 --config_file_path models/config.json --model_repository_path models +python export_model.py embeddings_ov --source_model sentence-transformers/all-distilroberta-v1 --model_name all-distilroberta-v1 --pooling MEAN --weight-format int8 --config_file_path models/config.json --model_repository_path models ``` ::: :::{tab-item} mixedbread-ai/deepset-mxbai-embed-de-large-v1 :sync: deepset-mxbai-embed-de-large-v1 ```console -python export_model.py embeddings_ov --source_model mixedbread-ai/deepset-mxbai-embed-de-large-v1 --pooling MEAN --weight-format int8 --config_file_path models/config.json --model_repository_path models +python export_model.py embeddings_ov --source_model mixedbread-ai/deepset-mxbai-embed-de-large-v1 --model_name deepset-mxbai-embed-de-large-v1 --pooling MEAN --weight-format int8 --config_file_path models/config.json 
--model_repository_path models ``` ::: :::{tab-item} intfloat/multilingual-e5-large-instruct :sync: multilingual-e5-large-instruc ```console -python export_model.py embeddings_ov --source_model intfloat/multilingual-e5-large-instruct --pooling MEAN --weight-format int8 --config_file_path models/config.json --model_repository_path models +python export_model.py embeddings_ov --source_model intfloat/multilingual-e5-large-instruct --model_name multilingual-e5-large-instruct --pooling MEAN --weight-format int8 --config_file_path models/config.json --model_repository_path models ``` ::: :::{tab-item} intfloat/multilingual-e5-large :sync: multilingual-e5-large ```console -python export_model.py embeddings_ov --source_model intfloat/multilingual-e5-large --pooling MEAN --weight-format int8 --config_file_path models/config.json --model_repository_path models +python export_model.py embeddings_ov --source_model intfloat/multilingual-e5-large --model_name multilingual-e5-large --pooling MEAN --weight-format int8 --config_file_path models/config.json --model_repository_path models ``` ::: :::{tab-item} Alibaba-NLP/gte-large-en-v1.5 :sync: gte-large-en-v1.5 ```text pip install "transformers<5" # WA for optimum-intel and model support for new transformers -python export_model.py embeddings_ov --source_model Alibaba-NLP/gte-large-en-v1.5 --pooling CLS --extra_quantization_params "--library sentence_transformers" +python export_model.py embeddings_ov --source_model Alibaba-NLP/gte-large-en-v1.5 --model_name gte-large-en-v1.5 --pooling CLS --extra_quantization_params "--library sentence_transformers" ``` ::: :::{tab-item} nomic-ai/nomic-embed-text-v1.5 :sync: nomic-embed-text-v1.5 ```console -python export_model.py embeddings_ov --source_model nomic-ai/nomic-embed-text-v1.5 --pooling MEAN --extra_quantization_params "--library sentence_transformers" +python export_model.py embeddings_ov --source_model nomic-ai/nomic-embed-text-v1.5 --model_name nomic-embed-text-v1.5 --pooling MEAN 
--extra_quantization_params "--library sentence_transformers" ``` ::: :::{tab-item} sentence-transformers/all-mpnet-base-v2 :sync: all-mpnet-base-v2 ```console -python export_model.py embeddings_ov --source_model sentence-transformers/all-mpnet-base-v2 --pooling MEAN +python export_model.py embeddings_ov --source_model sentence-transformers/all-mpnet-base-v2 --model_name all-mpnet-base-v2 --pooling MEAN ``` ::: :::: @@ -179,68 +179,68 @@ python export_model.py embeddings_ov --source_model sentence-transformers/all-mp :::{tab-item} BAAI/bge-large-en-v1.5 :sync: bge-large-en-v1.5 ```console -python export_model.py embeddings_ov --source_model BAAI/bge-large-en-v1.5 --pooling CLS --weight-format int8 --target_device GPU --config_file_path models/config.json --model_repository_path models +python export_model.py embeddings_ov --source_model BAAI/bge-large-en-v1.5 --model_name bge-large-en-v1.5 --pooling CLS --weight-format int8 --target_device GPU --config_file_path models/config.json --model_repository_path models ``` ::: :::{tab-item} BAAI/bge-large-zh-v1.5 :sync: bge-large-zh-v1.5 ```console -python export_model.py embeddings_ov --source_model BAAI/bge-large-zh-v1.5 --pooling CLS --weight-format int8 --target_device GPU --config_file_path models/config.json --model_repository_path models +python export_model.py embeddings_ov --source_model BAAI/bge-large-zh-v1.5 --model_name bge-large-zh-v1.5 --pooling CLS --weight-format int8 --target_device GPU --config_file_path models/config.json --model_repository_path models ``` ::: :::{tab-item} thenlper/gte-small :sync: gte-small ```console -python export_model.py embeddings_ov --source_model thenlper/gte-small --pooling CLS --weight-format int8 --target_device GPU --config_file_path models/config.json --model_repository_path models +python export_model.py embeddings_ov --source_model thenlper/gte-small --model_name gte-small --pooling CLS --weight-format int8 --target_device GPU --config_file_path models/config.json 
--model_repository_path models ``` ::: :::{tab-item} sentence-transformers/all-MiniLM-L12-v2 :sync: all-MiniLM-L12-v2 ```console -python export_model.py embeddings_ov --source_model sentence-transformers/all-MiniLM-L12-v2 --pooling MEAN --weight-format int8 --target_device GPU --config_file_path models/config.json --model_repository_path models +python export_model.py embeddings_ov --source_model sentence-transformers/all-MiniLM-L12-v2 --model_name all-MiniLM-L12-v2 --pooling MEAN --weight-format int8 --target_device GPU --config_file_path models/config.json --model_repository_path models ``` ::: :::{tab-item} sentence-transformers/all-distilroberta-v1 :sync: all-distilroberta-v1 ```console -python export_model.py embeddings_ov --source_model sentence-transformers/all-distilroberta-v1 --pooling MEAN --weight-format int8 --target_device GPU --config_file_path models/config.json --model_repository_path models +python export_model.py embeddings_ov --source_model sentence-transformers/all-distilroberta-v1 --model_name all-distilroberta-v1 --pooling MEAN --weight-format int8 --target_device GPU --config_file_path models/config.json --model_repository_path models ``` ::: :::{tab-item} mixedbread-ai/deepset-mxbai-embed-de-large-v1 :sync: deepset-mxbai-embed-de-large-v1 ```console -python export_model.py embeddings_ov --source_model mixedbread-ai/deepset-mxbai-embed-de-large-v1 --pooling MEAN --weight-format int8 --target_device GPU --config_file_path models/config.json --model_repository_path models +python export_model.py embeddings_ov --source_model mixedbread-ai/deepset-mxbai-embed-de-large-v1 --model_name deepset-mxbai-embed-de-large-v1 --pooling MEAN --weight-format int8 --target_device GPU --config_file_path models/config.json --model_repository_path models ``` ::: :::{tab-item} intfloat/multilingual-e5-large-instruct :sync: multilingual-e5-large-instruc ```console -python export_model.py embeddings_ov --source_model intfloat/multilingual-e5-large-instruct --pooling 
MEAN --weight-format int8 --target_device GPU --config_file_path models/config.json --model_repository_path models +python export_model.py embeddings_ov --source_model intfloat/multilingual-e5-large-instruct --model_name multilingual-e5-large-instruct --pooling MEAN --weight-format int8 --target_device GPU --config_file_path models/config.json --model_repository_path models ``` ::: :::{tab-item} intfloat/multilingual-e5-large :sync: multilingual-e5-large ```console -python export_model.py embeddings_ov --source_model intfloat/multilingual-e5-large --pooling MEAN --weight-format int8 --target_device GPU --config_file_path models/config.json --model_repository_path models +python export_model.py embeddings_ov --source_model intfloat/multilingual-e5-large --model_name multilingual-e5-large --pooling MEAN --weight-format int8 --target_device GPU --config_file_path models/config.json --model_repository_path models ``` ::: :::{tab-item} Alibaba-NLP/gte-large-en-v1.5 :sync: gte-large-en-v1.5 ```text pip install "transformers<5" # WA for optimum-intel and model support for new transformers -python export_model.py embeddings_ov --source_model Alibaba-NLP/gte-large-en-v1.5 --pooling CLS --weight-format int8 --target_device GPU --extra_quantization_params "--library sentence_transformers" --config_file_path models/config.json --model_repository_path models +python export_model.py embeddings_ov --source_model Alibaba-NLP/gte-large-en-v1.5 --model_name gte-large-en-v1.5 --pooling CLS --weight-format int8 --target_device GPU --extra_quantization_params "--library sentence_transformers" --config_file_path models/config.json --model_repository_path models ``` ::: :::{tab-item} nomic-ai/nomic-embed-text-v1.5 :sync: nomic-embed-text-v1.5 ```console -python export_model.py embeddings_ov --source_model nomic-ai/nomic-embed-text-v1.5 --pooling MEAN --weight-format int8 --target_device GPU --extra_quantization_params "--library sentence_transformers" --config_file_path 
models/config.json --model_repository_path models +python export_model.py embeddings_ov --source_model nomic-ai/nomic-embed-text-v1.5 --model_name nomic-embed-text-v1.5 --pooling MEAN --weight-format int8 --target_device GPU --extra_quantization_params "--library sentence_transformers" --config_file_path models/config.json --model_repository_path models ``` ::: :::{tab-item} sentence-transformers/all-mpnet-base-v2 :sync: all-mpnet-base-v2 ```console -python export_model.py embeddings_ov --source_model sentence-transformers/all-mpnet-base-v2 --pooling MEAN --weight-format int8 --target_device GPU --config_file_path models/config.json --model_repository_path models +python export_model.py embeddings_ov --source_model sentence-transformers/all-mpnet-base-v2 --model_name all-mpnet-base-v2 --pooling MEAN --weight-format int8 --target_device GPU --config_file_path models/config.json --model_repository_path models ``` ::: :::: @@ -294,7 +294,7 @@ For example: > **Note:** By default OVMS returns first token embeddings as sequence embeddings (called CLS pooling). It can be changed using `--pooling` option if needed by the model. Supported values are CLS, MEAN and LAST. For example: ```console -python export_model.py embeddings_ov --source_model Qwen/Qwen3-Embedding-0.6B --weight-format fp16 --pooling LAST --config_file_path models/config.json +python export_model.py embeddings_ov --source_model Qwen/Qwen3-Embedding-0.6B --weight-format fp16 --pooling LAST --model_name Qwen3-Embedding-0.6B --config_file_path models/config.json ``` ## Tested models @@ -361,18 +361,18 @@ ovms --rest_port 8000 --config_path ./models/config.json ### Readiness Check -Wait for the model to load. You can check the status with a simple command below. Note that the slash `/` in the model name needs to be escaped with `%2F`: +Wait for the model to load. 
You can check the status with a simple command below: ```bash -curl http://localhost:8000/v3/models/BAAI%2Fbge-large-en-v1.5 +curl http://localhost:8000/v3/models/bge-large-en-v1.5 -{"id":"BAAI/bge-large-en-v1.5","object":"model","created":1763997378,"owned_by":"OVMS"} +{"id":"bge-large-en-v1.5","object":"model","created":1763997378,"owned_by":"OVMS"} ``` ## Client code :::{dropdown} **Request embeddings with cURL** ```bash -curl http://localhost:8000/v3/embeddings -H "Content-Type: application/json" -d "{ \"model\": \"BAAI/bge-large-en-v1.5\", \"input\": \"hello world\"}" +curl http://localhost:8000/v3/embeddings -H "Content-Type: application/json" -d "{ \"model\": \"bge-large-en-v1.5\", \"input\": \"hello world\"}" ``` ```json { @@ -414,7 +414,7 @@ client = OpenAI( base_url="http://localhost:8000/v3", api_key="unused" ) -model = "BAAI/bge-large-en-v1.5" +model = "bge-large-en-v1.5" embedding_responses = client.embeddings.create( input=[ "That is a happy person", @@ -441,7 +441,7 @@ git clone https://github.com/openvinotoolkit/model_server pushd . 
cd model_server/demos/benchmark/v3/ pip install -r requirements.txt -python benchmark.py --api_url http://localhost:8000/v3/embeddings --dataset synthetic --synthetic_length 5 --request_rate 10 --batch_size 1 --model BAAI/bge-large-en-v1.5 +python benchmark.py --api_url http://localhost:8000/v3/embeddings --dataset synthetic --synthetic_length 5 --request_rate 10 --batch_size 1 --model bge-large-en-v1.5 --tokenizer BAAI/bge-large-en-v1.5 Number of documents: 1000 100%|██████████████████████████████████████| 1000/1000 [01:40<00:00, 9.92it/s] Tokens: 5000 @@ -452,7 +452,7 @@ Median latency: 12.43 ms Average document length: 5.0 tokens -python benchmark.py --api_url http://localhost:8000/v3/embeddings --request_rate inf --batch_size 32 --dataset synthetic --synthetic_length 510 --model BAAI/bge-large-en-v1.5 +python benchmark.py --api_url http://localhost:8000/v3/embeddings --request_rate inf --batch_size 32 --dataset synthetic --synthetic_length 510 --model bge-large-en-v1.5 --tokenizer BAAI/bge-large-en-v1.5 Number of documents: 1000 100%|██████████████████████████████████████| 32/32 [00:13<00:00, 2.43it/s] Tokens: 510000 @@ -463,7 +463,7 @@ Median latency: 6790.46 ms Average document length: 510.0 tokens -python benchmark.py --api_url http://localhost:8000/v3/embeddings --request_rate inf --batch_size 1 --dataset Cohere/wikipedia-2023-11-embed-multilingual-v3 --hf-subset simple --model BAAI/bge-large-en-v1.5 +python benchmark.py --api_url http://localhost:8000/v3/embeddings --request_rate inf --batch_size 1 --dataset Cohere/wikipedia-2023-11-embed-multilingual-v3 --hf-subset simple --model bge-large-en-v1.5 --tokenizer BAAI/bge-large-en-v1.5 Number of documents: 1000 100%|██████████████████████████████████████| 1000/1000 [00:11<00:00, 89.84it/s] Tokens: 66568 @@ -488,7 +488,7 @@ The script [compare_results.py](https://raw.githubusercontent.com/openvinotoolki ```bash popd cd model_server/demos/embeddings -python compare_results.py --model BAAI/bge-large-en-v1.5 
--service_url http://localhost:8000/v3/embeddings --pooling CLS --input "hello world" --input "goodbye world" +python compare_results.py --model_name bge-large-en-v1.5 --hf_model_name BAAI/bge-large-en-v1.5 --service_url http://localhost:8000/v3/embeddings --pooling CLS --input "hello world" --input "goodbye world" input ['hello world', 'goodbye world'] HF Duration: 93.921 ms BertModel @@ -518,75 +518,75 @@ Difference score with HF AutoModel: 0.020293646680283224 It is easy also to run model evaluation using [MTEB](https://github.com/embeddings-benchmark/mteb) framework using a custom class based on openai model: ```console -pip install "mteb<2" einops openai --extra-index-url "https://download.pytorch.org/whl/cpu" +pip install mteb tiktoken einops openai --extra-index-url "https://download.pytorch.org/whl/cpu" curl https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/embeddings/ovms_mteb.py -o ovms_mteb.py -python ovms_mteb.py --model BAAI/bge-large-en-v1.5 --service_url http://localhost:8000/v3/embeddings +python ovms_mteb.py --model_name bge-large-en-v1.5 --service_url http://localhost:8000/v3 ``` Results will be stored in `results` folder: ```json { "dataset_revision": "0fd18e25b25c072e09e0d92ab615fda904d66300", "task_name": "Banking77Classification", - "mteb_version": "1.39.7", + "mteb_version": "2.12.30", "scores": { "test": [ { - "accuracy": 0.848636, - "f1": 0.842405, - "f1_weighted": 0.842405, "scores_per_experiment": [ { - "accuracy": 0.842532, - "f1": 0.835091, - "f1_weighted": 0.835091 + "accuracy": 0.835065, + "f1": 0.826769, + "f1_weighted": 0.826769 }, { - "accuracy": 0.851299, - "f1": 0.844622, - "f1_weighted": 0.844622 + "accuracy": 0.841883, + "f1": 0.834859, + "f1_weighted": 0.834859 }, { - "accuracy": 0.849026, - "f1": 0.842238, - "f1_weighted": 0.842238 + "accuracy": 0.841234, + "f1": 0.833533, + "f1_weighted": 0.833533 }, { - "accuracy": 0.853571, - "f1": 0.849815, - "f1_weighted": 0.849815 + "accuracy": 0.843182, +
"f1": 0.838652, + "f1_weighted": 0.838652 }, { - "accuracy": 0.846104, - "f1": 0.839, - "f1_weighted": 0.839 + "accuracy": 0.839935, + "f1": 0.833928, + "f1_weighted": 0.833928 }, { - "accuracy": 0.849675, - "f1": 0.844259, - "f1_weighted": 0.844259 + "accuracy": 0.84513, + "f1": 0.840801, + "f1_weighted": 0.840801 }, { - "accuracy": 0.846104, - "f1": 0.840343, - "f1_weighted": 0.840343 + "accuracy": 0.836688, + "f1": 0.83106, + "f1_weighted": 0.83106 }, { - "accuracy": 0.846753, - "f1": 0.8397, - "f1_weighted": 0.8397 + "accuracy": 0.836039, + "f1": 0.828459, + "f1_weighted": 0.828459 }, { - "accuracy": 0.853571, - "f1": 0.848239, - "f1_weighted": 0.848239 + "accuracy": 0.842532, + "f1": 0.834954, + "f1_weighted": 0.834954 }, { - "accuracy": 0.847727, - "f1": 0.84074, - "f1_weighted": 0.84074 + "accuracy": 0.833766, + "f1": 0.827428, + "f1_weighted": 0.827428 } ], - "main_score": 0.848636, + "accuracy": 0.839545, + "f1": 0.833044, + "f1_weighted": 0.833044, + "main_score": 0.839545, "hf_subset": "default", "languages": [ "eng-Latn" @@ -594,7 +594,7 @@ Results will be stored in `results` folder: } ] }, - "evaluation_time": 3841.1886789798737, + "evaluation_time": 484.35, "kg_co2_emissions": null } ``` diff --git a/demos/embeddings/ovms_mteb.py b/demos/embeddings/ovms_mteb.py index ce1bb7af70..aa21df5888 100644 --- a/demos/embeddings/ovms_mteb.py +++ b/demos/embeddings/ovms_mteb.py @@ -16,65 +16,44 @@ from __future__ import annotations -import logging -from functools import partial -from typing import Any +import argparse -import numpy as np import mteb -logger = logging.getLogger(__name__) -import argparse +from mteb.models.model_implementations.openai_models import OpenAIModel +from openai import OpenAI -parser = argparse.ArgumentParser(description='Compare embeddings responses from HF transformers, OVSentenceTransformer and OVMS') -parser.add_argument('--service_url', required=False, default='http://localhost:6000/v3/embeddings', +parser = 
argparse.ArgumentParser(description='Run MTEB benchmark against OVMS embeddings endpoint') +parser.add_argument('--service_url', required=False, default='http://localhost:8000/v3/embeddings', help='Specify url to embeddings endpoint. default:http://localhost:8000/v3/embeddings', dest='service_url') parser.add_argument('--model_name', default='Alibaba-NLP/gte-large-en-v1.5', help='Model name to query. default: Alibaba-NLP/gte-large-en-v1.5', dest='model_name') parser.add_argument('--dataset', default='Banking77Classification', help='Dataset to benchmark. default: Banking77Classification', dest='dataset') +parser.add_argument('--embed_dim', type=int, default=None, help='Embedding dimension. Auto-detected if not provided.', + dest='embed_dim') +parser.add_argument('--max_tokens', type=int, default=999999, help='Max input tokens for truncation. default: 999999', + dest='max_tokens') args = vars(parser.parse_args()) +client = OpenAI(base_url=args['service_url'], api_key="unused") -class OVMSModel: - def __init__(self, model_name: str, base_url:str, embed_dim: int | None = None, **kwargs) -> None: - from openai import OpenAI - - self._client = OpenAI(base_url=base_url,api_key="unused") - self._model_name = model_name - self._embed_dim = embed_dim - - def encode( - self, sentences: list[str], **kwargs: Any - ) -> torch.Tensor | np.ndarray: - max_batch_size = 32 - sublists = [ - sentences[i : i + max_batch_size] - for i in range(0, len(sentences), max_batch_size) - ] - all_embeddings = [] - for sublist in sublists: - response = self._client.embeddings.create( - input=sublist, - model=self._model_name, - encoding_format="float", - dimensions=self._embed_dim or NotGiven(), - ) - all_embeddings.extend(self._to_numpy(response)) - - return np.array(all_embeddings) - def encode_queries(self, queries: list[str], **kwargs: Any) -> np.ndarray: - return self.encode(queries, **kwargs) - +embed_dim = args['embed_dim'] +if embed_dim is None: + resp = client.embeddings.create(input=['dim
probe'], model=args['model_name']) + embed_dim = len(resp.data[0].embedding) - def _to_numpy(self, embedding_response) -> np.ndarray: - return np.array([e.embedding for e in embedding_response.data]) +model = OpenAIModel( + model_name=args['model_name'], + max_tokens=args['max_tokens'], + embed_dim=embed_dim, + client=client, +) -model = OVMSModel(args['model_name'], args['service_url'] ,1) tasks = mteb.get_task(args['dataset']) evaluation = mteb.MTEB(tasks=[tasks]) -evaluation.run(model,verbosity=3,overwrite_results=True,output_folder='results') +evaluation.run(model, verbosity=3, overwrite_results=True, output_folder='results') # For full leaderboard tests set run: # benchmark = mteb.get_benchmark("MTEB(eng)") # evaluation = mteb.MTEB(tasks=benchmark) -# evaluation.run(model,verbosity=3,overwrite_results=True,output_folder='results') +# evaluation.run(model, verbosity=3, overwrite_results=True, output_folder='results')