Skip to content

Commit 50e3609

Browse files
author
LittleMouse
committed
[update] Reduce model loading time; optimize the model loading method.
1 parent cde5921 commit 50e3609

File tree

5 files changed

+29
-29
lines changed

5 files changed

+29
-29
lines changed

projects/llm_framework/main_cosy_voice/src/main.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -244,7 +244,7 @@ class llm_task {
244244
SLOGI("port_=%s model_id=%s content=%s", std::to_string(port_).c_str(),
245245
(base_model + std::string("tokenizer")).c_str(), prompt_.c_str());
246246

247-
std::this_thread::sleep_for(std::chrono::seconds(15));
247+
std::this_thread::sleep_for(std::chrono::seconds(5));
248248
};
249249

250250
auto process_field = [&](std::string &field, const char *name_for_log) -> bool {

projects/llm_framework/main_cosy_voice/src/runner/LLM.hpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,7 @@ class LLM {
149149
llama_layers[i].filename = axmodel_path;
150150

151151
if (!attr.b_dynamic_load_axmodel_layer) {
152-
int ret = llama_layers[i].layer.init(llama_layers[i].filename.c_str(), false);
152+
int ret = llama_layers[i].layer.init(llama_layers[i].filename.c_str(), true);
153153
if (ret != 0) {
154154
ALOGE("init axmodel(%s) failed", llama_layers[i].filename.c_str());
155155
return false;
@@ -172,12 +172,12 @@ class LLM {
172172
}
173173
}
174174

175-
int ret = llama_post.init(attr.filename_post_axmodel.c_str(), false);
175+
int ret = llama_post.init(attr.filename_post_axmodel.c_str(), true);
176176
if (ret != 0) {
177177
ALOGE("init post axmodel(%s) failed", attr.filename_post_axmodel.c_str());
178178
return false;
179179
}
180-
ret = llm_decoder.init(attr.filename_decoder_axmodel.c_str(), false);
180+
ret = llm_decoder.init(attr.filename_decoder_axmodel.c_str(), true);
181181
if (ret != 0) {
182182
ALOGE("init llm decoder axmodel(%s) failed", attr.filename_decoder_axmodel.c_str());
183183
return false;

projects/llm_framework/main_llm/src/runner/LLM.hpp

Lines changed: 4 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,7 @@ class LLM {
139139
llama_layers[i].filename = axmodel_path;
140140

141141
if (!attr.b_dynamic_load_axmodel_layer) {
142-
int ret = llama_layers[i].layer.init(llama_layers[i].filename.c_str(), false);
142+
int ret = llama_layers[i].layer.init(llama_layers[i].filename.c_str(), true);
143143
if (ret != 0) {
144144
ALOGE("init axmodel(%s) failed", llama_layers[i].filename.c_str());
145145
return false;
@@ -162,7 +162,7 @@ class LLM {
162162
}
163163
}
164164

165-
int ret = llama_post.init(attr.filename_post_axmodel.c_str(), false);
165+
int ret = llama_post.init(attr.filename_post_axmodel.c_str(), true);
166166
if (ret != 0) {
167167
ALOGE("init post axmodel(%s) failed", attr.filename_post_axmodel.c_str());
168168
return false;
@@ -602,7 +602,7 @@ class LLM_CTX {
602602
sprintf(axmodel_path, attr.template_filename_axmodel.c_str(), i);
603603
llama_layers[i].filename = axmodel_path;
604604

605-
int ret = llama_layers[i].layer.init(llama_layers[i].filename.c_str(), false);
605+
int ret = llama_layers[i].layer.init(llama_layers[i].filename.c_str(), true);
606606
if (ret != 0) {
607607
ALOGE("init axmodel(%s) failed", llama_layers[i].filename.c_str());
608608
return false;
@@ -612,7 +612,7 @@ class LLM_CTX {
612612
update_cqdm(&cqdm, i + 2, "count", axmodel_path);
613613
}
614614

615-
int ret = llama_post.init(attr.filename_post_axmodel.c_str(), false);
615+
int ret = llama_post.init(attr.filename_post_axmodel.c_str(), true);
616616
if (ret != 0) {
617617
ALOGE("init post axmodel(%s) failed", attr.filename_post_axmodel.c_str());
618618
return false;
@@ -810,9 +810,6 @@ class LLM_CTX {
810810

811811
layer.layer.inference(prefill_grpid);
812812

813-
auto &input_decoder_k_cache = layer.layer.get_input(decode_grpid, "K_cache");
814-
auto &input_decoder_v_cache = layer.layer.get_input(decode_grpid, "V_cache");
815-
816813
auto &input_prefill_k_cache = layer.layer.get_input(prefill_grpid, "K_cache");
817814
auto &input_prefill_v_cache = layer.layer.get_input(prefill_grpid, "V_cache");
818815

@@ -821,12 +818,6 @@ class LLM_CTX {
821818

822819
int kv_offset = (p * _attr.prefill_token_num) * _attr.kv_cache_size;
823820

824-
memcpy((unsigned short *)input_decoder_k_cache.pVirAddr + kv_offset, (void *)output_k_cache.pVirAddr,
825-
sizeof(unsigned short) * _attr.prefill_token_num * _attr.kv_cache_size);
826-
827-
memcpy((unsigned short *)input_decoder_v_cache.pVirAddr + kv_offset, (void *)output_v_cache.pVirAddr,
828-
sizeof(unsigned short) * _attr.prefill_token_num * _attr.kv_cache_size);
829-
830821
memcpy((unsigned short *)input_prefill_k_cache.pVirAddr + kv_offset, (void *)output_k_cache.pVirAddr,
831822
sizeof(unsigned short) * _attr.prefill_token_num * _attr.kv_cache_size);
832823

projects/llm_framework/main_vlm/src/main.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -263,7 +263,7 @@ class llm_task {
263263
SLOGI("port_=%s model_id=%s content=%s", std::to_string(port_).c_str(),
264264
(base_model + std::string("tokenizer")).c_str(), prompt_.c_str());
265265

266-
std::this_thread::sleep_for(std::chrono::seconds(15));
266+
std::this_thread::sleep_for(std::chrono::seconds(5));
267267
};
268268

269269
auto process_field = [&](std::string &field, const char *name_for_log) -> bool {
@@ -291,7 +291,7 @@ class llm_task {
291291
model_type_ = ModelType::Qwen;
292292
else if (encoder_name.find("internvl3") != std::string::npos && mode_config_.precompute_len > 0)
293293
model_type_ = ModelType::InternVL_CTX;
294-
else if (encoder_name.find("internvl3") != std::string::npos)
294+
else if ((encoder_name.find("internvl3") != std::string::npos) || (encoder_name.find("vpm") != std::string::npos))
295295
model_type_ = ModelType::InternVL;
296296
else
297297
model_type_ = ModelType::Unknown;

projects/llm_framework/main_vlm/src/runner/LLM.hpp

Lines changed: 19 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,7 @@ class LLM {
155155
llama_layers[i].filename = axmodel_path;
156156

157157
if (!attr.b_dynamic_load_axmodel_layer) {
158-
int ret = llama_layers[i].layer.init(llama_layers[i].filename.c_str(), false);
158+
int ret = llama_layers[i].layer.init(llama_layers[i].filename.c_str(), true);
159159
if (ret != 0) {
160160
ALOGE("init axmodel(%s) failed", llama_layers[i].filename.c_str());
161161
return false;
@@ -178,7 +178,7 @@ class LLM {
178178
}
179179
}
180180

181-
int ret = llama_post.init(attr.filename_post_axmodel.c_str(), false);
181+
int ret = llama_post.init(attr.filename_post_axmodel.c_str(), true);
182182
if (ret != 0) {
183183
ALOGE("init post axmodel(%s) failed", attr.filename_post_axmodel.c_str());
184184
return false;
@@ -188,13 +188,13 @@ class LLM {
188188
update_cqdm(&cqdm, attr.axmodel_num + 2, "count", axmodel_path);
189189

190190
if (_attr.b_vpm_two_stage) {
191-
ret = vpm_encoder.init(attr.filename_vpm_encoder_axmodel.c_str(), false);
191+
ret = vpm_encoder.init(attr.filename_vpm_encoder_axmodel.c_str(), true);
192192
if (ret != 0) {
193193
ALOGE("init vpm axmodel(%s) failed", attr.filename_vpm_encoder_axmodel.c_str());
194194
return false;
195195
}
196196

197-
ret = vpm_resampler.init(attr.filename_vpm_resampler_axmodedl.c_str(), false);
197+
ret = vpm_resampler.init(attr.filename_vpm_resampler_axmodedl.c_str(), true);
198198
if (ret != 0) {
199199
ALOGE("init vpm axmodel(%s) failed", attr.filename_vpm_resampler_axmodedl.c_str());
200200
return false;
@@ -203,7 +203,7 @@ class LLM {
203203
_attr.vpm_height = vpm_encoder.get_input(0).vShape[1];
204204
_attr.vpm_width = vpm_encoder.get_input(0).vShape[2];
205205
} else {
206-
ret = vpm_resampler.init(attr.filename_vpm_resampler_axmodedl.c_str(), false);
206+
ret = vpm_resampler.init(attr.filename_vpm_resampler_axmodedl.c_str(), true);
207207
if (ret != 0) {
208208
ALOGE("init vpm axmodel(%s) failed", attr.filename_vpm_resampler_axmodedl.c_str());
209209
return false;
@@ -716,7 +716,7 @@ class LLM_CTX {
716716
sprintf(axmodel_path, attr.template_filename_axmodel.c_str(), i);
717717
llama_layers[i].filename = axmodel_path;
718718

719-
int ret = llama_layers[i].layer.init(llama_layers[i].filename.c_str(), false);
719+
int ret = llama_layers[i].layer.init(llama_layers[i].filename.c_str(), true);
720720
if (ret != 0) {
721721
ALOGE("init axmodel(%s) failed", llama_layers[i].filename.c_str());
722722
return false;
@@ -726,7 +726,7 @@ class LLM_CTX {
726726
update_cqdm(&cqdm, i + 2, "count", axmodel_path);
727727
}
728728

729-
int ret = llama_post.init(attr.filename_post_axmodel.c_str(), false);
729+
int ret = llama_post.init(attr.filename_post_axmodel.c_str(), true);
730730
if (ret != 0) {
731731
ALOGE("init post axmodel(%s) failed", attr.filename_post_axmodel.c_str());
732732
return false;
@@ -1773,7 +1773,7 @@ class LLM_Qwen {
17731773
llama_layers[i].filename = axmodel_path;
17741774

17751775
if (!attr.b_dynamic_load_axmodel_layer) {
1776-
int ret = llama_layers[i].layer.init(llama_layers[i].filename.c_str(), false);
1776+
int ret = llama_layers[i].layer.init(llama_layers[i].filename.c_str(), true);
17771777
if (ret != 0) {
17781778
ALOGE("init axmodel(%s) failed", llama_layers[i].filename.c_str());
17791779
return false;
@@ -1796,7 +1796,7 @@ class LLM_Qwen {
17961796
}
17971797
}
17981798

1799-
int ret = llama_post.init(attr.filename_post_axmodel.c_str(), false);
1799+
int ret = llama_post.init(attr.filename_post_axmodel.c_str(), true);
18001800
if (ret != 0) {
18011801
ALOGE("init post axmodel(%s) failed", attr.filename_post_axmodel.c_str());
18021802
return false;
@@ -1805,7 +1805,7 @@ class LLM_Qwen {
18051805
sprintf(axmodel_path, "init post axmodel ok,remain_cmm(%d MB)", remain_cmm);
18061806
update_cqdm(&cqdm, attr.axmodel_num + 2, "count", axmodel_path);
18071807

1808-
ret = image_encoder.init(attr.filename_image_encoder_axmodel.c_str(), false);
1808+
ret = image_encoder.init(attr.filename_image_encoder_axmodel.c_str(), true);
18091809
if (ret != 0) {
18101810
ALOGE("init image_encoder axmodel(%s) failed", attr.filename_image_encoder_axmodel.c_str());
18111811
return false;
@@ -2249,11 +2249,20 @@ class LLM_Qwen {
22492249

22502250
layer.layer.inference(_attr.prefill_grpid);
22512251

2252+
auto &input_decoder_k_cache = layer.layer.get_input(decode_grpid, "K_cache");
2253+
auto &input_decoder_v_cache = layer.layer.get_input(decode_grpid, "V_cache");
2254+
22522255
auto &output_k_cache = layer.layer.get_output(_attr.prefill_grpid, "K_cache_out");
22532256
auto &output_v_cache = layer.layer.get_output(_attr.prefill_grpid, "V_cache_out");
22542257

22552258
int kv_offset = (_attr.precompute_len + p * _attr.prefill_token_num) * _attr.kv_cache_size;
22562259

2260+
memcpy((unsigned short *)input_decoder_k_cache.pVirAddr + kv_offset, (void *)output_k_cache.pVirAddr,
2261+
sizeof(unsigned short) * input_num_token * _attr.kv_cache_size);
2262+
2263+
memcpy((unsigned short *)input_decoder_v_cache.pVirAddr + kv_offset, (void *)output_v_cache.pVirAddr,
2264+
sizeof(unsigned short) * input_num_token * _attr.kv_cache_size);
2265+
22572266
for (int gid = _attr.prefill_grpid + 1; gid < prefill_split_num + 1; gid++) {
22582267
auto &input_prefill_k_cache = layer.layer.get_input(gid, "K_cache");
22592268

0 commit comments

Comments (0)