Skip to content

Commit 50e3609

Browse files
author
LittleMouse
committed
[update] Reduce model loading time; optimize the model loading method.
1 parent cde5921 commit 50e3609

File tree

5 files changed

+29
-29
lines changed

5 files changed

+29
-29
lines changed

projects/llm_framework/main_cosy_voice/src/main.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -244,7 +244,7 @@ class llm_task {
244244
SLOGI("port_=%s model_id=%s content=%s", std::to_string(port_).c_str(),
245245
(base_model + std::string("tokenizer")).c_str(), prompt_.c_str());
246246

247-
std::this_thread::sleep_for(std::chrono::seconds(15));
247+
std::this_thread::sleep_for(std::chrono::seconds(5));
248248
};
249249

250250
auto process_field = [&](std::string &field, const char *name_for_log) -> bool {

projects/llm_framework/main_cosy_voice/src/runner/LLM.hpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,7 @@ class LLM {
149149
llama_layers[i].filename = axmodel_path;
150150

151151
if (!attr.b_dynamic_load_axmodel_layer) {
152-
int ret = llama_layers[i].layer.init(llama_layers[i].filename.c_str(), false);
152+
int ret = llama_layers[i].layer.init(llama_layers[i].filename.c_str(), true);
153153
if (ret != 0) {
154154
ALOGE("init axmodel(%s) failed", llama_layers[i].filename.c_str());
155155
return false;
@@ -172,12 +172,12 @@ class LLM {
172172
}
173173
}
174174

175-
int ret = llama_post.init(attr.filename_post_axmodel.c_str(), false);
175+
int ret = llama_post.init(attr.filename_post_axmodel.c_str(), true);
176176
if (ret != 0) {
177177
ALOGE("init post axmodel(%s) failed", attr.filename_post_axmodel.c_str());
178178
return false;
179179
}
180-
ret = llm_decoder.init(attr.filename_decoder_axmodel.c_str(), false);
180+
ret = llm_decoder.init(attr.filename_decoder_axmodel.c_str(), true);
181181
if (ret != 0) {
182182
ALOGE("init llm decoder axmodel(%s) failed", attr.filename_decoder_axmodel.c_str());
183183
return false;

projects/llm_framework/main_llm/src/runner/LLM.hpp

Lines changed: 4 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,7 @@ class LLM {
139139
llama_layers[i].filename = axmodel_path;
140140

141141
if (!attr.b_dynamic_load_axmodel_layer) {
142-
int ret = llama_layers[i].layer.init(llama_layers[i].filename.c_str(), false);
142+
int ret = llama_layers[i].layer.init(llama_layers[i].filename.c_str(), true);
143143
if (ret != 0) {
144144
ALOGE("init axmodel(%s) failed", llama_layers[i].filename.c_str());
145145
return false;
@@ -162,7 +162,7 @@ class LLM {
162162
}
163163
}
164164

165-
int ret = llama_post.init(attr.filename_post_axmodel.c_str(), false);
165+
int ret = llama_post.init(attr.filename_post_axmodel.c_str(), true);
166166
if (ret != 0) {
167167
ALOGE("init post axmodel(%s) failed", attr.filename_post_axmodel.c_str());
168168
return false;
@@ -602,7 +602,7 @@ class LLM_CTX {
602602
sprintf(axmodel_path, attr.template_filename_axmodel.c_str(), i);
603603
llama_layers[i].filename = axmodel_path;
604604

605-
int ret = llama_layers[i].layer.init(llama_layers[i].filename.c_str(), false);
605+
int ret = llama_layers[i].layer.init(llama_layers[i].filename.c_str(), true);
606606
if (ret != 0) {
607607
ALOGE("init axmodel(%s) failed", llama_layers[i].filename.c_str());
608608
return false;
@@ -612,7 +612,7 @@ class LLM_CTX {
612612
update_cqdm(&cqdm, i + 2, "count", axmodel_path);
613613
}
614614

615-
int ret = llama_post.init(attr.filename_post_axmodel.c_str(), false);
615+
int ret = llama_post.init(attr.filename_post_axmodel.c_str(), true);
616616
if (ret != 0) {
617617
ALOGE("init post axmodel(%s) failed", attr.filename_post_axmodel.c_str());
618618
return false;
@@ -810,9 +810,6 @@ class LLM_CTX {
810810

811811
layer.layer.inference(prefill_grpid);
812812

813-
auto &input_decoder_k_cache = layer.layer.get_input(decode_grpid, "K_cache");
814-
auto &input_decoder_v_cache = layer.layer.get_input(decode_grpid, "V_cache");
815-
816813
auto &input_prefill_k_cache = layer.layer.get_input(prefill_grpid, "K_cache");
817814
auto &input_prefill_v_cache = layer.layer.get_input(prefill_grpid, "V_cache");
818815

@@ -821,12 +818,6 @@ class LLM_CTX {
821818

822819
int kv_offset = (p * _attr.prefill_token_num) * _attr.kv_cache_size;
823820

824-
memcpy((unsigned short *)input_decoder_k_cache.pVirAddr + kv_offset, (void *)output_k_cache.pVirAddr,
825-
sizeof(unsigned short) * _attr.prefill_token_num * _attr.kv_cache_size);
826-
827-
memcpy((unsigned short *)input_decoder_v_cache.pVirAddr + kv_offset, (void *)output_v_cache.pVirAddr,
828-
sizeof(unsigned short) * _attr.prefill_token_num * _attr.kv_cache_size);
829-
830821
memcpy((unsigned short *)input_prefill_k_cache.pVirAddr + kv_offset, (void *)output_k_cache.pVirAddr,
831822
sizeof(unsigned short) * _attr.prefill_token_num * _attr.kv_cache_size);
832823

projects/llm_framework/main_vlm/src/main.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -263,7 +263,7 @@ class llm_task {
263263
SLOGI("port_=%s model_id=%s content=%s", std::to_string(port_).c_str(),
264264
(base_model + std::string("tokenizer")).c_str(), prompt_.c_str());
265265

266-
std::this_thread::sleep_for(std::chrono::seconds(15));
266+
std::this_thread::sleep_for(std::chrono::seconds(5));
267267
};
268268

269269
auto process_field = [&](std::string &field, const char *name_for_log) -> bool {
@@ -291,7 +291,7 @@ class llm_task {
291291
model_type_ = ModelType::Qwen;
292292
else if (encoder_name.find("internvl3") != std::string::npos && mode_config_.precompute_len > 0)
293293
model_type_ = ModelType::InternVL_CTX;
294-
else if (encoder_name.find("internvl3") != std::string::npos)
294+
else if ((encoder_name.find("internvl3") != std::string::npos) || (encoder_name.find("vpm") != std::string::npos))
295295
model_type_ = ModelType::InternVL;
296296
else
297297
model_type_ = ModelType::Unknown;

projects/llm_framework/main_vlm/src/runner/LLM.hpp

Lines changed: 19 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,7 @@ class LLM {
155155
llama_layers[i].filename = axmodel_path;
156156

157157
if (!attr.b_dynamic_load_axmodel_layer) {
158-
int ret = llama_layers[i].layer.init(llama_layers[i].filename.c_str(), false);
158+
int ret = llama_layers[i].layer.init(llama_layers[i].filename.c_str(), true);
159159
if (ret != 0) {
160160
ALOGE("init axmodel(%s) failed", llama_layers[i].filename.c_str());
161161
return false;
@@ -178,7 +178,7 @@ class LLM {
178178
}
179179
}
180180

181-
int ret = llama_post.init(attr.filename_post_axmodel.c_str(), false);
181+
int ret = llama_post.init(attr.filename_post_axmodel.c_str(), true);
182182
if (ret != 0) {
183183
ALOGE("init post axmodel(%s) failed", attr.filename_post_axmodel.c_str());
184184
return false;
@@ -188,13 +188,13 @@ class LLM {
188188
update_cqdm(&cqdm, attr.axmodel_num + 2, "count", axmodel_path);
189189

190190
if (_attr.b_vpm_two_stage) {
191-
ret = vpm_encoder.init(attr.filename_vpm_encoder_axmodel.c_str(), false);
191+
ret = vpm_encoder.init(attr.filename_vpm_encoder_axmodel.c_str(), true);
192192
if (ret != 0) {
193193
ALOGE("init vpm axmodel(%s) failed", attr.filename_vpm_encoder_axmodel.c_str());
194194
return false;
195195
}
196196

197-
ret = vpm_resampler.init(attr.filename_vpm_resampler_axmodedl.c_str(), false);
197+
ret = vpm_resampler.init(attr.filename_vpm_resampler_axmodedl.c_str(), true);
198198
if (ret != 0) {
199199
ALOGE("init vpm axmodel(%s) failed", attr.filename_vpm_resampler_axmodedl.c_str());
200200
return false;
@@ -203,7 +203,7 @@ class LLM {
203203
_attr.vpm_height = vpm_encoder.get_input(0).vShape[1];
204204
_attr.vpm_width = vpm_encoder.get_input(0).vShape[2];
205205
} else {
206-
ret = vpm_resampler.init(attr.filename_vpm_resampler_axmodedl.c_str(), false);
206+
ret = vpm_resampler.init(attr.filename_vpm_resampler_axmodedl.c_str(), true);
207207
if (ret != 0) {
208208
ALOGE("init vpm axmodel(%s) failed", attr.filename_vpm_resampler_axmodedl.c_str());
209209
return false;
@@ -716,7 +716,7 @@ class LLM_CTX {
716716
sprintf(axmodel_path, attr.template_filename_axmodel.c_str(), i);
717717
llama_layers[i].filename = axmodel_path;
718718

719-
int ret = llama_layers[i].layer.init(llama_layers[i].filename.c_str(), false);
719+
int ret = llama_layers[i].layer.init(llama_layers[i].filename.c_str(), true);
720720
if (ret != 0) {
721721
ALOGE("init axmodel(%s) failed", llama_layers[i].filename.c_str());
722722
return false;
@@ -726,7 +726,7 @@ class LLM_CTX {
726726
update_cqdm(&cqdm, i + 2, "count", axmodel_path);
727727
}
728728

729-
int ret = llama_post.init(attr.filename_post_axmodel.c_str(), false);
729+
int ret = llama_post.init(attr.filename_post_axmodel.c_str(), true);
730730
if (ret != 0) {
731731
ALOGE("init post axmodel(%s) failed", attr.filename_post_axmodel.c_str());
732732
return false;
@@ -1773,7 +1773,7 @@ class LLM_Qwen {
17731773
llama_layers[i].filename = axmodel_path;
17741774

17751775
if (!attr.b_dynamic_load_axmodel_layer) {
1776-
int ret = llama_layers[i].layer.init(llama_layers[i].filename.c_str(), false);
1776+
int ret = llama_layers[i].layer.init(llama_layers[i].filename.c_str(), true);
17771777
if (ret != 0) {
17781778
ALOGE("init axmodel(%s) failed", llama_layers[i].filename.c_str());
17791779
return false;
@@ -1796,7 +1796,7 @@ class LLM_Qwen {
17961796
}
17971797
}
17981798

1799-
int ret = llama_post.init(attr.filename_post_axmodel.c_str(), false);
1799+
int ret = llama_post.init(attr.filename_post_axmodel.c_str(), true);
18001800
if (ret != 0) {
18011801
ALOGE("init post axmodel(%s) failed", attr.filename_post_axmodel.c_str());
18021802
return false;
@@ -1805,7 +1805,7 @@ class LLM_Qwen {
18051805
sprintf(axmodel_path, "init post axmodel ok,remain_cmm(%d MB)", remain_cmm);
18061806
update_cqdm(&cqdm, attr.axmodel_num + 2, "count", axmodel_path);
18071807

1808-
ret = image_encoder.init(attr.filename_image_encoder_axmodel.c_str(), false);
1808+
ret = image_encoder.init(attr.filename_image_encoder_axmodel.c_str(), true);
18091809
if (ret != 0) {
18101810
ALOGE("init image_encoder axmodel(%s) failed", attr.filename_image_encoder_axmodel.c_str());
18111811
return false;
@@ -2249,11 +2249,20 @@ class LLM_Qwen {
22492249

22502250
layer.layer.inference(_attr.prefill_grpid);
22512251

2252+
auto &input_decoder_k_cache = layer.layer.get_input(decode_grpid, "K_cache");
2253+
auto &input_decoder_v_cache = layer.layer.get_input(decode_grpid, "V_cache");
2254+
22522255
auto &output_k_cache = layer.layer.get_output(_attr.prefill_grpid, "K_cache_out");
22532256
auto &output_v_cache = layer.layer.get_output(_attr.prefill_grpid, "V_cache_out");
22542257

22552258
int kv_offset = (_attr.precompute_len + p * _attr.prefill_token_num) * _attr.kv_cache_size;
22562259

2260+
memcpy((unsigned short *)input_decoder_k_cache.pVirAddr + kv_offset, (void *)output_k_cache.pVirAddr,
2261+
sizeof(unsigned short) * input_num_token * _attr.kv_cache_size);
2262+
2263+
memcpy((unsigned short *)input_decoder_v_cache.pVirAddr + kv_offset, (void *)output_v_cache.pVirAddr,
2264+
sizeof(unsigned short) * input_num_token * _attr.kv_cache_size);
2265+
22572266
for (int gid = _attr.prefill_grpid + 1; gid < prefill_split_num + 1; gid++) {
22582267
auto &input_prefill_k_cache = layer.layer.get_input(gid, "K_cache");
22592268

0 commit comments

Comments (0)