@@ -155,7 +155,7 @@ class LLM {
155155 llama_layers[i].filename = axmodel_path;
156156
157157 if (!attr.b_dynamic_load_axmodel_layer ) {
158- int ret = llama_layers[i].layer .init (llama_layers[i].filename .c_str (), false );
158+ int ret = llama_layers[i].layer .init (llama_layers[i].filename .c_str (), true );
159159 if (ret != 0 ) {
160160 ALOGE (" init axmodel(%s) failed" , llama_layers[i].filename .c_str ());
161161 return false ;
@@ -178,7 +178,7 @@ class LLM {
178178 }
179179 }
180180
181- int ret = llama_post.init (attr.filename_post_axmodel .c_str (), false );
181+ int ret = llama_post.init (attr.filename_post_axmodel .c_str (), true );
182182 if (ret != 0 ) {
183183 ALOGE (" init post axmodel(%s) failed" , attr.filename_post_axmodel .c_str ());
184184 return false ;
@@ -188,13 +188,13 @@ class LLM {
188188 update_cqdm (&cqdm, attr.axmodel_num + 2 , " count" , axmodel_path);
189189
190190 if (_attr.b_vpm_two_stage ) {
191- ret = vpm_encoder.init (attr.filename_vpm_encoder_axmodel .c_str (), false );
191+ ret = vpm_encoder.init (attr.filename_vpm_encoder_axmodel .c_str (), true );
192192 if (ret != 0 ) {
193193 ALOGE (" init vpm axmodel(%s) failed" , attr.filename_vpm_encoder_axmodel .c_str ());
194194 return false ;
195195 }
196196
197- ret = vpm_resampler.init (attr.filename_vpm_resampler_axmodedl .c_str (), false );
197+ ret = vpm_resampler.init (attr.filename_vpm_resampler_axmodedl .c_str (), true );
198198 if (ret != 0 ) {
199199 ALOGE (" init vpm axmodel(%s) failed" , attr.filename_vpm_resampler_axmodedl .c_str ());
200200 return false ;
@@ -203,7 +203,7 @@ class LLM {
203203 _attr.vpm_height = vpm_encoder.get_input (0 ).vShape [1 ];
204204 _attr.vpm_width = vpm_encoder.get_input (0 ).vShape [2 ];
205205 } else {
206- ret = vpm_resampler.init (attr.filename_vpm_resampler_axmodedl .c_str (), false );
206+ ret = vpm_resampler.init (attr.filename_vpm_resampler_axmodedl .c_str (), true );
207207 if (ret != 0 ) {
208208 ALOGE (" init vpm axmodel(%s) failed" , attr.filename_vpm_resampler_axmodedl .c_str ());
209209 return false ;
@@ -716,7 +716,7 @@ class LLM_CTX {
716716 sprintf (axmodel_path, attr.template_filename_axmodel .c_str (), i);
717717 llama_layers[i].filename = axmodel_path;
718718
719- int ret = llama_layers[i].layer .init (llama_layers[i].filename .c_str (), false );
719+ int ret = llama_layers[i].layer .init (llama_layers[i].filename .c_str (), true );
720720 if (ret != 0 ) {
721721 ALOGE (" init axmodel(%s) failed" , llama_layers[i].filename .c_str ());
722722 return false ;
@@ -726,7 +726,7 @@ class LLM_CTX {
726726 update_cqdm (&cqdm, i + 2 , " count" , axmodel_path);
727727 }
728728
729- int ret = llama_post.init (attr.filename_post_axmodel .c_str (), false );
729+ int ret = llama_post.init (attr.filename_post_axmodel .c_str (), true );
730730 if (ret != 0 ) {
731731 ALOGE (" init post axmodel(%s) failed" , attr.filename_post_axmodel .c_str ());
732732 return false ;
@@ -1773,7 +1773,7 @@ class LLM_Qwen {
17731773 llama_layers[i].filename = axmodel_path;
17741774
17751775 if (!attr.b_dynamic_load_axmodel_layer ) {
1776- int ret = llama_layers[i].layer .init (llama_layers[i].filename .c_str (), false );
1776+ int ret = llama_layers[i].layer .init (llama_layers[i].filename .c_str (), true );
17771777 if (ret != 0 ) {
17781778 ALOGE (" init axmodel(%s) failed" , llama_layers[i].filename .c_str ());
17791779 return false ;
@@ -1796,7 +1796,7 @@ class LLM_Qwen {
17961796 }
17971797 }
17981798
1799- int ret = llama_post.init (attr.filename_post_axmodel .c_str (), false );
1799+ int ret = llama_post.init (attr.filename_post_axmodel .c_str (), true );
18001800 if (ret != 0 ) {
18011801 ALOGE (" init post axmodel(%s) failed" , attr.filename_post_axmodel .c_str ());
18021802 return false ;
@@ -1805,7 +1805,7 @@ class LLM_Qwen {
18051805 sprintf (axmodel_path, " init post axmodel ok,remain_cmm(%d MB)" , remain_cmm);
18061806 update_cqdm (&cqdm, attr.axmodel_num + 2 , " count" , axmodel_path);
18071807
1808- ret = image_encoder.init (attr.filename_image_encoder_axmodel .c_str (), false );
1808+ ret = image_encoder.init (attr.filename_image_encoder_axmodel .c_str (), true );
18091809 if (ret != 0 ) {
18101810 ALOGE (" init image_encoder axmodel(%s) failed" , attr.filename_image_encoder_axmodel .c_str ());
18111811 return false ;
@@ -2249,11 +2249,20 @@ class LLM_Qwen {
22492249
22502250 layer.layer .inference (_attr.prefill_grpid );
22512251
2252+ auto &input_decoder_k_cache = layer.layer .get_input (decode_grpid, " K_cache" );
2253+ auto &input_decoder_v_cache = layer.layer .get_input (decode_grpid, " V_cache" );
2254+
22522255 auto &output_k_cache = layer.layer .get_output (_attr.prefill_grpid , " K_cache_out" );
22532256 auto &output_v_cache = layer.layer .get_output (_attr.prefill_grpid , " V_cache_out" );
22542257
22552258 int kv_offset = (_attr.precompute_len + p * _attr.prefill_token_num ) * _attr.kv_cache_size ;
22562259
2260+ memcpy ((unsigned short *)input_decoder_k_cache.pVirAddr + kv_offset, (void *)output_k_cache.pVirAddr ,
2261+ sizeof (unsigned short ) * input_num_token * _attr.kv_cache_size );
2262+
2263+ memcpy ((unsigned short *)input_decoder_v_cache.pVirAddr + kv_offset, (void *)output_v_cache.pVirAddr ,
2264+ sizeof (unsigned short ) * input_num_token * _attr.kv_cache_size );
2265+
22572266 for (int gid = _attr.prefill_grpid + 1 ; gid < prefill_split_num + 1 ; gid++) {
22582267 auto &input_prefill_k_cache = layer.layer .get_input (gid, " K_cache" );
22592268
0 commit comments