@@ -64,6 +64,7 @@ Llama::Llama() :
6464 _n_system_tokens(0 ),
6565 _is_gemma4(false ),
6666 _sampler_dirty(false ),
67+ _can_shift(false ),
6768 _seed(LLAMA_DEFAULT_SEED ) {
6869 llama_log_set ([](enum ggml_log_level level, const char *text, void *user_data) {
6970 Llama *llama = (Llama *)user_data;
@@ -103,6 +104,7 @@ Llama::Llama(Llama &&other) noexcept
103104 , _n_system_tokens(other._n_system_tokens)
104105 , _is_gemma4(other._is_gemma4)
105106 , _sampler_dirty(other._sampler_dirty)
107+ , _can_shift(other._can_shift)
106108 , _seed(other._seed) {
107109}
108110
@@ -179,9 +181,10 @@ bool Llama::load_model(string model_path, int n_ctx, int n_batch, int n_gpu_laye
179181 set_last_error (" Create context" );
180182 } else {
181183 _vocab = llama_model_get_vocab (_model);
184+ _template = llama_model_chat_template (_model, nullptr );
185+ _is_gemma4 = (_template.find (" <|turn>model" ) != string::npos);
186+ _can_shift = llama_memory_can_shift (llama_get_memory (_ctx));
182187 }
183- _template = llama_model_chat_template (_model, nullptr );
184- _is_gemma4 = (_template.find (" <|turn>model" ) != string::npos);
185188 }
186189
187190 return _last_error.empty ();
@@ -579,6 +582,10 @@ bool Llama::make_space_for_tokens(int n_tokens) {
579582 _last_error = " Can't make enough space while keeping num_system_tokens tokens" ;
580583 return false ;
581584 }
585+ if (!_can_shift) {
586+ _last_error = " Memory type doesn't support shifting, can't evict mid-sequence" ;
587+ return false ;
588+ }
582589
583590 llama_pos remove_start = pos_min + _n_system_tokens;
584591
0 commit comments