@@ -1155,6 +1155,14 @@ struct llama_server_context
                slot.has_next_token = false;
            }

+           if (slot.n_past >= slot.n_ctx) {
+               slot.truncated      = true;
+               slot.stopped_limit  = true;
+               slot.has_next_token = false;
+
+               LOG_VERBOSE("stopped due to running out of context capacity", {});
+           }
+
            if (result.tok == llama_vocab_eos(vocab) || llama_vocab_is_eog(vocab, result.tok))
            {
                slot.stopped_eos = true;
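
The hunk above stops generation in process_token() as soon as the slot's position reaches its context size. A minimal, self-contained sketch of that rule follows; the slot_state struct, field defaults, and check_context_capacity() helper are simplified stand-ins chosen for illustration, not the actual LocalAI/llama.cpp types.

```cpp
#include <cstdio>

// Simplified stand-in for the server's per-slot state (hypothetical, for illustration).
struct slot_state {
    int  n_past         = 0;     // tokens already placed in this slot's context
    int  n_ctx          = 8;     // context capacity assumed for the example
    bool truncated      = false;
    bool stopped_limit  = false;
    bool has_next_token = true;
};

// Mirrors the added check: once n_past reaches n_ctx, mark the slot as stopped
// instead of letting it request tokens past its context capacity.
void check_context_capacity(slot_state &slot) {
    if (slot.n_past >= slot.n_ctx) {
        slot.truncated      = true;
        slot.stopped_limit  = true;
        slot.has_next_token = false;
    }
}

int main() {
    slot_state slot;
    // Pretend to generate one token per iteration; the loop exits as soon as
    // the capacity check clears has_next_token.
    for (int i = 0; i < 10 && slot.has_next_token; ++i) {
        slot.n_past++;
        check_context_capacity(slot);
    }
    std::printf("stopped at n_past=%d (stopped_limit=%d)\n",
                slot.n_past, (int) slot.stopped_limit);
    return 0;
}
```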
@@ -1627,17 +1635,17 @@ struct llama_server_context
            {
                if (slot.is_processing() && system_tokens.size() + slot.cache_tokens.size() >= (size_t) slot.n_ctx)
                {
+                   // this check is redundant (for good measure)
+                   // we should never get here, because generation should have already stopped in process_token()
+
                    // START LOCALAI changes
                    // Temporarily disable context-shifting as it can lead to infinite loops (issue: https://github.com/ggerganov/llama.cpp/issues/3969)
                    // See: https://github.com/mudler/LocalAI/issues/1333
                    // Context is exhausted, release the slot
                    slot.release();
                    send_final_response(slot);
-                   slot.cache_tokens.clear();
-                   slot.n_past = 0;
-                   slot.truncated = false;
-                   slot.has_next_token = true;
-                   LOG("Context exhausted. Slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size());
+                   slot.has_next_token = false;
+                   LOG_ERROR("context is exhausted, release the slot", {});

                    continue;
                    // END LOCALAI changes
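
This second hunk changes the fallback when the context is already exhausted in the update loop: instead of clearing the cache and leaving has_next_token = true (which could re-enter generation on the same slot), the slot is released and marked as having no further tokens. The sketch below contrasts only the new behavior, using hypothetical names (slot_state, handle_exhausted_context) rather than the server's real llama_client_slot type.

```cpp
#include <cstdio>

// Simplified stand-in for a server slot (hypothetical, for illustration).
struct slot_state {
    bool processing     = true;
    bool has_next_token = true;
    int  cache_tokens   = 0;   // tokens already cached for this slot
    int  n_ctx          = 4;   // context capacity assumed for the example

    void release() { processing = false; }
};

// New behavior: once the cached tokens reach n_ctx, release the slot and stop
// requesting tokens, so the outer loop cannot spin on the same exhausted slot.
void handle_exhausted_context(slot_state &slot) {
    if (slot.processing && slot.cache_tokens >= slot.n_ctx) {
        slot.release();
        slot.has_next_token = false;
        std::fprintf(stderr, "context is exhausted, release the slot\n");
    }
}

int main() {
    slot_state slot;
    slot.cache_tokens = slot.n_ctx;   // simulate an exhausted context
    handle_exhausted_context(slot);
    std::printf("processing=%d has_next_token=%d\n",
                (int) slot.processing, (int) slot.has_next_token);
    return 0;
}
```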