
Commit 9e32fda

fix(llama.cpp): improve context shift handling (#4820)

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

1 parent 83202ca

1 file changed (+13, -5 lines)


backend/cpp/llama/grpc-server.cpp (+13, -5)
@@ -1155,6 +1155,14 @@ struct llama_server_context
             slot.has_next_token = false;
         }
 
+        if (slot.n_past >= slot.n_ctx) {
+            slot.truncated      = true;
+            slot.stopped_limit  = true;
+            slot.has_next_token = false;
+
+            LOG_VERBOSE("stopped due to running out of context capacity", {});
+        }
+
         if (result.tok == llama_vocab_eos(vocab) || llama_vocab_is_eog(vocab, result.tok))
         {
             slot.stopped_eos = true;
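
For reference, the new guard in process_token() stops generation the moment a slot's accumulated tokens reach its context window. Below is a minimal, self-contained sketch of the same idea; the standalone Slot struct and the check_context_capacity() helper are hypothetical stand-ins for the real server state, and a plain fprintf replaces the server's LOG_VERBOSE macro:

#include <cstdio>

// Hypothetical, simplified stand-in for the server's per-slot state.
struct Slot {
    int  n_past         = 0;     // tokens already accumulated for this slot
    int  n_ctx          = 0;     // context window available to this slot
    bool truncated      = false;
    bool stopped_limit  = false;
    bool has_next_token = true;
};

// Mirrors the added guard: once the context is full, mark the slot as
// stopped-by-limit so the caller emits a final response instead of
// scheduling more tokens.
void check_context_capacity(Slot &slot) {
    if (slot.n_past >= slot.n_ctx) {
        slot.truncated      = true;
        slot.stopped_limit  = true;
        slot.has_next_token = false;
        std::fprintf(stderr, "stopped due to running out of context capacity\n");
    }
}

int main() {
    Slot slot;
    slot.n_ctx  = 4096;
    slot.n_past = 4096;                   // context exhausted
    check_context_capacity(slot);
    return slot.has_next_token ? 1 : 0;   // returns 0: generation stopped
}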
@@ -1627,17 +1635,17 @@ struct llama_server_context
         {
             if (slot.is_processing() && system_tokens.size() + slot.cache_tokens.size() >= (size_t) slot.n_ctx)
             {
+                // this check is redundant (intentionally so):
+                // we should never get here, because generation should have already stopped in process_token()
+
                 // START LOCALAI changes
                 // Temporarily disable context-shifting as it can lead to infinite loops (issue: https://github.com/ggerganov/llama.cpp/issues/3969)
                 // See: https://github.com/mudler/LocalAI/issues/1333
                 // Context is exhausted, release the slot
                 slot.release();
                 send_final_response(slot);
-                slot.cache_tokens.clear();
-                slot.n_past = 0;
-                slot.truncated = false;
-                slot.has_next_token = true;
-                LOG("Context exhausted. Slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size());
+                slot.has_next_token = false;
+                LOG_ERROR("context is exhausted, release the slot", {});
 
                 continue;
                 // END LOCALAI changes
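
The second hunk changes what happens if that (now redundant) capacity check in the scheduling loop still fires: instead of resetting the slot (clearing cache_tokens, zeroing n_past, and re-arming has_next_token, which could let an exhausted slot be scheduled again indefinitely), the slot is released and stopped for good. A small sketch of the new path, reusing the hypothetical Slot struct from the sketch above; release() and send_final_response() here are stand-ins for the real server methods, not the actual API:

// Stand-ins for the real server's slot bookkeeping.
void release(Slot &slot)             { /* mark the slot available for reuse */ (void) slot; }
void send_final_response(Slot &slot) { /* flush whatever was generated so far */ (void) slot; }

void on_context_exhausted(Slot &slot) {
    release(slot);
    send_final_response(slot);
    // Crucially, the slot is NOT re-armed: has_next_token stays false, so
    // the loop cannot keep rescheduling an exhausted slot (the old reset
    // path set it back to true, risking an infinite loop).
    slot.has_next_token = false;
    std::fprintf(stderr, "context is exhausted, release the slot\n");
}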
