From d1052b5aabe1a3fbba5ce2f8e9e1c0c5d0082e18 Mon Sep 17 00:00:00 2001 From: Nick Hainke Date: Thu, 18 Jun 2026 14:02:02 +0200 Subject: [PATCH 1/2] Strip Gemma 4 thinking tokens from translation output Gemma 4 emits thinking content in two forms: - <|channel>thought\n...<channel|>answer (full block with closing tag) - <|channel>thought answer (no closing tag, space-separated) Handle both cases so thinking tokens never leak into the translation result. --- ltengine/src/llm.rs | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/ltengine/src/llm.rs b/ltengine/src/llm.rs index 056b9b5..559d8a7 100644 --- a/ltengine/src/llm.rs +++ b/ltengine/src/llm.rs @@ -265,6 +265,17 @@ impl LLMContext<'_>{ self.ctx.decode(&mut batch).with_context(|| "Failed to eval")?; } + // Gemma 4 thinking mode emits thinking content before the actual response in two forms: + // 1. <|channel>thought\n...<channel|>answer (full block with closing tag) + // 2. <|channel>thought answer (no closing tag, space-separated) + let output = if let Some(pos) = output.find("") { + output[pos + "".len()..].to_owned() + } else if let Some(rest) = output.strip_prefix("<|channel>thought") { + rest.trim_start_matches(['\n', ' ']).to_owned() + } else { + output + }; + // Gemma may emit as literal text when it cannot translate // (e.g. unsupported language/format combination) instead of the special // EOG token caught above. Strip it and treat empty output as an error. @@ -273,6 +284,7 @@ impl LLMContext<'_>{ if output.is_empty() { return Err(anyhow::anyhow!("Model produced empty output")); } + Ok(output) } } From 9e897d7d1300fd8baed59776743354088f742c43 Mon Sep 17 00:00:00 2001 From: Nick Hainke Date: Thu, 18 Jun 2026 20:18:49 +0200 Subject: [PATCH 2/2] Log chat template fallback failures Emit a warning when apply_chat_template fails and ltengine falls back to the hardcoded Gemma prompt format. 
--- ltengine/src/llm.rs | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/ltengine/src/llm.rs b/ltengine/src/llm.rs index 559d8a7..fc98fc1 100644 --- a/ltengine/src/llm.rs +++ b/ltengine/src/llm.rs @@ -175,13 +175,17 @@ impl LLM { // Use the model's embedded chat template when llama.cpp can detect it. // Falls back to hardcoded Gemma format when detection fails (e.g. Gemma 4 // until llama-cpp-sys picks up the upstream Gemma 4 template detection fix). - let llm_input = self.model + let llm_input = match self.model .chat_template(None) .ok() .and_then(|tmpl| self.model.apply_chat_template(&tmpl, &messages, true).ok()) - .unwrap_or_else(|| format!( - "user\n{system}\n\n{user}\nmodel\n" - )); + { + Some(s) => s, + None => { + eprintln!("ltengine: apply_chat_template failed: using hardcoded Gemma format"); + format!("user\n{system}\n\n{user}\nmodel\n") + } + }; // BOS is not added by apply_chat_template — str_to_token handles it. let tokens_list = self.model