From d1052b5aabe1a3fbba5ce2f8e9e1c0c5d0082e18 Mon Sep 17 00:00:00 2001
From: Nick Hainke <vincent@systemli.org>
Date: Thu, 18 Jun 2026 14:02:02 +0200
Subject: [PATCH 1/2] Strip Gemma 4 thinking tokens from translation output

Gemma 4 emits thinking content in two forms:
- <|channel>thought\n...<channel|>answer (full block with closing tag)
- <|channel>thought answer (no closing tag, space-separated)

Handle both cases so thinking tokens never leak into the translation result.
---
 ltengine/src/llm.rs | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/ltengine/src/llm.rs b/ltengine/src/llm.rs
index 056b9b5..559d8a7 100644
--- a/ltengine/src/llm.rs
+++ b/ltengine/src/llm.rs
@@ -265,6 +265,17 @@ impl LLMContext<'_>{
             self.ctx.decode(&mut batch).with_context(|| "Failed to eval")?;
         }
 
+        // Gemma 4 thinking mode emits thinking content before the actual response in two forms:
+        // 1. <|channel>thought\n...<channel|>answer  (full block with closing tag)
+        // 2. <|channel>thought answer                (no closing tag, space-separated)
+        let output = if let Some(pos) = output.find("<channel|>") {
+            output[pos + "<channel|>".len()..].to_owned()
+        } else if let Some(rest) = output.strip_prefix("<|channel>thought") {
+            rest.trim_start_matches(['\n', ' ']).to_owned()
+        } else {
+            output
+        };
+
         // Gemma may emit <end_of_turn> as literal text when it cannot translate
         // (e.g. unsupported language/format combination) instead of the special
         // EOG token caught above. Strip it and treat empty output as an error.
@@ -273,6 +284,7 @@ impl LLMContext<'_>{
         if output.is_empty() {
             return Err(anyhow::anyhow!("Model produced empty output"));
         }
+
         Ok(output)
     }
 }

From 9e897d7d1300fd8baed59776743354088f742c43 Mon Sep 17 00:00:00 2001
From: Nick Hainke <vincent@systemli.org>
Date: Thu, 18 Jun 2026 20:18:49 +0200
Subject: [PATCH 2/2] Log fallback to hardcoded prompt on chat template failure

Emit a warning on stderr when the model's chat template cannot be
retrieved or apply_chat_template fails, and ltengine therefore falls
back to the hardcoded Gemma prompt format.
---
 ltengine/src/llm.rs | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/ltengine/src/llm.rs b/ltengine/src/llm.rs
index 559d8a7..fc98fc1 100644
--- a/ltengine/src/llm.rs
+++ b/ltengine/src/llm.rs
@@ -175,13 +175,17 @@ impl LLM {
         // Use the model's embedded chat template when llama.cpp can detect it.
         // Falls back to hardcoded Gemma format when detection fails (e.g. Gemma 4
         // until llama-cpp-sys picks up the upstream Gemma 4 template detection fix).
-        let llm_input = self.model
+        let llm_input = match self.model
             .chat_template(None)
             .ok()
             .and_then(|tmpl| self.model.apply_chat_template(&tmpl, &messages, true).ok())
-            .unwrap_or_else(|| format!(
-                "<start_of_turn>user\n{system}\n\n{user}<end_of_turn>\n<start_of_turn>model\n"
-            ));
+        {
+            Some(s) => s,
+            None => {
+                eprintln!("ltengine: apply_chat_template failed: using hardcoded Gemma format");
+                format!("<start_of_turn>user\n{system}\n\n{user}<end_of_turn>\n<start_of_turn>model\n")
+            }
+        };
 
         // BOS is not added by apply_chat_template — str_to_token handles it.
         let tokens_list = self.model