Add return_attention support to Generator

QuentinFuxa · QuentinFuxa · commit 948f8f2677e3 · 2026-02-07T12:55:00.000+01:00
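The decoding engine already computes attention weights when requested,
but until now this was only wired through the Translator API. This commit
exposes the same capability for decoder-only models (Generator) by
propagating the return_attention flag from GenerationOptions to
DecodingOptions and moving the attention data into GenerationResult.
When trailing end tokens are stripped from a hypothesis, the matching
trailing attention entries are dropped as well. The new option and the
new result field are also exposed through the Python bindings.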
diff --git a/include/ctranslate2/generation.h b/include/ctranslate2/generation.h
@@ -53,6 +53,8 @@ namespace ctranslate2 {
 
     // Include scores in the result.
     bool return_scores = false;
+    // Store attention vectors in the GenerationResult class.
+    bool return_attention = false;
     // Include log probs of each token in the result
     bool return_logits_vocab = false;
 
@@ -81,6 +83,7 @@ namespace ctranslate2 {
     std::vector<std::vector<std::string>> sequences;
     std::vector<std::vector<size_t>> sequences_ids;
     std::vector<float> scores;
+    std::vector<std::vector<std::vector<float>>> attention;
     std::vector<std::vector<StorageView>> logits;
 
     size_t num_sequences() const {
diff --git a/python/cpp/generation_result.cc b/python/cpp/generation_result.cc
@@ -49,13 +49,16 @@ namespace ctranslate2 {
                       "Generated sequences of token IDs.")
         .def_readonly("scores", &GenerationResult::scores,
                       "Score of each sequence (empty if :obj:`return_scores` was disabled).")
+        .def_readonly("attention", &GenerationResult::attention,
+                      "Attention matrix of each sequence (empty if :obj:`return_attention` was disabled).")
         .def_readonly("logits", &GenerationResult::logits,
                       "Logits of each sequence (empty if :obj:`return_logits_vocab` was disabled).")
 
         .def("__repr__", [](const GenerationResult& result) {
           return "GenerationResult(sequences=" + std::string(py::repr(py::cast(result.sequences)))
             + ", sequences_ids=" + std::string(py::repr(py::cast(result.sequences_ids)))
             + ", scores=" + std::string(py::repr(py::cast(result.scores)))
+            + ", attention=" + std::string(py::repr(py::cast(result.attention)))
             + ", logits=" + std::string(py::repr(py::cast(result.logits)))
             + ")";
         })
diff --git a/python/cpp/generator.cc b/python/cpp/generator.cc
@@ -33,6 +33,7 @@ namespace ctranslate2 {
                      bool cache_static_prompt,
                      bool include_prompt_in_result,
                      bool return_scores,
+                     bool return_attention,
                      bool return_logits_vocab,
                      bool return_alternatives,
                      float min_alternative_expansion_prob,
@@ -59,6 +60,7 @@ namespace ctranslate2 {
         options.num_hypotheses = num_hypotheses;
         options.return_end_token = return_end_token;
         options.return_scores = return_scores;
+        options.return_attention = return_attention;
         options.return_logits_vocab = return_logits_vocab;
         options.return_alternatives = return_alternatives;
         options.cache_static_prompt = cache_static_prompt;
@@ -205,6 +207,7 @@ namespace ctranslate2 {
              py::arg("cache_static_prompt")=true,
              py::arg("include_prompt_in_result")=true,
              py::arg("return_scores")=false,
+             py::arg("return_attention")=false,
              py::arg("return_logits_vocab")=false,
              py::arg("return_alternatives")=false,
              py::arg("min_alternative_expansion_prob")=0,
@@ -263,6 +266,7 @@ namespace ctranslate2 {
                      reuse it for future generations using the same static prompt.
                    include_prompt_in_result: Include the :obj:`start_tokens` in the result.
                    return_scores: Include the scores in the output.
+                   return_attention: Include the attention matrices in the output.
                    return_logits_vocab: Include log probs for each token in the output
                    return_alternatives: Return alternatives at the first unconstrained decoding position.
                    min_alternative_expansion_prob: Minimum initial probability to expand an alternative.
diff --git a/src/models/language_model.cc b/src/models/language_model.cc
@@ -165,6 +165,7 @@ namespace ctranslate2 {
       decoding_options.sampling_temperature = options.sampling_temperature;
       decoding_options.num_hypotheses = options.num_hypotheses;
       decoding_options.return_scores = options.return_scores;
+      decoding_options.return_attention = options.return_attention;
       decoding_options.return_logits_vocab = options.return_logits_vocab;
       decoding_options.return_alternatives = options.return_alternatives;
       decoding_options.min_alternative_expansion_prob = options.min_alternative_expansion_prob;
@@ -251,9 +252,13 @@ namespace ctranslate2 {
 
         // Remove EOS token.
         if (!options.return_end_token) {
-          for (auto& sequence : result.hypotheses) {
-            while (!sequence.empty() && is_eos(sequence.back(), end_ids))
-              sequence.pop_back();
+          for (size_t h = 0; h < result.hypotheses.size(); ++h) {
+            while (!result.hypotheses[h].empty()
+                   && is_eos(result.hypotheses[h].back(), end_ids)) {
+              result.hypotheses[h].pop_back();
+              if (!result.attention.empty())
+                result.attention[h].pop_back();
+            }
           }
         }
 
@@ -269,6 +274,7 @@ namespace ctranslate2 {
         final_result.sequences = vocabulary.to_tokens(result.hypotheses);
         final_result.sequences_ids = std::move(result.hypotheses);
         final_result.scores = std::move(result.scores);
+        final_result.attention = std::move(result.attention);
         final_result.logits = std::move(result.logits_vocab);
         final_results.emplace_back(std::move(final_result));
       }