server : add "tokens" output #10853

Merged (5 commits) on Dec 18, 2024
Changes from 1 commit
server : return tokens ids only if requested
ggml-ci
ggerganov committed Dec 17, 2024
commit 8bcfc5551efcc00b38b253810d9913d33c690083
6 changes: 4 additions & 2 deletions examples/server/README.md
@@ -438,6 +438,8 @@ These words will not be included in the completion, so make sure to add them to

`cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation), enabling this option can cause nondeterministic results. Default: `true`

`return_tokens`: Return the raw generated token ids in the `tokens` field. Otherwise `tokens` remains empty. Default: `false`

`samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature"]` - these are all the available values.

`timings_per_token`: Include prompt processing and text generation speed information in each response. Default: `false`
@@ -451,7 +453,7 @@ These words will not be included in the completion, so make sure to add them to
```json
{
"content": "<the token generated by the model>",
"tokens": [ generated token ids ],
"tokens": [ generated token ids if requested ],
"probs": [
{
"prob": float,
@@ -469,7 +471,7 @@ These words will not be included in the completion, so make sure to add them to
Notice that each `probs` is an array of length `n_probs`.

- `content`: Completion result as a string (excluding `stopping_word` if any). In case of streaming mode, will contain the next token as a string.
- `tokens`: Same as `content` but represented as raw token ids.
- `tokens`: Same as `content` but represented as raw token ids. Only populated if `"return_tokens": true` or `"stream": true` in the request.
- `stop`: Boolean for use with `stream` to check whether the generation has stopped (Note: This is not related to stopping words array `stop` from input options)
- `generation_settings`: The provided options above excluding `prompt` but including `n_ctx`, `model`. These options may differ from the original ones in some way (e.g. bad values filtered out, strings converted to tokens, etc.).
- `model`: The path to the model loaded with `-m`
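
For context, a minimal client-side sketch of how `return_tokens` is meant to be used, assuming a llama-server instance listening on `http://localhost:8080` (host, port, and prompt are placeholders):

```python
import requests

# Sketch only: request a completion with "return_tokens" enabled and read the
# raw token ids back from the "tokens" field. Host, port, and prompt are
# placeholders for wherever llama-server is running.
res = requests.post("http://localhost:8080/completion", json={
    "prompt": "I believe the meaning of life is",
    "n_predict": 8,
    "return_tokens": True,  # without this, "tokens" is returned as []
})
res.raise_for_status()
body = res.json()
print(body["content"])  # completion text
print(body["tokens"])   # raw token ids for the same completion
```

With `return_tokens` omitted or set to `false`, the response's `tokens` field comes back as an empty array.
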
16 changes: 10 additions & 6 deletions examples/server/server.cpp
@@ -79,8 +79,9 @@ enum error_type {
};

struct slot_params {
bool stream = true;
bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt
bool stream = true;
bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt
bool return_tokens = false;

int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
@@ -199,6 +200,7 @@ struct server_task {

params.stream = json_value(data, "stream", false);
params.cache_prompt = json_value(data, "cache_prompt", true);
params.return_tokens = json_value(data, "return_tokens", false);
params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", defaults.n_predict));
params.n_indent = json_value(data, "n_indent", defaults.n_indent);
params.n_keep = json_value(data, "n_keep", defaults.n_keep);
@@ -543,7 +545,7 @@ struct server_task_result_cmpl_final : server_task_result {
json choices = json::array({json{
{"finish_reason", finish_reason},
{"index", 0},
{"message", json{
{"message", json {
{"content", content},
{"tokens", tokens},
{"role", "assistant"}
@@ -998,7 +1000,6 @@ struct server_slot {
n_prompt_tokens = 0;
last_nl_pos = 0;
generated_text = "";
generated_tokens = {};
has_new_line = false;
truncated = false;
stop = STOP_TYPE_NONE;
@@ -1008,6 +1009,7 @@
n_sent_token_probs = 0;
task_type = SERVER_TASK_TYPE_COMPLETION;

generated_tokens.clear();
generated_token_probs.clear();
}

@@ -1748,9 +1750,10 @@ struct server_context {
const std::string token_str = common_token_to_piece(ctx, result.tok, params_base.special);
slot.sampled = result.tok;

// search stop word and delete it
slot.generated_text += token_str;
slot.generated_tokens.push_back(result.tok);
if (slot.params.return_tokens) {
slot.generated_tokens.push_back(result.tok);
}
slot.has_next_token = true;

// check if there is incomplete UTF-8 character at the end
@@ -1775,6 +1778,7 @@
break;
}

// search stop word and delete it
if (!incomplete) {
size_t pos = std::min(slot.n_sent_text, slot.generated_text.size());

14 changes: 10 additions & 4 deletions examples/server/tests/unit/test_completion.py
@@ -10,23 +10,28 @@ def create_server():
global server
server = ServerPreset.tinyllama2()

@pytest.mark.parametrize("prompt,n_predict,re_content,n_prompt,n_predicted,truncated", [
("I believe the meaning of life is", 8, "(going|bed)+", 18, 8, False),
("Write a joke about AI from a very long prompt which will not be truncated", 256, "(princesses|everyone|kids|Anna|forest)+", 46, 64, False),
@pytest.mark.parametrize("prompt,n_predict,re_content,n_prompt,n_predicted,truncated,return_tokens", [
("I believe the meaning of life is", 8, "(going|bed)+", 18, 8, False, False),
("Write a joke about AI from a very long prompt which will not be truncated", 256, "(princesses|everyone|kids|Anna|forest)+", 46, 64, False, True),
])
def test_completion(prompt: str, n_predict: int, re_content: str, n_prompt: int, n_predicted: int, truncated: bool):
def test_completion(prompt: str, n_predict: int, re_content: str, n_prompt: int, n_predicted: int, truncated: bool, return_tokens: bool):
global server
server.start()
res = server.make_request("POST", "/completion", data={
"n_predict": n_predict,
"prompt": prompt,
"return_tokens": return_tokens,
})
assert res.status_code == 200
assert res.body["timings"]["prompt_n"] == n_prompt
assert res.body["timings"]["predicted_n"] == n_predicted
assert res.body["truncated"] == truncated
assert type(res.body["has_new_line"]) == bool
assert match_regex(re_content, res.body["content"])
if return_tokens:
assert res.body["tokens"] != []
else:
assert res.body["tokens"] == []


@pytest.mark.parametrize("prompt,n_predict,re_content,n_prompt,n_predicted,truncated", [
@@ -56,6 +61,7 @@ def test_completion_stream(prompt: str, n_predict: int, re_content: str, n_promp
assert data["generation_settings"]["seed"] == server.seed
assert match_regex(re_content, content)
else:
assert data["tokens"] != []
content += data["content"]


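For the streaming case exercised above, a sketch assuming the server's SSE-style `data:` framing of `/completion` chunks (host, port, and prompt are placeholders); per the README change, each streamed chunk carries its token ids even when `return_tokens` is not set:

```python
import json
import requests

# Sketch only: collect token ids from a streamed /completion response.
# Assumes SSE-style "data: {...}" lines; skips a possible "[DONE]" sentinel.
with requests.post(
    "http://localhost:8080/completion",
    json={"prompt": "Write a joke about AI", "n_predict": 16, "stream": True},
    stream=True,
) as res:
    res.raise_for_status()
    tokens = []
    for line in res.iter_lines():
        if not line.startswith(b"data: ") or line == b"data: [DONE]":
            continue
        chunk = json.loads(line[len(b"data: "):])
        tokens.extend(chunk.get("tokens", []))  # populated per chunk when streaming
    print(tokens)
```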